Bug Summary

File: build/source/llvm/lib/Target/X86/X86ISelLowering.cpp
Warning: line 45517, column 39
The result of the left shift is undefined due to shifting by '4294967291', which is greater or equal to the width of type 'int'
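
In C++, a left shift whose (promoted) shift amount is greater than or equal to the bit width of the left operand's type has undefined behavior. 4294967291 is the unsigned wrap-around value of -5, which usually means a shift amount went negative or underflowed before reaching the flagged expression. The sketch below, written under those assumptions, only illustrates this class of diagnostic; it is not the code at line 45517, and the function and variable names are invented.

static int shiftExample(unsigned Amt) {      // e.g. Amt == 3
  return 1 << (Amt - 8);                     // Amt - 8 wraps to 4294967291 (unsigned),
                                             // and shifting an 'int' by >= 32 bits is UB
}
static int shiftExampleGuarded(unsigned Amt) {
  // A range check keeps the shift amount inside [0, 31] and avoids the undefined shift.
  return (Amt >= 8 && Amt - 8 < 32) ? (1 << (Amt - 8)) : 0;
}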

Annotated Source Code


clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name X86ISelLowering.cpp -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/source/build-llvm/tools/clang/stage2-bins -resource-dir /usr/lib/llvm-17/lib/clang/17 -D _DEBUG -D _GLIBCXX_ASSERTIONS -D _GNU_SOURCE -D _LIBCPP_ENABLE_ASSERTIONS -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I lib/Target/X86 -I /build/source/llvm/lib/Target/X86 -I include -I /build/source/llvm/include -D _FORTIFY_SOURCE=2 -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-17/lib/clang/17/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -fmacro-prefix-map=/build/source/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fmacro-prefix-map=/build/source/= -fcoverage-prefix-map=/build/source/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fcoverage-prefix-map=/build/source/= -source-date-epoch 1683717183 -O2 -Wno-unused-command-line-argument -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -Wno-misleading-indentation -std=c++17 -fdeprecated-macro -fdebug-compilation-dir=/build/source/build-llvm/tools/clang/stage2-bins -fdebug-prefix-map=/build/source/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fdebug-prefix-map=/build/source/= -ferror-limit 19 -fvisibility=hidden -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fcolor-diagnostics -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2023-05-10-133810-16478-1 -x c++ /build/source/llvm/lib/Target/X86/X86ISelLowering.cpp
1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that X86 uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "X86ISelLowering.h"
15#include "MCTargetDesc/X86ShuffleDecode.h"
16#include "X86.h"
17#include "X86CallingConv.h"
18#include "X86FrameLowering.h"
19#include "X86InstrBuilder.h"
20#include "X86IntrinsicsInfo.h"
21#include "X86MachineFunctionInfo.h"
22#include "X86TargetMachine.h"
23#include "X86TargetObjectFile.h"
24#include "llvm/ADT/SmallBitVector.h"
25#include "llvm/ADT/SmallSet.h"
26#include "llvm/ADT/Statistic.h"
27#include "llvm/ADT/StringExtras.h"
28#include "llvm/ADT/StringSwitch.h"
29#include "llvm/Analysis/BlockFrequencyInfo.h"
30#include "llvm/Analysis/ObjCARCUtil.h"
31#include "llvm/Analysis/ProfileSummaryInfo.h"
32#include "llvm/Analysis/VectorUtils.h"
33#include "llvm/CodeGen/IntrinsicLowering.h"
34#include "llvm/CodeGen/MachineFrameInfo.h"
35#include "llvm/CodeGen/MachineFunction.h"
36#include "llvm/CodeGen/MachineInstrBuilder.h"
37#include "llvm/CodeGen/MachineJumpTableInfo.h"
38#include "llvm/CodeGen/MachineLoopInfo.h"
39#include "llvm/CodeGen/MachineModuleInfo.h"
40#include "llvm/CodeGen/MachineRegisterInfo.h"
41#include "llvm/CodeGen/TargetLowering.h"
42#include "llvm/CodeGen/WinEHFuncInfo.h"
43#include "llvm/IR/CallingConv.h"
44#include "llvm/IR/Constants.h"
45#include "llvm/IR/DerivedTypes.h"
46#include "llvm/IR/DiagnosticInfo.h"
47#include "llvm/IR/EHPersonalities.h"
48#include "llvm/IR/Function.h"
49#include "llvm/IR/GlobalAlias.h"
50#include "llvm/IR/GlobalVariable.h"
51#include "llvm/IR/IRBuilder.h"
52#include "llvm/IR/Instructions.h"
53#include "llvm/IR/Intrinsics.h"
54#include "llvm/IR/PatternMatch.h"
55#include "llvm/MC/MCAsmInfo.h"
56#include "llvm/MC/MCContext.h"
57#include "llvm/MC/MCExpr.h"
58#include "llvm/MC/MCSymbol.h"
59#include "llvm/Support/CommandLine.h"
60#include "llvm/Support/Debug.h"
61#include "llvm/Support/ErrorHandling.h"
62#include "llvm/Support/KnownBits.h"
63#include "llvm/Support/MathExtras.h"
64#include "llvm/Target/TargetOptions.h"
65#include <algorithm>
66#include <bitset>
67#include <cctype>
68#include <numeric>
69using namespace llvm;
70
71#define DEBUG_TYPE "x86-isel"
72
73STATISTIC(NumTailCalls, "Number of tail calls");
74
75static cl::opt<int> ExperimentalPrefInnermostLoopAlignment(
76 "x86-experimental-pref-innermost-loop-alignment", cl::init(4),
77 cl::desc(
78 "Sets the preferable loop alignment for experiments (as log2 bytes) "
79 "for innermost loops only. If specified, this option overrides "
80 "alignment set by x86-experimental-pref-loop-alignment."),
81 cl::Hidden);
82
83static cl::opt<bool> MulConstantOptimization(
84 "mul-constant-optimization", cl::init(true),
85 cl::desc("Replace 'mul x, Const' with more effective instructions like "
86 "SHIFT, LEA, etc."),
87 cl::Hidden);
88
89static cl::opt<bool> ExperimentalUnorderedISEL(
90 "x86-experimental-unordered-atomic-isel", cl::init(false),
91 cl::desc("Use LoadSDNode and StoreSDNode instead of "
92 "AtomicSDNode for unordered atomic loads and "
93 "stores respectively."),
94 cl::Hidden);
95
96/// Call this when the user attempts to do something unsupported, like
97/// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
98/// report_fatal_error, so calling code should attempt to recover without
99/// crashing.
100static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
101 const char *Msg) {
102 MachineFunction &MF = DAG.getMachineFunction();
103 DAG.getContext()->diagnose(
104 DiagnosticInfoUnsupported(MF.getFunction(), Msg, dl.getDebugLoc()));
105}
106
107/// Returns true if a CC can dynamically exclude a register from the list of
108/// callee-saved-registers (TargetRegisterInfo::getCalleeSavedRegs()) based on
109/// the return registers.
110static bool shouldDisableRetRegFromCSR(CallingConv::ID CC) {
111 switch (CC) {
112 default:
113 return false;
114 case CallingConv::X86_RegCall:
115 case CallingConv::PreserveMost:
116 case CallingConv::PreserveAll:
117 return true;
118 }
119}
120
121/// Returns true if a CC can dynamically exclude a register from the list of
122/// callee-saved-registers (TargetRegisterInfo::getCalleeSavedRegs()) based on
123/// the parameters.
124static bool shouldDisableArgRegFromCSR(CallingConv::ID CC) {
125 return CC == CallingConv::X86_RegCall;
126}
127
128X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
129 const X86Subtarget &STI)
130 : TargetLowering(TM), Subtarget(STI) {
131 bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
132 MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
133
134 // Set up the TargetLowering object.
135
136 // X86 is weird. It always uses i8 for shift amounts and setcc results.
137 setBooleanContents(ZeroOrOneBooleanContent);
138 // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
139 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
140
141 // For 64-bit, since we have so many registers, use the ILP scheduler.
142 // For 32-bit, use the register pressure specific scheduling.
143 // For Atom, always use ILP scheduling.
144 if (Subtarget.isAtom())
145 setSchedulingPreference(Sched::ILP);
146 else if (Subtarget.is64Bit())
147 setSchedulingPreference(Sched::ILP);
148 else
149 setSchedulingPreference(Sched::RegPressure);
150 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
151 setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
152
153 // Bypass expensive divides and use cheaper ones.
154 if (TM.getOptLevel() >= CodeGenOpt::Default) {
155 if (Subtarget.hasSlowDivide32())
156 addBypassSlowDiv(32, 8);
157 if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
158 addBypassSlowDiv(64, 32);
159 }
160
161 // Setup Windows compiler runtime calls.
162 if (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()) {
163 static const struct {
164 const RTLIB::Libcall Op;
165 const char * const Name;
166 const CallingConv::ID CC;
167 } LibraryCalls[] = {
168 { RTLIB::SDIV_I64, "_alldiv", CallingConv::X86_StdCall },
169 { RTLIB::UDIV_I64, "_aulldiv", CallingConv::X86_StdCall },
170 { RTLIB::SREM_I64, "_allrem", CallingConv::X86_StdCall },
171 { RTLIB::UREM_I64, "_aullrem", CallingConv::X86_StdCall },
172 { RTLIB::MUL_I64, "_allmul", CallingConv::X86_StdCall },
173 };
174
175 for (const auto &LC : LibraryCalls) {
176 setLibcallName(LC.Op, LC.Name);
177 setLibcallCallingConv(LC.Op, LC.CC);
178 }
179 }
180
181 if (Subtarget.getTargetTriple().isOSMSVCRT()) {
182 // MSVCRT doesn't have powi; fall back to pow
183 setLibcallName(RTLIB::POWI_F32, nullptr);
184 setLibcallName(RTLIB::POWI_F64, nullptr);
185 }
186
187 // If we don't have cmpxchg8b (meaning this is a 386/486), limit atomic size to
188 // 32 bits so the AtomicExpandPass will expand it so we don't need cmpxchg8b.
189 // FIXME: Should we be limiting the atomic size on other configs? Default is
190 // 1024.
191 if (!Subtarget.canUseCMPXCHG8B())
192 setMaxAtomicSizeInBitsSupported(32);
193
194 setMaxDivRemBitWidthSupported(Subtarget.is64Bit() ? 128 : 64);
195
196 setMaxLargeFPConvertBitWidthSupported(128);
197
198 // Set up the register classes.
199 addRegisterClass(MVT::i8, &X86::GR8RegClass);
200 addRegisterClass(MVT::i16, &X86::GR16RegClass);
201 addRegisterClass(MVT::i32, &X86::GR32RegClass);
202 if (Subtarget.is64Bit())
203 addRegisterClass(MVT::i64, &X86::GR64RegClass);
204
205 for (MVT VT : MVT::integer_valuetypes())
206 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
207
208 // We don't accept any truncstore of integer registers.
209 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
210 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
211 setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
212 setTruncStoreAction(MVT::i32, MVT::i16, Expand);
213 setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
214 setTruncStoreAction(MVT::i16, MVT::i8, Expand);
215
216 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
217
218 // SETOEQ and SETUNE require checking two conditions.
219 for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) {
220 setCondCodeAction(ISD::SETOEQ, VT, Expand);
221 setCondCodeAction(ISD::SETUNE, VT, Expand);
222 }
223
224 // Integer absolute.
225 if (Subtarget.canUseCMOV()) {
226 setOperationAction(ISD::ABS , MVT::i16 , Custom);
227 setOperationAction(ISD::ABS , MVT::i32 , Custom);
228 if (Subtarget.is64Bit())
229 setOperationAction(ISD::ABS , MVT::i64 , Custom);
230 }
231
232 // Absolute difference.
233 for (auto Op : {ISD::ABDS, ISD::ABDU}) {
234 setOperationAction(Op , MVT::i8 , Custom);
235 setOperationAction(Op , MVT::i16 , Custom);
236 setOperationAction(Op , MVT::i32 , Custom);
237 if (Subtarget.is64Bit())
238 setOperationAction(Op , MVT::i64 , Custom);
239 }
240
241 // Signed saturation subtraction.
242 setOperationAction(ISD::SSUBSAT , MVT::i8 , Custom);
243 setOperationAction(ISD::SSUBSAT , MVT::i16 , Custom);
244 setOperationAction(ISD::SSUBSAT , MVT::i32 , Custom);
245 if (Subtarget.is64Bit())
246 setOperationAction(ISD::SSUBSAT , MVT::i64 , Custom);
247
248 // Funnel shifts.
249 for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
250 // For slow shld targets we only lower for code size.
251 LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;
252
253 setOperationAction(ShiftOp , MVT::i8 , Custom);
254 setOperationAction(ShiftOp , MVT::i16 , Custom);
255 setOperationAction(ShiftOp , MVT::i32 , ShiftDoubleAction);
256 if (Subtarget.is64Bit())
257 setOperationAction(ShiftOp , MVT::i64 , ShiftDoubleAction);
258 }
259
260 if (!Subtarget.useSoftFloat()) {
261 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
262 // operation.
263 setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
264 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i8, Promote);
265 setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
266 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i16, Promote);
267 // We have an algorithm for SSE2, and we turn this into a 64-bit
268 // FILD or VCVTUSI2SS/SD for other targets.
269 setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
270 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);
271 // We have an algorithm for SSE2->double, and we turn this into a
272 // 64-bit FILD followed by conditional FADD for other targets.
273 setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
274 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);
275
276 // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
277 // this operation.
278 setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);
279 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i8, Promote);
280 // SSE has no i16 to fp conversion, only i32. We promote in the handler
281 // to allow f80 to use i16 and f64 to use i16 with sse1 only
282 setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
283 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i16, Custom);
284 // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not
285 setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
286 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);
287 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
288 // are Legal, f80 is custom lowered.
289 setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
290 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);
291
292 // Promote i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
293 // this operation.
294 setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);
295 // FIXME: This doesn't generate invalid exception when it should. PR44019.
296 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i8, Promote);
297 setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
298 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i16, Custom);
299 setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
300 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
301 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
302 // are Legal, f80 is custom lowered.
303 setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
304 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);
305
306 // Handle FP_TO_UINT by promoting the destination to a larger signed
307 // conversion.
308 setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
309 // FIXME: This doesn't generate invalid exception when it should. PR44019.
310 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i8, Promote);
311 setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
312 // FIXME: This doesn't generate invalid exception when it should. PR44019.
313 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i16, Promote);
314 setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
315 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
316 setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
317 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom);
318
319 setOperationAction(ISD::LRINT, MVT::f32, Custom);
320 setOperationAction(ISD::LRINT, MVT::f64, Custom);
321 setOperationAction(ISD::LLRINT, MVT::f32, Custom);
322 setOperationAction(ISD::LLRINT, MVT::f64, Custom);
323
324 if (!Subtarget.is64Bit()) {
325 setOperationAction(ISD::LRINT, MVT::i64, Custom);
326 setOperationAction(ISD::LLRINT, MVT::i64, Custom);
327 }
328 }
329
330 if (Subtarget.hasSSE2()) {
331 // Custom lowering for saturating float to int conversions.
332 // We handle promotion to larger result types manually.
333 for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) {
334 setOperationAction(ISD::FP_TO_UINT_SAT, VT, Custom);
335 setOperationAction(ISD::FP_TO_SINT_SAT, VT, Custom);
336 }
337 if (Subtarget.is64Bit()) {
338 setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom);
339 setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom);
340 }
341 }
342
343 // Handle address space casts between mixed sized pointers.
344 setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
345 setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
346
347 // TODO: when we have SSE, these could be more efficient, by using movd/movq.
348 if (!Subtarget.hasSSE2()) {
349 setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
350 setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
351 if (Subtarget.is64Bit()) {
352 setOperationAction(ISD::BITCAST , MVT::f64 , Expand);
353 // Without SSE, i64->f64 goes through memory.
354 setOperationAction(ISD::BITCAST , MVT::i64 , Expand);
355 }
356 } else if (!Subtarget.is64Bit())
357 setOperationAction(ISD::BITCAST , MVT::i64 , Custom);
358
359 // Scalar integer divide and remainder are lowered to use operations that
360 // produce two results, to match the available instructions. This exposes
361 // the two-result form to trivial CSE, which is able to combine x/y and x%y
362 // into a single instruction.
363 //
364 // Scalar integer multiply-high is also lowered to use two-result
365 // operations, to match the available instructions. However, plain multiply
366 // (low) operations are left as Legal, as there are single-result
367 // instructions for this in x86. Using the two-result multiply instructions
368 // when both high and low results are needed must be arranged by dagcombine.
369 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
370 setOperationAction(ISD::MULHS, VT, Expand);
371 setOperationAction(ISD::MULHU, VT, Expand);
372 setOperationAction(ISD::SDIV, VT, Expand);
373 setOperationAction(ISD::UDIV, VT, Expand);
374 setOperationAction(ISD::SREM, VT, Expand);
375 setOperationAction(ISD::UREM, VT, Expand);
376 }
377
378 setOperationAction(ISD::BR_JT , MVT::Other, Expand);
379 setOperationAction(ISD::BRCOND , MVT::Other, Custom);
380 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
381 MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
382 setOperationAction(ISD::BR_CC, VT, Expand);
383 setOperationAction(ISD::SELECT_CC, VT, Expand);
384 }
385 if (Subtarget.is64Bit())
386 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
387 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal);
388 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
389 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
390
391 setOperationAction(ISD::FREM , MVT::f32 , Expand);
392 setOperationAction(ISD::FREM , MVT::f64 , Expand);
393 setOperationAction(ISD::FREM , MVT::f80 , Expand);
394 setOperationAction(ISD::FREM , MVT::f128 , Expand);
395
396 if (!Subtarget.useSoftFloat() && Subtarget.hasX87()) {
397 setOperationAction(ISD::GET_ROUNDING , MVT::i32 , Custom);
398 setOperationAction(ISD::SET_ROUNDING , MVT::Other, Custom);
399 }
400
401 // Promote the i8 variants and force them on up to i32 which has a shorter
402 // encoding.
403 setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
404 setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
405 // Promoted i16. tzcntw has a false dependency on Intel CPUs. For BSF, we emit
406 // a REP prefix to encode it as TZCNT for modern CPUs so it makes sense to
407 // promote that too.
408 setOperationPromotedToType(ISD::CTTZ , MVT::i16 , MVT::i32);
409 setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , MVT::i32);
410
411 if (!Subtarget.hasBMI()) {
412 setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
413 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Legal);
414 if (Subtarget.is64Bit()) {
415 setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
416 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
417 }
418 }
419
420 if (Subtarget.hasLZCNT()) {
421 // When promoting the i8 variants, force them to i32 for a shorter
422 // encoding.
423 setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
424 setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
425 } else {
426 for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
427 if (VT == MVT::i64 && !Subtarget.is64Bit())
428 continue;
429 setOperationAction(ISD::CTLZ , VT, Custom);
430 setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Custom);
431 }
432 }
433
434 for (auto Op : {ISD::FP16_TO_FP, ISD::STRICT_FP16_TO_FP, ISD::FP_TO_FP16,
435 ISD::STRICT_FP_TO_FP16}) {
436 // Special handling for half-precision floating point conversions.
437 // If we don't have F16C support, then lower half float conversions
438 // into library calls.
439 setOperationAction(
440 Op, MVT::f32,
441 (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand);
442 // There's never any support for operations beyond MVT::f32.
443 setOperationAction(Op, MVT::f64, Expand);
444 setOperationAction(Op, MVT::f80, Expand);
445 setOperationAction(Op, MVT::f128, Expand);
446 }
447
448 for (MVT VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
449 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
450 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
451 setTruncStoreAction(VT, MVT::f16, Expand);
452 setTruncStoreAction(VT, MVT::bf16, Expand);
453
454 setOperationAction(ISD::BF16_TO_FP, VT, Expand);
455 setOperationAction(ISD::FP_TO_BF16, VT, Custom);
456 }
457
458 setOperationAction(ISD::PARITY, MVT::i8, Custom);
459 setOperationAction(ISD::PARITY, MVT::i16, Custom);
460 setOperationAction(ISD::PARITY, MVT::i32, Custom);
461 if (Subtarget.is64Bit())
462 setOperationAction(ISD::PARITY, MVT::i64, Custom);
463 if (Subtarget.hasPOPCNT()) {
464 setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
465 // popcntw is longer to encode than popcntl and also has a false dependency
466 // on the dest that popcntl hasn't had since Cannon Lake.
467 setOperationPromotedToType(ISD::CTPOP, MVT::i16, MVT::i32);
468 } else {
469 setOperationAction(ISD::CTPOP , MVT::i8 , Expand);
470 setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
471 setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
472 if (Subtarget.is64Bit())
473 setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
474 else
475 setOperationAction(ISD::CTPOP , MVT::i64 , Custom);
476 }
477
478 setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
479
480 if (!Subtarget.hasMOVBE())
481 setOperationAction(ISD::BSWAP , MVT::i16 , Expand);
482
483 // X86 wants to expand cmov itself.
484 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
485 setOperationAction(ISD::SELECT, VT, Custom);
486 setOperationAction(ISD::SETCC, VT, Custom);
487 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
488 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
489 }
490 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
491 if (VT == MVT::i64 && !Subtarget.is64Bit())
492 continue;
493 setOperationAction(ISD::SELECT, VT, Custom);
494 setOperationAction(ISD::SETCC, VT, Custom);
495 }
496
497 // Custom action for SELECT MMX and expand action for SELECT_CC MMX
498 setOperationAction(ISD::SELECT, MVT::x86mmx, Custom);
499 setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);
500
501 setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
502 // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
503 // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
504 setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
505 setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
506 setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
507 if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
508 setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
509
510 // Darwin ABI issue.
511 for (auto VT : { MVT::i32, MVT::i64 }) {
512 if (VT == MVT::i64 && !Subtarget.is64Bit())
513 continue;
514 setOperationAction(ISD::ConstantPool , VT, Custom);
515 setOperationAction(ISD::JumpTable , VT, Custom);
516 setOperationAction(ISD::GlobalAddress , VT, Custom);
517 setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
518 setOperationAction(ISD::ExternalSymbol , VT, Custom);
519 setOperationAction(ISD::BlockAddress , VT, Custom);
520 }
521
522 // 64-bit shl, sra, srl (iff 32-bit x86)
523 for (auto VT : { MVT::i32, MVT::i64 }) {
524 if (VT == MVT::i64 && !Subtarget.is64Bit())
525 continue;
526 setOperationAction(ISD::SHL_PARTS, VT, Custom);
527 setOperationAction(ISD::SRA_PARTS, VT, Custom);
528 setOperationAction(ISD::SRL_PARTS, VT, Custom);
529 }
530
531 if (Subtarget.hasSSEPrefetch() || Subtarget.hasThreeDNow())
532 setOperationAction(ISD::PREFETCH , MVT::Other, Legal);
533
534 setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);
535
536 // Expand certain atomics
537 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
538 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
539 setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
540 setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
541 setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
542 setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
543 setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
544 setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
545 }
546
547 if (!Subtarget.is64Bit())
548 setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
549
550 if (Subtarget.canUseCMPXCHG16B())
551 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
552
553 // FIXME - use subtarget debug flags
554 if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
555 !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
556 TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
557 setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
558 }
559
560 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
561 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
562
563 setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
564 setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
565
566 setOperationAction(ISD::TRAP, MVT::Other, Legal);
567 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
568 if (Subtarget.isTargetPS())
569 setOperationAction(ISD::UBSANTRAP, MVT::Other, Expand);
570 else
571 setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal);
572
573 // VASTART needs to be custom lowered to use the VarArgsFrameIndex
574 setOperationAction(ISD::VASTART , MVT::Other, Custom);
575 setOperationAction(ISD::VAEND , MVT::Other, Expand);
576 bool Is64Bit = Subtarget.is64Bit();
577 setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
578 setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
579
580 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
581 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
582
583 setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
584
585 // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
586 setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
587 setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
588
589 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal);
590
591 auto setF16Action = [&] (MVT VT, LegalizeAction Action) {
592 setOperationAction(ISD::FABS, VT, Action);
593 setOperationAction(ISD::FNEG, VT, Action);
594 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
595 setOperationAction(ISD::FREM, VT, Action);
596 setOperationAction(ISD::FMA, VT, Action);
597 setOperationAction(ISD::FMINNUM, VT, Action);
598 setOperationAction(ISD::FMAXNUM, VT, Action);
599 setOperationAction(ISD::FMINIMUM, VT, Action);
600 setOperationAction(ISD::FMAXIMUM, VT, Action);
601 setOperationAction(ISD::FSIN, VT, Action);
602 setOperationAction(ISD::FCOS, VT, Action);
603 setOperationAction(ISD::FSINCOS, VT, Action);
604 setOperationAction(ISD::FSQRT, VT, Action);
605 setOperationAction(ISD::FPOW, VT, Action);
606 setOperationAction(ISD::FLOG, VT, Action);
607 setOperationAction(ISD::FLOG2, VT, Action);
608 setOperationAction(ISD::FLOG10, VT, Action);
609 setOperationAction(ISD::FEXP, VT, Action);
610 setOperationAction(ISD::FEXP2, VT, Action);
611 setOperationAction(ISD::FCEIL, VT, Action);
612 setOperationAction(ISD::FFLOOR, VT, Action);
613 setOperationAction(ISD::FNEARBYINT, VT, Action);
614 setOperationAction(ISD::FRINT, VT, Action);
615 setOperationAction(ISD::BR_CC, VT, Action);
616 setOperationAction(ISD::SETCC, VT, Action);
617 setOperationAction(ISD::SELECT, VT, Custom);
618 setOperationAction(ISD::SELECT_CC, VT, Action);
619 setOperationAction(ISD::FROUND, VT, Action);
620 setOperationAction(ISD::FROUNDEVEN, VT, Action);
621 setOperationAction(ISD::FTRUNC, VT, Action);
622 };
623
624 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
625 // f16, f32 and f64 use SSE.
626 // Set up the FP register classes.
627 addRegisterClass(MVT::f16, Subtarget.hasAVX512() ? &X86::FR16XRegClass
628 : &X86::FR16RegClass);
629 addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
630 : &X86::FR32RegClass);
631 addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
632 : &X86::FR64RegClass);
633
634 // Disable f32->f64 extload as we can only generate this in one instruction
635 // under optsize. So it's easier to pattern match (fpext (load)) for that
636 // case instead of needing to emit 2 instructions for extload in the
637 // non-optsize case.
638 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
639
640 for (auto VT : { MVT::f32, MVT::f64 }) {
641 // Use ANDPD to simulate FABS.
642 setOperationAction(ISD::FABS, VT, Custom);
643
644 // Use XORP to simulate FNEG.
645 setOperationAction(ISD::FNEG, VT, Custom);
646
647 // Use ANDPD and ORPD to simulate FCOPYSIGN.
648 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
649
650 // These might be better off as horizontal vector ops.
651 setOperationAction(ISD::FADD, VT, Custom);
652 setOperationAction(ISD::FSUB, VT, Custom);
653
654 // We don't support sin/cos/fmod
655 setOperationAction(ISD::FSIN , VT, Expand);
656 setOperationAction(ISD::FCOS , VT, Expand);
657 setOperationAction(ISD::FSINCOS, VT, Expand);
658 }
659
660 // Half type will be promoted by default.
661 setF16Action(MVT::f16, Promote);
662 setOperationAction(ISD::FADD, MVT::f16, Promote);
663 setOperationAction(ISD::FSUB, MVT::f16, Promote);
664 setOperationAction(ISD::FMUL, MVT::f16, Promote);
665 setOperationAction(ISD::FDIV, MVT::f16, Promote);
666 setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
667 setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom);
668 setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
669
670 setOperationAction(ISD::STRICT_FADD, MVT::f16, Promote);
671 setOperationAction(ISD::STRICT_FSUB, MVT::f16, Promote);
672 setOperationAction(ISD::STRICT_FMUL, MVT::f16, Promote);
673 setOperationAction(ISD::STRICT_FDIV, MVT::f16, Promote);
674 setOperationAction(ISD::STRICT_FMA, MVT::f16, Promote);
675 setOperationAction(ISD::STRICT_FMINNUM, MVT::f16, Promote);
676 setOperationAction(ISD::STRICT_FMAXNUM, MVT::f16, Promote);
677 setOperationAction(ISD::STRICT_FMINIMUM, MVT::f16, Promote);
678 setOperationAction(ISD::STRICT_FMAXIMUM, MVT::f16, Promote);
679 setOperationAction(ISD::STRICT_FSQRT, MVT::f16, Promote);
680 setOperationAction(ISD::STRICT_FPOW, MVT::f16, Promote);
681 setOperationAction(ISD::STRICT_FLOG, MVT::f16, Promote);
682 setOperationAction(ISD::STRICT_FLOG2, MVT::f16, Promote);
683 setOperationAction(ISD::STRICT_FLOG10, MVT::f16, Promote);
684 setOperationAction(ISD::STRICT_FEXP, MVT::f16, Promote);
685 setOperationAction(ISD::STRICT_FEXP2, MVT::f16, Promote);
686 setOperationAction(ISD::STRICT_FCEIL, MVT::f16, Promote);
687 setOperationAction(ISD::STRICT_FFLOOR, MVT::f16, Promote);
688 setOperationAction(ISD::STRICT_FNEARBYINT, MVT::f16, Promote);
689 setOperationAction(ISD::STRICT_FRINT, MVT::f16, Promote);
690 setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Promote);
691 setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Promote);
692 setOperationAction(ISD::STRICT_FROUND, MVT::f16, Promote);
693 setOperationAction(ISD::STRICT_FROUNDEVEN, MVT::f16, Promote);
694 setOperationAction(ISD::STRICT_FTRUNC, MVT::f16, Promote);
695 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
696 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Custom);
697 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Custom);
698
699 setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2");
700 setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2");
701
702 // Lower this to MOVMSK plus an AND.
703 setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
704 setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
705
706 } else if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1() &&
707 (UseX87 || Is64Bit)) {
708 // Use SSE for f32, x87 for f64.
709 // Set up the FP register classes.
710 addRegisterClass(MVT::f32, &X86::FR32RegClass);
711 if (UseX87)
712 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
713
714 // Use ANDPS to simulate FABS.
715 setOperationAction(ISD::FABS , MVT::f32, Custom);
716
717 // Use XORP to simulate FNEG.
718 setOperationAction(ISD::FNEG , MVT::f32, Custom);
719
720 if (UseX87)
721 setOperationAction(ISD::UNDEF, MVT::f64, Expand);
722
723 // Use ANDPS and ORPS to simulate FCOPYSIGN.
724 if (UseX87)
725 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
726 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
727
728 // We don't support sin/cos/fmod
729 setOperationAction(ISD::FSIN , MVT::f32, Expand);
730 setOperationAction(ISD::FCOS , MVT::f32, Expand);
731 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
732
733 if (UseX87) {
734 // Always expand sin/cos functions even though x87 has an instruction.
735 setOperationAction(ISD::FSIN, MVT::f64, Expand);
736 setOperationAction(ISD::FCOS, MVT::f64, Expand);
737 setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
738 }
739 } else if (UseX87) {
740 // f32 and f64 in x87.
741 // Set up the FP register classes.
742 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
743 addRegisterClass(MVT::f32, &X86::RFP32RegClass);
744
745 for (auto VT : { MVT::f32, MVT::f64 }) {
746 setOperationAction(ISD::UNDEF, VT, Expand);
747 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
748
749 // Always expand sin/cos functions even though x87 has an instruction.
750 setOperationAction(ISD::FSIN , VT, Expand);
751 setOperationAction(ISD::FCOS , VT, Expand);
752 setOperationAction(ISD::FSINCOS, VT, Expand);
753 }
754 }
755
756 // Expand FP32 immediates into loads from the stack, save special cases.
757 if (isTypeLegal(MVT::f32)) {
758 if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
759 addLegalFPImmediate(APFloat(+0.0f)); // FLD0
760 addLegalFPImmediate(APFloat(+1.0f)); // FLD1
761 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
762 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
763 } else // SSE immediates.
764 addLegalFPImmediate(APFloat(+0.0f)); // xorps
765 }
766 // Expand FP64 immediates into loads from the stack, save special cases.
767 if (isTypeLegal(MVT::f64)) {
768 if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {
769 addLegalFPImmediate(APFloat(+0.0)); // FLD0
770 addLegalFPImmediate(APFloat(+1.0)); // FLD1
771 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
772 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
773 } else // SSE immediates.
774 addLegalFPImmediate(APFloat(+0.0)); // xorpd
775 }
776 // Support fp16 0 immediate.
777 if (isTypeLegal(MVT::f16))
778 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEhalf()));
779
780 // Handle constrained floating-point operations of scalar.
781 setOperationAction(ISD::STRICT_FADD, MVT::f32, Legal);
782 setOperationAction(ISD::STRICT_FADD, MVT::f64, Legal);
783 setOperationAction(ISD::STRICT_FSUB, MVT::f32, Legal);
784 setOperationAction(ISD::STRICT_FSUB, MVT::f64, Legal);
785 setOperationAction(ISD::STRICT_FMUL, MVT::f32, Legal);
786 setOperationAction(ISD::STRICT_FMUL, MVT::f64, Legal);
787 setOperationAction(ISD::STRICT_FDIV, MVT::f32, Legal);
788 setOperationAction(ISD::STRICT_FDIV, MVT::f64, Legal);
789 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal);
790 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Legal);
791 setOperationAction(ISD::STRICT_FSQRT, MVT::f32, Legal);
792 setOperationAction(ISD::STRICT_FSQRT, MVT::f64, Legal);
793
794 // We don't support FMA.
795 setOperationAction(ISD::FMA, MVT::f64, Expand);
796 setOperationAction(ISD::FMA, MVT::f32, Expand);
797
798 // f80 always uses X87.
799 if (UseX87) {
800 addRegisterClass(MVT::f80, &X86::RFP80RegClass);
801 setOperationAction(ISD::UNDEF, MVT::f80, Expand);
802 setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
803 {
804 APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
805 addLegalFPImmediate(TmpFlt); // FLD0
806 TmpFlt.changeSign();
807 addLegalFPImmediate(TmpFlt); // FLD0/FCHS
808
809 bool ignored;
810 APFloat TmpFlt2(+1.0);
811 TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
812 &ignored);
813 addLegalFPImmediate(TmpFlt2); // FLD1
814 TmpFlt2.changeSign();
815 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
816 }
817
818 // Always expand sin/cos functions even though x87 has an instruction.
819 setOperationAction(ISD::FSIN , MVT::f80, Expand);
820 setOperationAction(ISD::FCOS , MVT::f80, Expand);
821 setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
822
823 setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
824 setOperationAction(ISD::FCEIL, MVT::f80, Expand);
825 setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
826 setOperationAction(ISD::FRINT, MVT::f80, Expand);
827 setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
828 setOperationAction(ISD::FMA, MVT::f80, Expand);
829 setOperationAction(ISD::LROUND, MVT::f80, Expand);
830 setOperationAction(ISD::LLROUND, MVT::f80, Expand);
831 setOperationAction(ISD::LRINT, MVT::f80, Custom);
832 setOperationAction(ISD::LLRINT, MVT::f80, Custom);
833
834 // Handle constrained floating-point operations of scalar.
835 setOperationAction(ISD::STRICT_FADD , MVT::f80, Legal);
836 setOperationAction(ISD::STRICT_FSUB , MVT::f80, Legal);
837 setOperationAction(ISD::STRICT_FMUL , MVT::f80, Legal);
838 setOperationAction(ISD::STRICT_FDIV , MVT::f80, Legal);
839 setOperationAction(ISD::STRICT_FSQRT , MVT::f80, Legal);
840 if (isTypeLegal(MVT::f16)) {
841 setOperationAction(ISD::FP_EXTEND, MVT::f80, Custom);
842 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Custom);
843 } else {
844 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Legal);
845 }
846 // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
847 // as Custom.
848 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Legal);
849 }
850
851 // f128 uses xmm registers, but most operations require libcalls.
852 if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
853 addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
854 : &X86::VR128RegClass);
855
856 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
857
858 setOperationAction(ISD::FADD, MVT::f128, LibCall);
859 setOperationAction(ISD::STRICT_FADD, MVT::f128, LibCall);
860 setOperationAction(ISD::FSUB, MVT::f128, LibCall);
861 setOperationAction(ISD::STRICT_FSUB, MVT::f128, LibCall);
862 setOperationAction(ISD::FDIV, MVT::f128, LibCall);
863 setOperationAction(ISD::STRICT_FDIV, MVT::f128, LibCall);
864 setOperationAction(ISD::FMUL, MVT::f128, LibCall);
865 setOperationAction(ISD::STRICT_FMUL, MVT::f128, LibCall);
866 setOperationAction(ISD::FMA, MVT::f128, LibCall);
867 setOperationAction(ISD::STRICT_FMA, MVT::f128, LibCall);
868
869 setOperationAction(ISD::FABS, MVT::f128, Custom);
870 setOperationAction(ISD::FNEG, MVT::f128, Custom);
871 setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
872
873 setOperationAction(ISD::FSIN, MVT::f128, LibCall);
874 setOperationAction(ISD::STRICT_FSIN, MVT::f128, LibCall);
875 setOperationAction(ISD::FCOS, MVT::f128, LibCall);
876 setOperationAction(ISD::STRICT_FCOS, MVT::f128, LibCall);
877 setOperationAction(ISD::FSINCOS, MVT::f128, LibCall);
878 // No STRICT_FSINCOS
879 setOperationAction(ISD::FSQRT, MVT::f128, LibCall);
880 setOperationAction(ISD::STRICT_FSQRT, MVT::f128, LibCall);
881
882 setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
883 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f128, Custom);
884 // We need to custom handle any FP_ROUND with an f128 input, but
885 // LegalizeDAG uses the result type to know when to run a custom handler.
886 // So we have to list all legal floating point result types here.
887 if (isTypeLegal(MVT::f32)) {
888 setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
889 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
890 }
891 if (isTypeLegal(MVT::f64)) {
892 setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
893 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom);
894 }
895 if (isTypeLegal(MVT::f80)) {
896 setOperationAction(ISD::FP_ROUND, MVT::f80, Custom);
897 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Custom);
898 }
899
900 setOperationAction(ISD::SETCC, MVT::f128, Custom);
901
902 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
903 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
904 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand);
905 setTruncStoreAction(MVT::f128, MVT::f32, Expand);
906 setTruncStoreAction(MVT::f128, MVT::f64, Expand);
907 setTruncStoreAction(MVT::f128, MVT::f80, Expand);
908 }
909
910 // Always use a library call for pow.
911 setOperationAction(ISD::FPOW , MVT::f32 , Expand);
912 setOperationAction(ISD::FPOW , MVT::f64 , Expand);
913 setOperationAction(ISD::FPOW , MVT::f80 , Expand);
914 setOperationAction(ISD::FPOW , MVT::f128 , Expand);
915
916 setOperationAction(ISD::FLOG, MVT::f80, Expand);
917 setOperationAction(ISD::FLOG2, MVT::f80, Expand);
918 setOperationAction(ISD::FLOG10, MVT::f80, Expand);
919 setOperationAction(ISD::FEXP, MVT::f80, Expand);
920 setOperationAction(ISD::FEXP2, MVT::f80, Expand);
921 setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
922 setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
923
924 // Some FP actions are always expanded for vector types.
925 for (auto VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16,
926 MVT::v4f32, MVT::v8f32, MVT::v16f32,
927 MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
928 setOperationAction(ISD::FSIN, VT, Expand);
929 setOperationAction(ISD::FSINCOS, VT, Expand);
930 setOperationAction(ISD::FCOS, VT, Expand);
931 setOperationAction(ISD::FREM, VT, Expand);
932 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
933 setOperationAction(ISD::FPOW, VT, Expand);
934 setOperationAction(ISD::FLOG, VT, Expand);
935 setOperationAction(ISD::FLOG2, VT, Expand);
936 setOperationAction(ISD::FLOG10, VT, Expand);
937 setOperationAction(ISD::FEXP, VT, Expand);
938 setOperationAction(ISD::FEXP2, VT, Expand);
939 }
940
941 // First set operation action for all vector types to either promote
942 // (for widening) or expand (for scalarization). Then we will selectively
943 // turn on ones that can be effectively codegen'd.
944 for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
945 setOperationAction(ISD::SDIV, VT, Expand);
946 setOperationAction(ISD::UDIV, VT, Expand);
947 setOperationAction(ISD::SREM, VT, Expand);
948 setOperationAction(ISD::UREM, VT, Expand);
949 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
950 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
951 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
952 setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
953 setOperationAction(ISD::FMA, VT, Expand);
954 setOperationAction(ISD::FFLOOR, VT, Expand);
955 setOperationAction(ISD::FCEIL, VT, Expand);
956 setOperationAction(ISD::FTRUNC, VT, Expand);
957 setOperationAction(ISD::FRINT, VT, Expand);
958 setOperationAction(ISD::FNEARBYINT, VT, Expand);
959 setOperationAction(ISD::SMUL_LOHI, VT, Expand);
960 setOperationAction(ISD::MULHS, VT, Expand);
961 setOperationAction(ISD::UMUL_LOHI, VT, Expand);
962 setOperationAction(ISD::MULHU, VT, Expand);
963 setOperationAction(ISD::SDIVREM, VT, Expand);
964 setOperationAction(ISD::UDIVREM, VT, Expand);
965 setOperationAction(ISD::CTPOP, VT, Expand);
966 setOperationAction(ISD::CTTZ, VT, Expand);
967 setOperationAction(ISD::CTLZ, VT, Expand);
968 setOperationAction(ISD::ROTL, VT, Expand);
969 setOperationAction(ISD::ROTR, VT, Expand);
970 setOperationAction(ISD::BSWAP, VT, Expand);
971 setOperationAction(ISD::SETCC, VT, Expand);
972 setOperationAction(ISD::FP_TO_UINT, VT, Expand);
973 setOperationAction(ISD::FP_TO_SINT, VT, Expand);
974 setOperationAction(ISD::UINT_TO_FP, VT, Expand);
975 setOperationAction(ISD::SINT_TO_FP, VT, Expand);
976 setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
977 setOperationAction(ISD::TRUNCATE, VT, Expand);
978 setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
979 setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
980 setOperationAction(ISD::ANY_EXTEND, VT, Expand);
981 setOperationAction(ISD::SELECT_CC, VT, Expand);
982 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
983 setTruncStoreAction(InnerVT, VT, Expand);
984
985 setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
986 setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
987
988 // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
989 // types, we have to deal with them whether we ask for Expansion or not.
990 // Setting Expand causes its own optimisation problems though, so leave
991 // them legal.
992 if (VT.getVectorElementType() == MVT::i1)
993 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
994
995 // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
996 // split/scalarized right now.
997 if (VT.getVectorElementType() == MVT::f16 ||
998 VT.getVectorElementType() == MVT::bf16)
999 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1000 }
1001 }
1002
1003 // FIXME: In order to prevent SSE instructions being expanded to MMX ones
1004 // with -msoft-float, disable use of MMX as well.
1005 if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
1006 addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
1007 // No operations on x86mmx supported, everything uses intrinsics.
1008 }
1009
1010 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
1011 addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1012 : &X86::VR128RegClass);
1013
1014 setOperationAction(ISD::FMAXIMUM, MVT::f32, Custom);
1015 setOperationAction(ISD::FMINIMUM, MVT::f32, Custom);
1016
1017 setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
1018 setOperationAction(ISD::FABS, MVT::v4f32, Custom);
1019 setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom);
1020 setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
1021 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
1022 setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
1023 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
1024 setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
1025
1026 setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
1027 setOperationAction(ISD::STORE, MVT::v2f32, Custom);
1028
1029 setOperationAction(ISD::STRICT_FADD, MVT::v4f32, Legal);
1030 setOperationAction(ISD::STRICT_FSUB, MVT::v4f32, Legal);
1031 setOperationAction(ISD::STRICT_FMUL, MVT::v4f32, Legal);
1032 setOperationAction(ISD::STRICT_FDIV, MVT::v4f32, Legal);
1033 setOperationAction(ISD::STRICT_FSQRT, MVT::v4f32, Legal);
1034 }
1035
1036 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
1037 addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1038 : &X86::VR128RegClass);
1039
1040 // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
1041 // registers cannot be used even for integer operations.
1042 addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
1043 : &X86::VR128RegClass);
1044 addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1045 : &X86::VR128RegClass);
1046 addRegisterClass(MVT::v8f16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1047 : &X86::VR128RegClass);
1048 addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1049 : &X86::VR128RegClass);
1050 addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1051 : &X86::VR128RegClass);
1052
1053 setOperationAction(ISD::FMAXIMUM, MVT::f64, Custom);
1054 setOperationAction(ISD::FMINIMUM, MVT::f64, Custom);
1055
1056 for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
1057 MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
1058 setOperationAction(ISD::SDIV, VT, Custom);
1059 setOperationAction(ISD::SREM, VT, Custom);
1060 setOperationAction(ISD::UDIV, VT, Custom);
1061 setOperationAction(ISD::UREM, VT, Custom);
1062 }
1063
1064 setOperationAction(ISD::MUL, MVT::v2i8, Custom);
1065 setOperationAction(ISD::MUL, MVT::v4i8, Custom);
1066 setOperationAction(ISD::MUL, MVT::v8i8, Custom);
1067
1068 setOperationAction(ISD::MUL, MVT::v16i8, Custom);
1069 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1070 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1071 setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
1072 setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
1073 setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
1074 setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
1075 setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
1076 setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
1077 setOperationAction(ISD::MUL, MVT::v8i16, Legal);
1078 setOperationAction(ISD::AVGCEILU, MVT::v16i8, Legal);
1079 setOperationAction(ISD::AVGCEILU, MVT::v8i16, Legal);
1080
1081 setOperationAction(ISD::SMULO, MVT::v16i8, Custom);
1082 setOperationAction(ISD::UMULO, MVT::v16i8, Custom);
1083 setOperationAction(ISD::UMULO, MVT::v2i32, Custom);
1084
1085 setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
1086 setOperationAction(ISD::FABS, MVT::v2f64, Custom);
1087 setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);
1088
1089 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1090 setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
1091 setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
1092 setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
1093 setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
1094 }
1095
1096 setOperationAction(ISD::ABDU, MVT::v16i8, Custom);
1097 setOperationAction(ISD::ABDU, MVT::v8i16, Custom);
1098 setOperationAction(ISD::ABDS, MVT::v8i16, Custom);
1099
1100 setOperationAction(ISD::UADDSAT, MVT::v16i8, Legal);
1101 setOperationAction(ISD::SADDSAT, MVT::v16i8, Legal);
1102 setOperationAction(ISD::USUBSAT, MVT::v16i8, Legal);
1103 setOperationAction(ISD::SSUBSAT, MVT::v16i8, Legal);
1104 setOperationAction(ISD::UADDSAT, MVT::v8i16, Legal);
1105 setOperationAction(ISD::SADDSAT, MVT::v8i16, Legal);
1106 setOperationAction(ISD::USUBSAT, MVT::v8i16, Legal);
1107 setOperationAction(ISD::SSUBSAT, MVT::v8i16, Legal);
1108 setOperationAction(ISD::USUBSAT, MVT::v4i32, Custom);
1109 setOperationAction(ISD::USUBSAT, MVT::v2i64, Custom);
1110
1111 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
1112 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
1113 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
1114 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
1115
1116 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1117 setOperationAction(ISD::SETCC, VT, Custom);
1118 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
1119 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
1120 setOperationAction(ISD::CTPOP, VT, Custom);
1121 setOperationAction(ISD::ABS, VT, Custom);
1122
1123 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1124 // setcc all the way to isel and prefer SETGT in some isel patterns.
1125 setCondCodeAction(ISD::SETLT, VT, Custom);
1126 setCondCodeAction(ISD::SETLE, VT, Custom);
1127 }
1128
1129 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1130 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1131 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1132 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1133 setOperationAction(ISD::VSELECT, VT, Custom);
1134 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1135 }
1136
1137 for (auto VT : { MVT::v8f16, MVT::v2f64, MVT::v2i64 }) {
1138 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1139 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1140 setOperationAction(ISD::VSELECT, VT, Custom);
1141
1142 if (VT == MVT::v2i64 && !Subtarget.is64Bit())
1143 continue;
1144
1145 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1146 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1147 }
1148 setF16Action(MVT::v8f16, Expand);
1149 setOperationAction(ISD::FADD, MVT::v8f16, Expand);
1150 setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
1151 setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
1152 setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
1153
1154 // Custom lower v2i64 and v2f64 selects.
1155 setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
1156 setOperationAction(ISD::SELECT, MVT::v2i64, Custom);
1157 setOperationAction(ISD::SELECT, MVT::v4i32, Custom);
1158 setOperationAction(ISD::SELECT, MVT::v8i16, Custom);
1159 setOperationAction(ISD::SELECT, MVT::v8f16, Custom);
1160 setOperationAction(ISD::SELECT, MVT::v16i8, Custom);
1161
1162 setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Custom);
1163 setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Custom);
1164 setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
1165 setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
1166 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4i32, Custom);
1167 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i32, Custom);
1168
1169 // Custom legalize these to avoid over promotion or custom promotion.
1170 for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
1171 setOperationAction(ISD::FP_TO_SINT, VT, Custom);
1172 setOperationAction(ISD::FP_TO_UINT, VT, Custom);
1173 setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Custom);
1174 setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom);
1175 }
1176
1177 setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Custom);
1178 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i32, Custom);
1179 setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
1180 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2i32, Custom);
1181
1182 setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
1183 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2i32, Custom);
1184
1185 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
1186 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32, Custom);
1187
1188 // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
1189 setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
1190 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f32, Custom);
1191 setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);
1192 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f32, Custom);
1193
1194 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
1195 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v2f32, Custom);
1196 setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);
1197 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v2f32, Custom);
1198
1199 // We want to legalize this to an f64 load rather than an i64 load on
1200 // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
1201 // store.
1202 setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
1203 setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
1204 setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
1205 setOperationAction(ISD::STORE, MVT::v2i32, Custom);
1206 setOperationAction(ISD::STORE, MVT::v4i16, Custom);
1207 setOperationAction(ISD::STORE, MVT::v8i8, Custom);
1208
1209 // Add 32-bit vector stores to help vectorization opportunities.
1210 setOperationAction(ISD::STORE, MVT::v2i16, Custom);
1211 setOperationAction(ISD::STORE, MVT::v4i8, Custom);
1212
1213 setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
1214 setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
1215 setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
1216 if (!Subtarget.hasAVX512())
1217 setOperationAction(ISD::BITCAST, MVT::v16i1, Custom);
1218
1219 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
1220 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
1221 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);
1222
1223 setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
1224
1225 setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom);
1226 setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom);
1227 setOperationAction(ISD::TRUNCATE, MVT::v2i32, Custom);
1228 setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom);
1229 setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom);
1230 setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom);
1231
1232 // In the customized shift lowering, the legal v4i32/v2i64 cases
1233 // in AVX2 will be recognized.
1234 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1235 setOperationAction(ISD::SRL, VT, Custom);
1236 setOperationAction(ISD::SHL, VT, Custom);
1237 setOperationAction(ISD::SRA, VT, Custom);
1238 if (VT == MVT::v2i64) continue;
1239 setOperationAction(ISD::ROTL, VT, Custom);
1240 setOperationAction(ISD::ROTR, VT, Custom);
1241 setOperationAction(ISD::FSHL, VT, Custom);
1242 setOperationAction(ISD::FSHR, VT, Custom);
1243 }
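// Editorial sketch, not part of the original X86ISelLowering.cpp: how the
// shift actions registered in the loop above are observed later. Generic DAG
// legalization queries the same table and, on Custom, hands the node to
// X86TargetLowering::LowerOperation(). The lambda name is hypothetical and
// the sketch is illustration only.
auto editorSketchIsCustomShift = [&](MVT VT) {
  // True for v16i8/v8i16/v4i32/v2i64 here; AVX2's legal v4i32/v2i64 cases
  // are recognized inside the custom lowering itself, as noted above.
  return getOperationAction(ISD::SHL, VT) == Custom;
};
(void)editorSketchIsCustomShift; // illustration only; never called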
1244
1245 setOperationAction(ISD::STRICT_FSQRT, MVT::v2f64, Legal);
1246 setOperationAction(ISD::STRICT_FADD, MVT::v2f64, Legal);
1247 setOperationAction(ISD::STRICT_FSUB, MVT::v2f64, Legal);
1248 setOperationAction(ISD::STRICT_FMUL, MVT::v2f64, Legal);
1249 setOperationAction(ISD::STRICT_FDIV, MVT::v2f64, Legal);
1250 }
1251
1252 if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
1253 setOperationAction(ISD::ABS, MVT::v16i8, Legal);
1254 setOperationAction(ISD::ABS, MVT::v8i16, Legal);
1255 setOperationAction(ISD::ABS, MVT::v4i32, Legal);
1256 setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
1257 setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
1258 setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
1259 setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
1260 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1261
1262 // These might be better off as horizontal vector ops.
1263 setOperationAction(ISD::ADD, MVT::i16, Custom);
1264 setOperationAction(ISD::ADD, MVT::i32, Custom);
1265 setOperationAction(ISD::SUB, MVT::i16, Custom);
1266 setOperationAction(ISD::SUB, MVT::i32, Custom);
1267 }
1268
1269 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
1270 for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
1271 setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
1272 setOperationAction(ISD::STRICT_FFLOOR, RoundedTy, Legal);
1273 setOperationAction(ISD::FCEIL, RoundedTy, Legal);
1274 setOperationAction(ISD::STRICT_FCEIL, RoundedTy, Legal);
1275 setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
1276 setOperationAction(ISD::STRICT_FTRUNC, RoundedTy, Legal);
1277 setOperationAction(ISD::FRINT, RoundedTy, Legal);
1278 setOperationAction(ISD::STRICT_FRINT, RoundedTy, Legal);
1279 setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
1280 setOperationAction(ISD::STRICT_FNEARBYINT, RoundedTy, Legal);
1281 setOperationAction(ISD::FROUNDEVEN, RoundedTy, Legal);
1282 setOperationAction(ISD::STRICT_FROUNDEVEN, RoundedTy, Legal);
1283
1284 setOperationAction(ISD::FROUND, RoundedTy, Custom);
1285 }
1286
1287 setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
1288 setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
1289 setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
1290 setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
1291 setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
1292 setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
1293 setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
1294 setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
1295
1296 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1297 setOperationAction(ISD::ABDS, VT, Custom);
1298 setOperationAction(ISD::ABDU, VT, Custom);
1299 }
1300
1301 setOperationAction(ISD::UADDSAT, MVT::v4i32, Custom);
1302 setOperationAction(ISD::SADDSAT, MVT::v2i64, Custom);
1303 setOperationAction(ISD::SSUBSAT, MVT::v2i64, Custom);
1304
1305 // FIXME: Do we need to handle scalar-to-vector here?
1306 setOperationAction(ISD::MUL, MVT::v4i32, Legal);
1307 setOperationAction(ISD::SMULO, MVT::v2i32, Custom);
1308
1309 // We directly match byte blends in the backend as they match the VSELECT
1310 // condition form.
1311 setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
1312
1313 // SSE41 brings specific instructions for doing vector sign extend even in
1314 // cases where we don't have SRA.
1315 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1316 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal);
1317 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
1318 }
1319
1320 // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
1321 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1322 setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
1323 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
1324 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
1325 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
1326 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
1327 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
1328 }
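// Editorial sketch, not part of the original X86ISelLowering.cpp: the
// extending-load table filled in above is what isLoadExtLegal() consults;
// for example a sign-extending v8i8 -> v8i16 load maps onto PMOVSXBW.
// Hypothetical lambda, illustration only.
auto editorSketchHasSExtByteLoad = [&]() {
  return isLoadExtLegal(ISD::SEXTLOAD, MVT::v8i16, MVT::v8i8);
};
(void)editorSketchHasSExtByteLoad; // illustration only; never called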
1329
1330 if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
1331 // We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we can
1332 // do the pre and post work in the vector domain.
1333 setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Custom);
1334 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i64, Custom);
1335 // We need to mark SINT_TO_FP as Custom even though we want to expand it
1336 // so that DAG combine doesn't try to turn it into uint_to_fp.
1337 setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Custom);
1338 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i64, Custom);
1339 }
1340 }
1341
1342 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {
1343 setOperationAction(ISD::UADDSAT, MVT::v2i64, Custom);
1344 }
1345
1346 if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
1347 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1348 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1349 setOperationAction(ISD::ROTL, VT, Custom);
1350 setOperationAction(ISD::ROTR, VT, Custom);
1351 }
1352
1353 // XOP can efficiently perform BITREVERSE with VPPERM.
1354 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
1355 setOperationAction(ISD::BITREVERSE, VT, Custom);
1356
1357 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1358 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1359 setOperationAction(ISD::BITREVERSE, VT, Custom);
1360 }
1361
1362 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
1363 bool HasInt256 = Subtarget.hasInt256();
1364
1365 addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
1366 : &X86::VR256RegClass);
1367 addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1368 : &X86::VR256RegClass);
1369 addRegisterClass(MVT::v16f16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1370 : &X86::VR256RegClass);
1371 addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1372 : &X86::VR256RegClass);
1373 addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1374 : &X86::VR256RegClass);
1375 addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1376 : &X86::VR256RegClass);
1377 addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1378 : &X86::VR256RegClass);
1379
1380 for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
1381 setOperationAction(ISD::FFLOOR, VT, Legal);
1382 setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
1383 setOperationAction(ISD::FCEIL, VT, Legal);
1384 setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
1385 setOperationAction(ISD::FTRUNC, VT, Legal);
1386 setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
1387 setOperationAction(ISD::FRINT, VT, Legal);
1388 setOperationAction(ISD::STRICT_FRINT, VT, Legal);
1389 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1390 setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
1391 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
1392 setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal);
1393
1394 setOperationAction(ISD::FROUND, VT, Custom);
1395
1396 setOperationAction(ISD::FNEG, VT, Custom);
1397 setOperationAction(ISD::FABS, VT, Custom);
1398 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1399 }
1400
1401 // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1402 // even though v8i16 is a legal type.
1403 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1404 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1405 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1406 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1407 setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Custom);
1408 setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Custom);
1409 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i32, Custom);
1410
1411 setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Custom);
1412 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i32, Custom);
1413 setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Expand);
1414 setOperationAction(ISD::FP_ROUND, MVT::v8f16, Expand);
1415 setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Custom);
1416 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Custom);
1417
1418 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f32, Legal);
1419 setOperationAction(ISD::STRICT_FADD, MVT::v8f32, Legal);
1420 setOperationAction(ISD::STRICT_FADD, MVT::v4f64, Legal);
1421 setOperationAction(ISD::STRICT_FSUB, MVT::v8f32, Legal);
1422 setOperationAction(ISD::STRICT_FSUB, MVT::v4f64, Legal);
1423 setOperationAction(ISD::STRICT_FMUL, MVT::v8f32, Legal);
1424 setOperationAction(ISD::STRICT_FMUL, MVT::v4f64, Legal);
1425 setOperationAction(ISD::STRICT_FDIV, MVT::v8f32, Legal);
1426 setOperationAction(ISD::STRICT_FDIV, MVT::v4f64, Legal);
1427 setOperationAction(ISD::STRICT_FSQRT, MVT::v8f32, Legal);
1428 setOperationAction(ISD::STRICT_FSQRT, MVT::v4f64, Legal);
1429
1430 if (!Subtarget.hasAVX512())
1431 setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);
1432
1433 // In the customized shift lowering, the legal v8i32/v4i64 cases
1434 // in AVX2 will be recognized.
1435 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1436 setOperationAction(ISD::SRL, VT, Custom);
1437 setOperationAction(ISD::SHL, VT, Custom);
1438 setOperationAction(ISD::SRA, VT, Custom);
1439 setOperationAction(ISD::ABDS, VT, Custom);
1440 setOperationAction(ISD::ABDU, VT, Custom);
1441 if (VT == MVT::v4i64) continue;
1442 setOperationAction(ISD::ROTL, VT, Custom);
1443 setOperationAction(ISD::ROTR, VT, Custom);
1444 setOperationAction(ISD::FSHL, VT, Custom);
1445 setOperationAction(ISD::FSHR, VT, Custom);
1446 }
1447
1448 // These types need custom splitting if their input is a 128-bit vector.
1449 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
1450 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
1451 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
1452 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
1453
1454 setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
1455 setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
1456 setOperationAction(ISD::SELECT, MVT::v8i32, Custom);
1457 setOperationAction(ISD::SELECT, MVT::v16i16, Custom);
1458 setOperationAction(ISD::SELECT, MVT::v16f16, Custom);
1459 setOperationAction(ISD::SELECT, MVT::v32i8, Custom);
1460 setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
1461
1462 for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1463 setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
1464 setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
1465 setOperationAction(ISD::ANY_EXTEND, VT, Custom);
1466 }
1467
1468 setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
1469 setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
1470 setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
1471 setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);
1472
1473 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1474 setOperationAction(ISD::SETCC, VT, Custom);
1475 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
1476 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
1477 setOperationAction(ISD::CTPOP, VT, Custom);
1478 setOperationAction(ISD::CTLZ, VT, Custom);
1479
1480 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1481 // setcc all the way to isel and prefer SETGT in some isel patterns.
1482 setCondCodeAction(ISD::SETLT, VT, Custom);
1483 setCondCodeAction(ISD::SETLE, VT, Custom);
1484 }
1485
1486 if (Subtarget.hasAnyFMA()) {
1487 for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1488 MVT::v2f64, MVT::v4f64 }) {
1489 setOperationAction(ISD::FMA, VT, Legal);
1490 setOperationAction(ISD::STRICT_FMA, VT, Legal);
1491 }
1492 }
1493
1494 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1495 setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1496 setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1497 }
1498
1499 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1500 setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
1501 setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
1502 setOperationAction(ISD::MUL, MVT::v32i8, Custom);
1503
1504 setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
1505 setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
1506 setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
1507 setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
1508 setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
1509 setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
1510 setOperationAction(ISD::AVGCEILU, MVT::v16i16, HasInt256 ? Legal : Custom);
1511 setOperationAction(ISD::AVGCEILU, MVT::v32i8, HasInt256 ? Legal : Custom);
1512
1513 setOperationAction(ISD::SMULO, MVT::v32i8, Custom);
1514 setOperationAction(ISD::UMULO, MVT::v32i8, Custom);
1515
1516 setOperationAction(ISD::ABS, MVT::v4i64, Custom);
1517 setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
1518 setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
1519 setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
1520 setOperationAction(ISD::UMIN, MVT::v4i64, Custom);
1521
1522 setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1523 setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1524 setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1525 setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1526 setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1527 setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1528 setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1529 setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1530 setOperationAction(ISD::UADDSAT, MVT::v8i32, Custom);
1531 setOperationAction(ISD::USUBSAT, MVT::v8i32, Custom);
1532 setOperationAction(ISD::UADDSAT, MVT::v4i64, Custom);
1533 setOperationAction(ISD::USUBSAT, MVT::v4i64, Custom);
1534
1535 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1536 setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
1537 setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1538 setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1539 setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1540 setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1541 }
1542
1543 for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
1544 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
1545 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
1546 }
1547
1548 if (HasInt256) {
1549 // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1550 // when we have a 256-bit wide blend with immediate.
1551 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
1552 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32, Custom);
1553
1554 // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1555 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1556 setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
1557 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
1558 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
1559 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
1560 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
1561 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
1562 }
1563 }
1564
1565 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1566 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1567 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1568 setOperationAction(ISD::MSTORE, VT, Legal);
1569 }
1570
1571 // Extract subvector is special because the value type
1572 // (result) is 128-bit but the source is 256-bit wide.
1573 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1574 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1575 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1576 }
1577
1578 // Custom lower several nodes for 256-bit types.
1579 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1580 MVT::v16f16, MVT::v8f32, MVT::v4f64 }) {
1581 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1582 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1583 setOperationAction(ISD::VSELECT, VT, Custom);
1584 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1585 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1586 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1587 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
1588 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1589 setOperationAction(ISD::STORE, VT, Custom);
1590 }
1591 setF16Action(MVT::v16f16, Expand);
1592 setOperationAction(ISD::FADD, MVT::v16f16, Expand);
1593 setOperationAction(ISD::FSUB, MVT::v16f16, Expand);
1594 setOperationAction(ISD::FMUL, MVT::v16f16, Expand);
1595 setOperationAction(ISD::FDIV, MVT::v16f16, Expand);
1596
1597 if (HasInt256) {
1598 setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
1599
1600 // Custom legalize 2x32 to get a little better code.
1601 setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
1602 setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);
1603
1604 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1605 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1606 setOperationAction(ISD::MGATHER, VT, Custom);
1607 }
1608 }
1609
1610 if (!Subtarget.useSoftFloat() && !Subtarget.hasFP16() &&
1611 Subtarget.hasF16C()) {
1612 for (MVT VT : { MVT::f16, MVT::v2f16, MVT::v4f16, MVT::v8f16 }) {
1613 setOperationAction(ISD::FP_ROUND, VT, Custom);
1614 setOperationAction(ISD::STRICT_FP_ROUND, VT, Custom);
1615 }
1616 for (MVT VT : { MVT::f32, MVT::v2f32, MVT::v4f32 }) {
1617 setOperationAction(ISD::FP_EXTEND, VT, Custom);
1618 setOperationAction(ISD::STRICT_FP_EXTEND, VT, Custom);
1619 }
1620 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
1621 setOperationPromotedToType(Opc, MVT::v8f16, MVT::v8f32);
1622 setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
1623 }
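// Editorial sketch, not part of the original X86ISelLowering.cpp:
// setOperationPromotedToType records both the Promote action and the wider
// type, so a v8f16 FADD is widened to v8f32, performed there (VCVTPH2PS /
// VADDPS / VCVTPS2PH under F16C), and rounded back. Hypothetical lambda,
// illustration only.
auto editorSketchHalfAddPromotesTo = [&]() {
  return getTypeToPromoteTo(ISD::FADD, MVT::v8f16); // MVT::v8f32
};
(void)editorSketchHalfAddPromotesTo; // illustration only; never called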
1624
1625 setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal);
1626 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f32, Legal);
1627 }
1628
1629 // This block controls legalization of the mask vector sizes that are
1630 // available with AVX512. 512-bit vectors are in a separate block controlled
1631 // by useAVX512Regs.
1632 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1633 addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
1634 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1635 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1636 addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1637 addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1638
1639 setOperationAction(ISD::SELECT, MVT::v1i1, Custom);
1640 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
1641 setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);
1642
1643 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1644 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1645 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1646 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1647 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1648 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1649 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1650 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1651 setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom);
1652 setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom);
1653 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i1, Custom);
1654 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i1, Custom);
1655
1656 // There is no byte-sized k-register load or store without AVX512DQ.
1657 if (!Subtarget.hasDQI()) {
1658 setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
1659 setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
1660 setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
1661 setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
1662
1663 setOperationAction(ISD::STORE, MVT::v1i1, Custom);
1664 setOperationAction(ISD::STORE, MVT::v2i1, Custom);
1665 setOperationAction(ISD::STORE, MVT::v4i1, Custom);
1666 setOperationAction(ISD::STORE, MVT::v8i1, Custom);
1667 }
1668
1669 // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
1670 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1671 setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
1672 setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
1673 setOperationAction(ISD::ANY_EXTEND, VT, Custom);
1674 }
1675
1676 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 })
1677 setOperationAction(ISD::VSELECT, VT, Expand);
1678
1679 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
1680 setOperationAction(ISD::SETCC, VT, Custom);
1681 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
1682 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
1683 setOperationAction(ISD::SELECT, VT, Custom);
1684 setOperationAction(ISD::TRUNCATE, VT, Custom);
1685
1686 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1687 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1688 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1689 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1690 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1691 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1692 }
1693
1694 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
1695 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1696 }
1697
1698 // This block controls legalization for 512-bit operations with 8/16/32/64-bit
1699 // elements. 512-bit operations can be disabled based on prefer-vector-width and
1700 // required-vector-width function attributes.
1701 if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
1702 bool HasBWI = Subtarget.hasBWI();
1703
1704 addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1705 addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1706 addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1707 addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1708 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1709 addRegisterClass(MVT::v32f16, &X86::VR512RegClass);
1710 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1711
1712 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1713 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
1714 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1715 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
1716 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
1717 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
1718 if (HasBWI)
1719 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1720 }
1721
1722 for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1723 setOperationAction(ISD::FNEG, VT, Custom);
1724 setOperationAction(ISD::FABS, VT, Custom);
1725 setOperationAction(ISD::FMA, VT, Legal);
1726 setOperationAction(ISD::STRICT_FMA, VT, Legal);
1727 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1728 }
1729
1730 for (MVT VT : { MVT::v16i1, MVT::v16i8 }) {
1731 setOperationPromotedToType(ISD::FP_TO_SINT , VT, MVT::v16i32);
1732 setOperationPromotedToType(ISD::FP_TO_UINT , VT, MVT::v16i32);
1733 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, VT, MVT::v16i32);
1734 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, VT, MVT::v16i32);
1735 }
1736
1737 for (MVT VT : { MVT::v16i16, MVT::v16i32 }) {
1738 setOperationAction(ISD::FP_TO_SINT, VT, Custom);
1739 setOperationAction(ISD::FP_TO_UINT, VT, Custom);
1740 setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Custom);
1741 setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom);
1742 }
1743
1744 setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Custom);
1745 setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Custom);
1746 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i32, Custom);
1747 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i32, Custom);
1748 setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Custom);
1749 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Custom);
1750
1751 setOperationAction(ISD::STRICT_FADD, MVT::v16f32, Legal);
1752 setOperationAction(ISD::STRICT_FADD, MVT::v8f64, Legal);
1753 setOperationAction(ISD::STRICT_FSUB, MVT::v16f32, Legal);
1754 setOperationAction(ISD::STRICT_FSUB, MVT::v8f64, Legal);
1755 setOperationAction(ISD::STRICT_FMUL, MVT::v16f32, Legal);
1756 setOperationAction(ISD::STRICT_FMUL, MVT::v8f64, Legal);
1757 setOperationAction(ISD::STRICT_FDIV, MVT::v16f32, Legal);
1758 setOperationAction(ISD::STRICT_FDIV, MVT::v8f64, Legal);
1759 setOperationAction(ISD::STRICT_FSQRT, MVT::v16f32, Legal);
1760 setOperationAction(ISD::STRICT_FSQRT, MVT::v8f64, Legal);
1761 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f32, Legal);
1762
1763 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
1764 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
1765 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
1766 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
1767 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
1768 if (HasBWI)
1769 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1770
1771 // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
1772 // to 512-bit rather than use the AVX2 instructions so that we can use
1773 // k-masks.
1774 if (!Subtarget.hasVLX()) {
1775 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1776 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1777 setOperationAction(ISD::MLOAD, VT, Custom);
1778 setOperationAction(ISD::MSTORE, VT, Custom);
1779 }
1780 }
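// Editorial sketch, not part of the original X86ISelLowering.cpp: on
// subtargets without VLX the masked-load action registered just above is
// Custom, and the custom lowering widens the operation to 512 bits so a
// k-register mask can be used; with VLX the earlier AVX block already made
// it Legal. Hypothetical lambda, illustration only.
auto editorSketchMaskedLoadIsWidened = [&]() {
  return !Subtarget.hasVLX() &&
         getOperationAction(ISD::MLOAD, MVT::v8f32) == Custom;
};
(void)editorSketchMaskedLoadIsWidened; // illustration only; never called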
1781
1782 setOperationAction(ISD::TRUNCATE, MVT::v8i32, Legal);
1783 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal);
1784 setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom);
1785 setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
1786 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
1787 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
1788 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
1789 setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
1790 setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
1791 setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
1792 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
1793 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
1794 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
1795
1796 if (HasBWI) {
1797 // Extends from v64i1 masks to 512-bit vectors.
1798 setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
1799 setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
1800 setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom);
1801 }
1802
1803 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1804 setOperationAction(ISD::FFLOOR, VT, Legal);
1805 setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
1806 setOperationAction(ISD::FCEIL, VT, Legal);
1807 setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
1808 setOperationAction(ISD::FTRUNC, VT, Legal);
1809 setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
1810 setOperationAction(ISD::FRINT, VT, Legal);
1811 setOperationAction(ISD::STRICT_FRINT, VT, Legal);
1812 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1813 setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
1814 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
1815 setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal);
1816
1817 setOperationAction(ISD::FROUND, VT, Custom);
1818 }
1819
1820 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
1821 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
1822 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
1823 }
1824
1825 setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom);
1826 setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom);
1827 setOperationAction(ISD::ADD, MVT::v64i8, HasBWI ? Legal : Custom);
1828 setOperationAction(ISD::SUB, MVT::v64i8, HasBWI ? Legal : Custom);
1829
1830 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1831 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1832 setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom);
1833 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1834
1835 setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
1836 setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
1837 setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom);
1838 setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom);
1839 setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
1840 setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
1841 setOperationAction(ISD::AVGCEILU, MVT::v32i16, HasBWI ? Legal : Custom);
1842 setOperationAction(ISD::AVGCEILU, MVT::v64i8, HasBWI ? Legal : Custom);
1843
1844 setOperationAction(ISD::SMULO, MVT::v64i8, Custom);
1845 setOperationAction(ISD::UMULO, MVT::v64i8, Custom);
1846
1847 setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);
1848
1849 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1850 setOperationAction(ISD::SRL, VT, Custom);
1851 setOperationAction(ISD::SHL, VT, Custom);
1852 setOperationAction(ISD::SRA, VT, Custom);
1853 setOperationAction(ISD::ROTL, VT, Custom);
1854 setOperationAction(ISD::ROTR, VT, Custom);
1855 setOperationAction(ISD::SETCC, VT, Custom);
1856 setOperationAction(ISD::ABDS, VT, Custom);
1857 setOperationAction(ISD::ABDU, VT, Custom);
1858
1859 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1860 // setcc all the way to isel and prefer SETGT in some isel patterns.
1861 setCondCodeAction(ISD::SETLT, VT, Custom);
1862 setCondCodeAction(ISD::SETLE, VT, Custom);
1863 }
1864 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1865 setOperationAction(ISD::SMAX, VT, Legal);
1866 setOperationAction(ISD::UMAX, VT, Legal);
1867 setOperationAction(ISD::SMIN, VT, Legal);
1868 setOperationAction(ISD::UMIN, VT, Legal);
1869 setOperationAction(ISD::ABS, VT, Legal);
1870 setOperationAction(ISD::CTPOP, VT, Custom);
1871 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
1872 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
1873 }
1874
1875 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1876 setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom);
1877 setOperationAction(ISD::CTPOP, VT, Subtarget.hasBITALG() ? Legal : Custom);
1878 setOperationAction(ISD::CTLZ, VT, Custom);
1879 setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom);
1880 setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom);
1881 setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom);
1882 setOperationAction(ISD::UMIN, VT, HasBWI ? Legal : Custom);
1883 setOperationAction(ISD::UADDSAT, VT, HasBWI ? Legal : Custom);
1884 setOperationAction(ISD::SADDSAT, VT, HasBWI ? Legal : Custom);
1885 setOperationAction(ISD::USUBSAT, VT, HasBWI ? Legal : Custom);
1886 setOperationAction(ISD::SSUBSAT, VT, HasBWI ? Legal : Custom);
1887 }
1888
1889 setOperationAction(ISD::FSHL, MVT::v64i8, Custom);
1890 setOperationAction(ISD::FSHR, MVT::v64i8, Custom);
1891 setOperationAction(ISD::FSHL, MVT::v32i16, Custom);
1892 setOperationAction(ISD::FSHR, MVT::v32i16, Custom);
1893 setOperationAction(ISD::FSHL, MVT::v16i32, Custom);
1894 setOperationAction(ISD::FSHR, MVT::v16i32, Custom);
1895
1896 if (Subtarget.hasDQI()) {
1897 for (auto Opc : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP,
1898 ISD::STRICT_UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT,
1899 ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT})
1900 setOperationAction(Opc, MVT::v8i64, Custom);
1901 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
1902 }
1903
1904 if (Subtarget.hasCDI()) {
1905 // Non-VLX subtargets extend 128/256-bit vectors to use the 512-bit version.
1906 for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
1907 setOperationAction(ISD::CTLZ, VT, Legal);
1908 }
1909 } // Subtarget.hasCDI()
1910
1911 if (Subtarget.hasVPOPCNTDQ()) {
1912 for (auto VT : { MVT::v16i32, MVT::v8i64 })
1913 setOperationAction(ISD::CTPOP, VT, Legal);
1914 }
1915
1916 // Extract subvector is special because the value type
1917 // (result) is 256-bit but the source is 512-bit wide.
1918 // 128-bit extracts were already made Legal under AVX1.
1919 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1920 MVT::v16f16, MVT::v8f32, MVT::v4f64 })
1921 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1922
1923 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
1924 MVT::v32f16, MVT::v16f32, MVT::v8f64 }) {
1925 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1926 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
1927 setOperationAction(ISD::SELECT, VT, Custom);
1928 setOperationAction(ISD::VSELECT, VT, Custom);
1929 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1930 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1931 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1932 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1933 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1934 }
1935 setF16Action(MVT::v32f16, Expand);
1936 setOperationAction(ISD::FP_ROUND, MVT::v16f16, Custom);
1937 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f16, Custom);
1938 setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Legal);
1939 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v16f32, Legal);
1940 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
1941 setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
1942 setOperationPromotedToType(Opc, MVT::v32f16, MVT::v32f32);
1943 }
1944
1945 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
1946 setOperationAction(ISD::MLOAD, VT, Legal);
1947 setOperationAction(ISD::MSTORE, VT, Legal);
1948 setOperationAction(ISD::MGATHER, VT, Custom);
1949 setOperationAction(ISD::MSCATTER, VT, Custom);
1950 }
1951 if (HasBWI) {
1952 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1953 setOperationAction(ISD::MLOAD, VT, Legal);
1954 setOperationAction(ISD::MSTORE, VT, Legal);
1955 }
1956 } else {
1957 setOperationAction(ISD::STORE, MVT::v32i16, Custom);
1958 setOperationAction(ISD::STORE, MVT::v64i8, Custom);
1959 }
1960
1961 if (Subtarget.hasVBMI2()) {
1962 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64,
1963 MVT::v16i16, MVT::v8i32, MVT::v4i64,
1964 MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1965 setOperationAction(ISD::FSHL, VT, Custom);
1966 setOperationAction(ISD::FSHR, VT, Custom);
1967 }
1968
1969 setOperationAction(ISD::ROTL, MVT::v32i16, Custom);
1970 setOperationAction(ISD::ROTR, MVT::v8i16, Custom);
1971 setOperationAction(ISD::ROTR, MVT::v16i16, Custom);
1972 setOperationAction(ISD::ROTR, MVT::v32i16, Custom);
1973 }
1974 } // useAVX512Regs
1975
1976 // This block controls legalization for operations that don't have
1977 // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
1978 // narrower widths.
1979 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1980 // These operations are handled on non-VLX by artificially widening in
1981 // isel patterns.
1982
1983 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i32, Custom);
1984 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32, Custom);
1985 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i32, Custom);
1986
1987 if (Subtarget.hasDQI()) {
1988 // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
1989 // v2f32 UINT_TO_FP is already custom under SSE2.
1990 assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) &&
1991 isOperationCustom(ISD::STRICT_UINT_TO_FP, MVT::v2f32) &&
1992 "Unexpected operation action!");
1993 // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
1994 setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
1995 setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
1996 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f32, Custom);
1997 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f32, Custom);
1998 }
1999
2000 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
2001 setOperationAction(ISD::SMAX, VT, Legal);
2002 setOperationAction(ISD::UMAX, VT, Legal);
2003 setOperationAction(ISD::SMIN, VT, Legal);
2004 setOperationAction(ISD::UMIN, VT, Legal);
2005 setOperationAction(ISD::ABS, VT, Legal);
2006 }
2007
2008 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2009 setOperationAction(ISD::ROTL, VT, Custom);
2010 setOperationAction(ISD::ROTR, VT, Custom);
2011 }
2012
2013 // Custom legalize 2x32 to get a little better code.
2014 setOperationAction(ISD::MSCATTER, MVT::v2f32, Custom);
2015 setOperationAction(ISD::MSCATTER, MVT::v2i32, Custom);
2016
2017 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
2018 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
2019 setOperationAction(ISD::MSCATTER, VT, Custom);
2020
2021 if (Subtarget.hasDQI()) {
2022 for (auto Opc : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP,
2023 ISD::STRICT_UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT,
2024 ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT}) {
2025 setOperationAction(Opc, MVT::v2i64, Custom);
2026 setOperationAction(Opc, MVT::v4i64, Custom);
2027 }
2028 setOperationAction(ISD::MUL, MVT::v2i64, Legal);
2029 setOperationAction(ISD::MUL, MVT::v4i64, Legal);
2030 }
2031
2032 if (Subtarget.hasCDI()) {
2033 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2034 setOperationAction(ISD::CTLZ, VT, Legal);
2035 }
2036 } // Subtarget.hasCDI()
2037
2038 if (Subtarget.hasVPOPCNTDQ()) {
2039 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
2040 setOperationAction(ISD::CTPOP, VT, Legal);
2041 }
2042 }
2043
2044 // This block controls legalization of v32i1/v64i1, which are available with
2045 // AVX512BW.
2046 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
2047 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
2048 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
2049
2050 for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
2051 setOperationAction(ISD::VSELECT, VT, Expand);
2052 setOperationAction(ISD::TRUNCATE, VT, Custom);
2053 setOperationAction(ISD::SETCC, VT, Custom);
2054 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
2055 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
2056 setOperationAction(ISD::SELECT, VT, Custom);
2057 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
2058 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
2059 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
2060 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
2061 }
2062
2063 for (auto VT : { MVT::v16i1, MVT::v32i1 })
2064 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
2065
2066 // Extends from v32i1 masks to 256-bit vectors.
2067 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
2068 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
2069 setOperationAction(ISD::ANY_EXTEND, MVT::v32i8, Custom);
2070
2071 for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
2072 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
2073 setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
2074 }
2075
2076 // These operations are handled on non-VLX by artificially widening in
2077 // isel patterns.
2078 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
2079
2080 if (Subtarget.hasBITALG()) {
2081 for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
2082 setOperationAction(ISD::CTPOP, VT, Legal);
2083 }
2084 }
2085
2086 if (!Subtarget.useSoftFloat() && Subtarget.hasFP16()) {
2087 auto setGroup = [&] (MVT VT) {
2088 setOperationAction(ISD::FADD, VT, Legal);
2089 setOperationAction(ISD::STRICT_FADD, VT, Legal);
2090 setOperationAction(ISD::FSUB, VT, Legal);
2091 setOperationAction(ISD::STRICT_FSUB, VT, Legal);
2092 setOperationAction(ISD::FMUL, VT, Legal);
2093 setOperationAction(ISD::STRICT_FMUL, VT, Legal);
2094 setOperationAction(ISD::FDIV, VT, Legal);
2095 setOperationAction(ISD::STRICT_FDIV, VT, Legal);
2096 setOperationAction(ISD::FSQRT, VT, Legal);
2097 setOperationAction(ISD::STRICT_FSQRT, VT, Legal);
2098
2099 setOperationAction(ISD::FFLOOR, VT, Legal);
2100 setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
2101 setOperationAction(ISD::FCEIL, VT, Legal);
2102 setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
2103 setOperationAction(ISD::FTRUNC, VT, Legal);
2104 setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
2105 setOperationAction(ISD::FRINT, VT, Legal);
2106 setOperationAction(ISD::STRICT_FRINT, VT, Legal);
2107 setOperationAction(ISD::FNEARBYINT, VT, Legal);
2108 setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
2109
2110 setOperationAction(ISD::FROUND, VT, Custom);
2111
2112 setOperationAction(ISD::LOAD, VT, Legal);
2113 setOperationAction(ISD::STORE, VT, Legal);
2114
2115 setOperationAction(ISD::FMA, VT, Legal);
2116 setOperationAction(ISD::STRICT_FMA, VT, Legal);
2117 setOperationAction(ISD::VSELECT, VT, Legal);
2118 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
2119 setOperationAction(ISD::SELECT, VT, Custom);
2120
2121 setOperationAction(ISD::FNEG, VT, Custom);
2122 setOperationAction(ISD::FABS, VT, Custom);
2123 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
2124 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
2125 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
2126 };
2127
2128 // AVX512_FP16 scalar operations
2129 setGroup(MVT::f16);
2130 setOperationAction(ISD::FREM, MVT::f16, Promote);
2131 setOperationAction(ISD::STRICT_FREM, MVT::f16, Promote);
2132 setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);
2133 setOperationAction(ISD::BR_CC, MVT::f16, Expand);
2134 setOperationAction(ISD::SETCC, MVT::f16, Custom);
2135 setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Custom);
2136 setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Custom);
2137 setOperationAction(ISD::STRICT_FROUND, MVT::f16, Promote);
2138 setOperationAction(ISD::FROUNDEVEN, MVT::f16, Legal);
2139 setOperationAction(ISD::STRICT_FROUNDEVEN, MVT::f16, Legal);
2140 setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
2141 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
2142 setOperationAction(ISD::FMAXIMUM, MVT::f16, Custom);
2143 setOperationAction(ISD::FMINIMUM, MVT::f16, Custom);
2144 setOperationAction(ISD::FP_EXTEND, MVT::f32, Legal);
2145 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Legal);
2146
2147 setCondCodeAction(ISD::SETOEQ, MVT::f16, Expand);
2148 setCondCodeAction(ISD::SETUNE, MVT::f16, Expand);
2149
2150 if (Subtarget.useAVX512Regs()) {
2151 setGroup(MVT::v32f16);
2152 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32f16, Custom);
2153 setOperationAction(ISD::SINT_TO_FP, MVT::v32i16, Legal);
2154 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v32i16, Legal);
2155 setOperationAction(ISD::UINT_TO_FP, MVT::v32i16, Legal);
2156 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v32i16, Legal);
2157 setOperationAction(ISD::FP_ROUND, MVT::v16f16, Legal);
2158 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f16, Legal);
2159 setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Legal);
2160 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v16f32, Legal);
2161 setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Legal);
2162 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Legal);
2163 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32f16, Custom);
2164
2165 setOperationAction(ISD::FP_TO_SINT, MVT::v32i16, Custom);
2166 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v32i16, Custom);
2167 setOperationAction(ISD::FP_TO_UINT, MVT::v32i16, Custom);
2168 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v32i16, Custom);
2169 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i8, MVT::v32i16);
2170 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v32i8,
2171 MVT::v32i16);
2172 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i8, MVT::v32i16);
2173 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v32i8,
2174 MVT::v32i16);
2175 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i1, MVT::v32i16);
2176 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v32i1,
2177 MVT::v32i16);
2178 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i1, MVT::v32i16);
2179 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v32i1,
2180 MVT::v32i16);
2181
2182 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f16, Legal);
2183 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32f16, Legal);
2184 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32f16, Custom);
2185
2186 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Legal);
2187 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Legal);
2188
2189 setOperationAction(ISD::STRICT_FSETCC, MVT::v32i1, Custom);
2190 setOperationAction(ISD::STRICT_FSETCCS, MVT::v32i1, Custom);
2191 }
2192
2193 if (Subtarget.hasVLX()) {
2194 setGroup(MVT::v8f16);
2195 setGroup(MVT::v16f16);
2196
2197 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8f16, Legal);
2198 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16f16, Custom);
2199 setOperationAction(ISD::SINT_TO_FP, MVT::v16i16, Legal);
2200 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i16, Legal);
2201 setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Legal);
2202 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i16, Legal);
2203 setOperationAction(ISD::UINT_TO_FP, MVT::v16i16, Legal);
2204 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i16, Legal);
2205 setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Legal);
2206 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i16, Legal);
2207
2208 setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom);
2209 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i16, Custom);
2210 setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Custom);
2211 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i16, Custom);
2212 setOperationAction(ISD::FP_ROUND, MVT::v8f16, Legal);
2213 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f16, Legal);
2214 setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal);
2215 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f32, Legal);
2216 setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Legal);
2217 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Legal);
2218
2219 // INSERT_VECTOR_ELT v8f16 extended to VECTOR_SHUFFLE
2220 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8f16, Custom);
2221 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16f16, Custom);
2222
2223 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f16, Legal);
2224 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16f16, Legal);
2225 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f16, Custom);
2226
2227 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Legal);
2228 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Legal);
2229 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Legal);
2230 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Legal);
2231
2232 // Need to custom widen these to prevent scalarization.
2233 setOperationAction(ISD::LOAD, MVT::v4f16, Custom);
2234 setOperationAction(ISD::STORE, MVT::v4f16, Custom);
2235 }
2236 }
2237
2238 if (!Subtarget.useSoftFloat() &&
2239 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16())) {
2240 addRegisterClass(MVT::v8bf16, &X86::VR128XRegClass);
2241 addRegisterClass(MVT::v16bf16, &X86::VR256XRegClass);
2242 // We set the type action of bf16 to TypeSoftPromoteHalf, but we don't
2243 // provide the method to promote BUILD_VECTOR. Set the operation action to
2244 // Custom to do the customization later.
2245 setOperationAction(ISD::BUILD_VECTOR, MVT::bf16, Custom);
2246 for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2247 setF16Action(VT, Expand);
2248 setOperationAction(ISD::FADD, VT, Expand);
2249 setOperationAction(ISD::FSUB, VT, Expand);
2250 setOperationAction(ISD::FMUL, VT, Expand);
2251 setOperationAction(ISD::FDIV, VT, Expand);
2252 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
2253 }
2254 addLegalFPImmediate(APFloat::getZero(APFloat::BFloat()));
2255 }
2256
2257 if (!Subtarget.useSoftFloat() && Subtarget.hasBF16()) {
2258 addRegisterClass(MVT::v32bf16, &X86::VR512RegClass);
2259 setF16Action(MVT::v32bf16, Expand);
2260 setOperationAction(ISD::FADD, MVT::v32bf16, Expand);
2261 setOperationAction(ISD::FSUB, MVT::v32bf16, Expand);
2262 setOperationAction(ISD::FMUL, MVT::v32bf16, Expand);
2263 setOperationAction(ISD::FDIV, MVT::v32bf16, Expand);
2264 setOperationAction(ISD::BUILD_VECTOR, MVT::v32bf16, Custom);
2265 }
2266
2267 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
2268 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
2269 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
2270 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
2271 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
2272 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
2273
2274 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
2275 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
2276 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
2277 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
2278 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
2279
2280 if (Subtarget.hasBWI()) {
2281 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
2282 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
2283 }
2284
2285 if (Subtarget.hasFP16()) {
2286 // vcvttph2[u]dq v4f16 -> v4i32/64, v2f16 -> v2i32/64
2287 setOperationAction(ISD::FP_TO_SINT, MVT::v2f16, Custom);
2288 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f16, Custom);
2289 setOperationAction(ISD::FP_TO_UINT, MVT::v2f16, Custom);
2290 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f16, Custom);
2291 setOperationAction(ISD::FP_TO_SINT, MVT::v4f16, Custom);
2292 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4f16, Custom);
2293 setOperationAction(ISD::FP_TO_UINT, MVT::v4f16, Custom);
2294 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4f16, Custom);
2295 // vcvt[u]dq2ph v4i32/64 -> v4f16, v2i32/64 -> v2f16
2296 setOperationAction(ISD::SINT_TO_FP, MVT::v2f16, Custom);
2297 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f16, Custom);
2298 setOperationAction(ISD::UINT_TO_FP, MVT::v2f16, Custom);
2299 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f16, Custom);
2300 setOperationAction(ISD::SINT_TO_FP, MVT::v4f16, Custom);
2301 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4f16, Custom);
2302 setOperationAction(ISD::UINT_TO_FP, MVT::v4f16, Custom);
2303 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4f16, Custom);
2304 // vcvtps2phx v4f32 -> v4f16, v2f32 -> v2f16
2305 setOperationAction(ISD::FP_ROUND, MVT::v2f16, Custom);
2306 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v2f16, Custom);
2307 setOperationAction(ISD::FP_ROUND, MVT::v4f16, Custom);
2308 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f16, Custom);
2309 // vcvtph2psx v4f16 -> v4f32, v2f16 -> v2f32
2310 setOperationAction(ISD::FP_EXTEND, MVT::v2f16, Custom);
2311 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v2f16, Custom);
2312 setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Custom);
2313 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f16, Custom);
2314 }
2315
2316 setOperationAction(ISD::TRUNCATE, MVT::v16i32, Custom);
2317 setOperationAction(ISD::TRUNCATE, MVT::v8i64, Custom);
2318 setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
2319 }
2320
2321 if (Subtarget.hasAMXTILE()) {
2322 addRegisterClass(MVT::x86amx, &X86::TILERegClass);
2323 }
2324
2325 // We want to custom lower some of our intrinsics.
2326 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
2327 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
2328 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
2329 if (!Subtarget.is64Bit()) {
2330 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
2331 }
2332
2333 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
2334 // handle type legalization for these operations here.
2335 //
2336 // FIXME: We really should do custom legalization for addition and
2337 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
2338 // than generic legalization for 64-bit multiplication-with-overflow, though.
2339 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
2340 if (VT == MVT::i64 && !Subtarget.is64Bit())
2341 continue;
2342 // Add/Sub/Mul with overflow operations are custom lowered.
2343 setOperationAction(ISD::SADDO, VT, Custom);
2344 setOperationAction(ISD::UADDO, VT, Custom);
2345 setOperationAction(ISD::SSUBO, VT, Custom);
2346 setOperationAction(ISD::USUBO, VT, Custom);
2347 setOperationAction(ISD::SMULO, VT, Custom);
2348 setOperationAction(ISD::UMULO, VT, Custom);
2349
2350 // Support carry in as value rather than glue.
2351 setOperationAction(ISD::UADDO_CARRY, VT, Custom);
2352 setOperationAction(ISD::USUBO_CARRY, VT, Custom);
2353 setOperationAction(ISD::SETCCCARRY, VT, Custom);
2354 setOperationAction(ISD::SADDO_CARRY, VT, Custom);
2355 setOperationAction(ISD::SSUBO_CARRY, VT, Custom);
2356 }
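// Editorial sketch, not part of the original X86ISelLowering.cpp: the kind of
// source construct that reaches the overflow nodes handled above. Clang
// lowers __builtin_add_overflow to llvm.sadd.with.overflow, SelectionDAG
// turns that into ISD::SADDO, and the custom lowering above maps it onto an
// add that reuses EFLAGS. Hypothetical lambda, illustration only.
auto editorSketchAddOverflows = [](int A, int B) {
  int Out;
  return __builtin_add_overflow(A, B, &Out); // becomes ISD::SADDO for i32
};
(void)editorSketchAddOverflows; // illustration only; never called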
2357
2358 if (!Subtarget.is64Bit()) {
2359 // These libcalls are not available in 32-bit.
2360 setLibcallName(RTLIB::SHL_I128, nullptr);
2361 setLibcallName(RTLIB::SRL_I128, nullptr);
2362 setLibcallName(RTLIB::SRA_I128, nullptr);
2363 setLibcallName(RTLIB::MUL_I128, nullptr);
2364 // The MULO libcall is not part of libgcc, only compiler-rt.
2365 setLibcallName(RTLIB::MULO_I64, nullptr);
2366 }
2367 // The MULO libcall is not part of libgcc, only compiler-rt.
2368 setLibcallName(RTLIB::MULO_I128, nullptr);
2369
2370 // Combine sin / cos into _sincos_stret if it is available.
2371 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
2372 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
2373 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
2374 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
2375 }
2376
2377 if (Subtarget.isTargetWin64()) {
2378 setOperationAction(ISD::SDIV, MVT::i128, Custom);
2379 setOperationAction(ISD::UDIV, MVT::i128, Custom);
2380 setOperationAction(ISD::SREM, MVT::i128, Custom);
2381 setOperationAction(ISD::UREM, MVT::i128, Custom);
2382 setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom);
2383 setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom);
2384 setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom);
2385 setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom);
2386 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i128, Custom);
2387 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i128, Custom);
2388 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i128, Custom);
2389 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i128, Custom);
2390 }
2391
2392 // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
2393 // is. We should promote the value to 64-bits to solve this.
2394 // This is what the CRT headers do - `fmodf` is an inline header
2395 // function casting to f64 and calling `fmod`.
2396 if (Subtarget.is32Bit() &&
2397 (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
2398 for (ISD::NodeType Op :
2399 {ISD::FCEIL, ISD::STRICT_FCEIL,
2400 ISD::FCOS, ISD::STRICT_FCOS,
2401 ISD::FEXP, ISD::STRICT_FEXP,
2402 ISD::FFLOOR, ISD::STRICT_FFLOOR,
2403 ISD::FREM, ISD::STRICT_FREM,
2404 ISD::FLOG, ISD::STRICT_FLOG,
2405 ISD::FLOG10, ISD::STRICT_FLOG10,
2406 ISD::FPOW, ISD::STRICT_FPOW,
2407 ISD::FSIN, ISD::STRICT_FSIN})
2408 if (isOperationExpand(Op, MVT::f32))
2409 setOperationAction(Op, MVT::f32, Promote);
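// A minimal sketch of the CRT-style promotion described in the comment above,
// using a hypothetical shim rather than the actual MSVC header contents:
//
//   static inline float fmodf_shim(float x, float y) {
//     // Widen to f64, use the double-precision libcall, round back to f32.
//     return (float)fmod((double)x, (double)y);
//   }
//
// Marking the f32 operation as Promote makes the legalizer perform the same
// f32 -> f64 -> f32 round trip, so only the f64 libcall is ever needed.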
2410
2411 // We have target-specific dag combine patterns for the following nodes:
2412 setTargetDAGCombine({ISD::VECTOR_SHUFFLE,
2413 ISD::SCALAR_TO_VECTOR,
2414 ISD::INSERT_VECTOR_ELT,
2415 ISD::EXTRACT_VECTOR_ELT,
2416 ISD::CONCAT_VECTORS,
2417 ISD::INSERT_SUBVECTOR,
2418 ISD::EXTRACT_SUBVECTOR,
2419 ISD::BITCAST,
2420 ISD::VSELECT,
2421 ISD::SELECT,
2422 ISD::SHL,
2423 ISD::SRA,
2424 ISD::SRL,
2425 ISD::OR,
2426 ISD::AND,
2427 ISD::ADD,
2428 ISD::FADD,
2429 ISD::FSUB,
2430 ISD::FNEG,
2431 ISD::FMA,
2432 ISD::STRICT_FMA,
2433 ISD::FMINNUM,
2434 ISD::FMAXNUM,
2435 ISD::SUB,
2436 ISD::LOAD,
2437 ISD::MLOAD,
2438 ISD::STORE,
2439 ISD::MSTORE,
2440 ISD::TRUNCATE,
2441 ISD::ZERO_EXTEND,
2442 ISD::ANY_EXTEND,
2443 ISD::SIGN_EXTEND,
2444 ISD::SIGN_EXTEND_INREG,
2445 ISD::ANY_EXTEND_VECTOR_INREG,
2446 ISD::SIGN_EXTEND_VECTOR_INREG,
2447 ISD::ZERO_EXTEND_VECTOR_INREG,
2448 ISD::SINT_TO_FP,
2449 ISD::UINT_TO_FP,
2450 ISD::STRICT_SINT_TO_FP,
2451 ISD::STRICT_UINT_TO_FP,
2452 ISD::SETCC,
2453 ISD::MUL,
2454 ISD::XOR,
2455 ISD::MSCATTER,
2456 ISD::MGATHER,
2457 ISD::FP16_TO_FP,
2458 ISD::FP_EXTEND,
2459 ISD::STRICT_FP_EXTEND,
2460 ISD::FP_ROUND,
2461 ISD::STRICT_FP_ROUND});
2462
2463 computeRegisterProperties(Subtarget.getRegisterInfo());
2464
2465 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
2466 MaxStoresPerMemsetOptSize = 8;
2467 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
2468 MaxStoresPerMemcpyOptSize = 4;
2469 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
2470 MaxStoresPerMemmoveOptSize = 4;
2471
2472 // TODO: These control memcmp expansion in CGP and could be raised higher, but
2473 // that needs to be benchmarked and balanced with the potential use of vector
2474 // load/store types (PR33329, PR33914).
2475 MaxLoadsPerMemcmp = 2;
2476 MaxLoadsPerMemcmpOptSize = 2;
2477
2478 // Default loop alignment, which can be overridden by -align-loops.
2479 setPrefLoopAlignment(Align(16));
2480
2481 // An out-of-order CPU can speculatively execute past a predictable branch,
2482 // but a conditional move could be stalled by an expensive earlier operation.
2483 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
2484 EnableExtLdPromotion = true;
2485 setPrefFunctionAlignment(Align(16));
2486
2487 verifyIntrinsicTables();
2488
2489 // Default to having -disable-strictnode-mutation on
2490 IsStrictFPEnabled = true;
2491}
2492
2493// This has so far only been implemented for 64-bit MachO.
2494bool X86TargetLowering::useLoadStackGuardNode() const {
2495 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
2496}
2497
2498bool X86TargetLowering::useStackGuardXorFP() const {
2499 // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
2500 return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO();
2501}
2502
2503SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
2504 const SDLoc &DL) const {
2505 EVT PtrTy = getPointerTy(DAG.getDataLayout());
2506 unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
2507 MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
2508 return SDValue(Node, 0);
2509}
2510
2511TargetLoweringBase::LegalizeTypeAction
2512X86TargetLowering::getPreferredVectorAction(MVT VT) const {
2513 if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() &&
2514 !Subtarget.hasBWI())
2515 return TypeSplitVector;
2516
2517 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2518 !Subtarget.hasF16C() && VT.getVectorElementType() == MVT::f16)
2519 return TypeSplitVector;
2520
2521 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2522 VT.getVectorElementType() != MVT::i1)
2523 return TypeWidenVector;
2524
2525 return TargetLoweringBase::getPreferredVectorAction(VT);
2526}
2527
2528static std::pair<MVT, unsigned>
2529handleMaskRegisterForCallingConv(unsigned NumElts, CallingConv::ID CC,
2530 const X86Subtarget &Subtarget) {
2531 // v2i1/v4i1/v8i1/v16i1 all pass in xmm registers unless the calling
2532 // convention is one that uses k registers.
2533 if (NumElts == 2)
2534 return {MVT::v2i64, 1};
2535 if (NumElts == 4)
2536 return {MVT::v4i32, 1};
2537 if (NumElts == 8 && CC != CallingConv::X86_RegCall &&
2538 CC != CallingConv::Intel_OCL_BI)
2539 return {MVT::v8i16, 1};
2540 if (NumElts == 16 && CC != CallingConv::X86_RegCall &&
2541 CC != CallingConv::Intel_OCL_BI)
2542 return {MVT::v16i8, 1};
2543 // v32i1 passes in ymm unless we have BWI and the calling convention is
2544 // regcall.
2545 if (NumElts == 32 && (!Subtarget.hasBWI() || CC != CallingConv::X86_RegCall))
2546 return {MVT::v32i8, 1};
2547 // Split v64i1 vectors if we don't have v64i8 available.
2548 if (NumElts == 64 && Subtarget.hasBWI() && CC != CallingConv::X86_RegCall) {
2549 if (Subtarget.useAVX512Regs())
2550 return {MVT::v64i8, 1};
2551 return {MVT::v32i8, 2};
2552 }
2553
2554 // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
2555 if (!isPowerOf2_32(NumElts) || (NumElts == 64 && !Subtarget.hasBWI()) ||
2556 NumElts > 64)
2557 return {MVT::i8, NumElts};
2558
2559 return {MVT::INVALID_SIMPLE_VALUE_TYPE, 0};
2560}
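// Illustrative results of the mapping above, assuming an AVX512 target and the
// default C calling convention (not regcall / Intel_OCL_BI):
//   v2i1  -> one v2i64, v4i1 -> one v4i32 (XMM)
//   v8i1  -> one v8i16, v16i1 -> one v16i8 (XMM)
//   v32i1 -> one v32i8 (YMM)
//   v64i1 -> one v64i8 with 512-bit registers enabled, two v32i8 with AVX512BW
//            but a 256-bit preference, and 64 i8 scalars without AVX512BW
//   v7i1  -> seven i8 scalars (non-power-of-two masks are scalarized)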
2561
2562MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
2563 CallingConv::ID CC,
2564 EVT VT) const {
2565 if (VT.isVector()) {
2566 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) {
2567 unsigned NumElts = VT.getVectorNumElements();
2568
2569 MVT RegisterVT;
2570 unsigned NumRegisters;
2571 std::tie(RegisterVT, NumRegisters) =
2572 handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
2573 if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
2574 return RegisterVT;
2575 }
2576
2577 if (VT.getVectorElementType() == MVT::f16 && VT.getVectorNumElements() < 8)
2578 return MVT::v8f16;
2579 }
2580
2581 // We will use more GPRs for f64 and f80 on 32 bits when x87 is disabled.
2582 if ((VT == MVT::f64 || VT == MVT::f80) && !Subtarget.is64Bit() &&
2583 !Subtarget.hasX87())
2584 return MVT::i32;
2585
2586 if (VT.isVector() && VT.getVectorElementType() == MVT::bf16)
2587 return getRegisterTypeForCallingConv(Context, CC,
2588 VT.changeVectorElementTypeToInteger());
2589
2590 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
2591}
2592
2593unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
2594 CallingConv::ID CC,
2595 EVT VT) const {
2596 if (VT.isVector()) {
2597 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) {
2598 unsigned NumElts = VT.getVectorNumElements();
2599
2600 MVT RegisterVT;
2601 unsigned NumRegisters;
2602 std::tie(RegisterVT, NumRegisters) =
2603 handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
2604 if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
2605 return NumRegisters;
2606 }
2607
2608 if (VT.getVectorElementType() == MVT::f16 && VT.getVectorNumElements() < 8)
2609 return 1;
2610 }
2611
2612 // We have to split f64 to 2 registers and f80 to 3 registers on 32 bits if
2613 // x87 is disabled.
2614 if (!Subtarget.is64Bit() && !Subtarget.hasX87()) {
2615 if (VT == MVT::f64)
2616 return 2;
2617 if (VT == MVT::f80)
2618 return 3;
2619 }
2620
2621 if (VT.isVector() && VT.getVectorElementType() == MVT::bf16)
2622 return getNumRegistersForCallingConv(Context, CC,
2623 VT.changeVectorElementTypeToInteger());
2624
2625 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
2626}
2627
2628unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv(
2629 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
2630 unsigned &NumIntermediates, MVT &RegisterVT) const {
2631 // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
2632 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
2633 Subtarget.hasAVX512() &&
2634 (!isPowerOf2_32(VT.getVectorNumElements()) ||
2635 (VT.getVectorNumElements() == 64 && !Subtarget.hasBWI()) ||
2636 VT.getVectorNumElements() > 64)) {
2637 RegisterVT = MVT::i8;
2638 IntermediateVT = MVT::i1;
2639 NumIntermediates = VT.getVectorNumElements();
2640 return NumIntermediates;
2641 }
2642
2643 // Split v64i1 vectors if we don't have v64i8 available.
2644 if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
2645 CC != CallingConv::X86_RegCall) {
2646 RegisterVT = MVT::v32i8;
2647 IntermediateVT = MVT::v32i1;
2648 NumIntermediates = 2;
2649 return 2;
2650 }
2651
2652 return TargetLowering::getVectorTypeBreakdownForCallingConv(Context, CC, VT, IntermediateVT,
2653 NumIntermediates, RegisterVT);
2654}
2655
2656EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
2657 LLVMContext& Context,
2658 EVT VT) const {
2659 if (!VT.isVector())
2660 return MVT::i8;
2661
2662 if (Subtarget.hasAVX512()) {
2663 // Figure out what this type will be legalized to.
2664 EVT LegalVT = VT;
2665 while (getTypeAction(Context, LegalVT) != TypeLegal)
2666 LegalVT = getTypeToTransformTo(Context, LegalVT);
2667
2668 // If we got a 512-bit vector then we'll definitely have a vXi1 compare.
2669 if (LegalVT.getSimpleVT().is512BitVector())
2670 return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount());
2671
2672 if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) {
2673 // If we legalized to less than a 512-bit vector, then we will use a vXi1
2674 // compare for vXi32/vXi64 for sure. If we have BWI we will also support
2675 // vXi16/vXi8.
2676 MVT EltVT = LegalVT.getSimpleVT().getVectorElementType();
2677 if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32)
2678 return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount());
2679 }
2680 }
2681
2682 return VT.changeVectorElementTypeToInteger();
2683}
2684
2685/// Helper for getByValTypeAlignment to determine
2686/// the desired ByVal argument alignment.
2687static void getMaxByValAlign(Type *Ty, Align &MaxAlign) {
2688 if (MaxAlign == 16)
2689 return;
2690 if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
2691 if (VTy->getPrimitiveSizeInBits().getFixedValue() == 128)
2692 MaxAlign = Align(16);
2693 } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
2694 Align EltAlign;
2695 getMaxByValAlign(ATy->getElementType(), EltAlign);
2696 if (EltAlign > MaxAlign)
2697 MaxAlign = EltAlign;
2698 } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
2699 for (auto *EltTy : STy->elements()) {
2700 Align EltAlign;
2701 getMaxByValAlign(EltTy, EltAlign);
2702 if (EltAlign > MaxAlign)
2703 MaxAlign = EltAlign;
2704 if (MaxAlign == 16)
2705 break;
2706 }
2707 }
2708}
2709
2710/// Return the desired alignment for ByVal aggregate
2711/// function arguments in the caller parameter area. For X86, aggregates
2712/// that contain SSE vectors are placed at 16-byte boundaries while the rest
2713/// are at 4-byte boundaries.
2714uint64_t X86TargetLowering::getByValTypeAlignment(Type *Ty,
2715 const DataLayout &DL) const {
2716 if (Subtarget.is64Bit()) {
2717 // Max of 8 and alignment of type.
2718 Align TyAlign = DL.getABITypeAlign(Ty);
2719 if (TyAlign > 8)
2720 return TyAlign.value();
2721 return 8;
2722 }
2723
2724 Align Alignment(4);
2725 if (Subtarget.hasSSE1())
2726 getMaxByValAlign(Ty, Alignment);
2727 return Alignment.value();
2728}
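// For example, on a 32-bit target with SSE enabled, a hypothetical aggregate
// such as
//   struct S { int I; __m128 V; };
// is reported as 16-byte aligned here because getMaxByValAlign finds the
// 128-bit vector member, while a struct of plain scalars stays at 4 bytes. On
// 64-bit targets the result is simply max(8, ABI alignment of the type).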
2729
2730/// It returns EVT::Other if the type should be determined using generic
2731/// target-independent logic.
2732/// For vector ops we check that the overall size isn't larger than our
2733/// preferred vector width.
2734EVT X86TargetLowering::getOptimalMemOpType(
2735 const MemOp &Op, const AttributeList &FuncAttributes) const {
2736 if (!FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat)) {
2737 if (Op.size() >= 16 &&
2738 (!Subtarget.isUnalignedMem16Slow() || Op.isAligned(Align(16)))) {
2739 // FIXME: Check if unaligned 64-byte accesses are slow.
2740 if (Op.size() >= 64 && Subtarget.hasAVX512() &&
2741 (Subtarget.getPreferVectorWidth() >= 512)) {
2742 return Subtarget.hasBWI() ? MVT::v64i8 : MVT::v16i32;
2743 }
2744 // FIXME: Check if unaligned 32-byte accesses are slow.
2745 if (Op.size() >= 32 && Subtarget.hasAVX() &&
2746 Subtarget.useLight256BitInstructions()) {
2747 // Although this isn't a well-supported type for AVX1, we'll let
2748 // legalization and shuffle lowering produce the optimal codegen. If we
2749 // choose an optimal type with a vector element larger than a byte,
2750 // getMemsetStores() may create an intermediate splat (using an integer
2751 // multiply) before we splat as a vector.
2752 return MVT::v32i8;
2753 }
2754 if (Subtarget.hasSSE2() && (Subtarget.getPreferVectorWidth() >= 128))
2755 return MVT::v16i8;
2756 // TODO: Can SSE1 handle a byte vector?
2757 // If we have SSE1 registers we should be able to use them.
2758 if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()) &&
2759 (Subtarget.getPreferVectorWidth() >= 128))
2760 return MVT::v4f32;
2761 } else if (((Op.isMemcpy() && !Op.isMemcpyStrSrc()) || Op.isZeroMemset()) &&
2762 Op.size() >= 8 && !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
2763 // Do not use f64 to lower memcpy if source is string constant. It's
2764 // better to use i32 to avoid the loads.
2765 // Also, do not use f64 to lower memset unless this is a memset of zeros.
2766 // The gymnastics of splatting a byte value into an XMM register and then
2767 // only using 8-byte stores (because this is a CPU with slow unaligned
2768 // 16-byte accesses) makes that a loser.
2769 return MVT::f64;
2770 }
2771 }
2772 // This is a compromise. If we reach here, unaligned accesses may be slow on
2773 // this target. However, creating smaller, aligned accesses could be even
2774 // slower and would certainly be a lot more code.
2775 if (Subtarget.is64Bit() && Op.size() >= 8)
2776 return MVT::i64;
2777 return MVT::i32;
2778}
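// Rough examples of the choices above (assuming no NoImplicitFloat and fast
// unaligned accesses): a 64-byte memset with AVX-512 and a 512-bit preference
// uses v64i8 (v16i32 without AVX512BW), a 32-byte copy with AVX uses v32i8, a
// 16-byte copy with SSE2 uses v16i8, and smaller or more constrained cases
// fall back to i64 (64-bit targets) or i32 GPR stores.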
2779
2780bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
2781 if (VT == MVT::f32)
2782 return Subtarget.hasSSE1();
2783 if (VT == MVT::f64)
2784 return Subtarget.hasSSE2();
2785 return true;
2786}
2787
2788static bool isBitAligned(Align Alignment, uint64_t SizeInBits) {
2789 return (8 * Alignment.value()) % SizeInBits == 0;
2790}
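// For example, isBitAligned(Align(16), 128) is true (16 bytes == 128 bits),
// while isBitAligned(Align(8), 128) is false (64 % 128 != 0), so a 128-bit
// access at 8-byte alignment falls through to the unaligned-speed checks in
// isMemoryAccessFast below.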
2791
2792bool X86TargetLowering::isMemoryAccessFast(EVT VT, Align Alignment) const {
2793 if (isBitAligned(Alignment, VT.getSizeInBits()))
2794 return true;
2795 switch (VT.getSizeInBits()) {
2796 default:
2797 // 8-byte and under are always assumed to be fast.
2798 return true;
2799 case 128:
2800 return !Subtarget.isUnalignedMem16Slow();
2801 case 256:
2802 return !Subtarget.isUnalignedMem32Slow();
2803 // TODO: What about AVX-512 (512-bit) accesses?
2804 }
2805}
2806
2807bool X86TargetLowering::allowsMisalignedMemoryAccesses(
2808 EVT VT, unsigned, Align Alignment, MachineMemOperand::Flags Flags,
2809 unsigned *Fast) const {
2810 if (Fast)
2811 *Fast = isMemoryAccessFast(VT, Alignment);
2812 // NonTemporal vector memory ops must be aligned.
2813 if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
2814 // NT loads can only be vector aligned, so if it's less aligned than the
2815 // minimum vector size (which we can split the vector down to), we might as
2816 // well use a regular unaligned vector load.
2817 // We don't have any NT loads pre-SSE41.
2818 if (!!(Flags & MachineMemOperand::MOLoad))
2819 return (Alignment < 16 || !Subtarget.hasSSE41());
2820 return false;
2821 }
2822 // Misaligned accesses of any size are always allowed.
2823 return true;
2824}
2825
2826bool X86TargetLowering::allowsMemoryAccess(LLVMContext &Context,
2827 const DataLayout &DL, EVT VT,
2828 unsigned AddrSpace, Align Alignment,
2829 MachineMemOperand::Flags Flags,
2830 unsigned *Fast) const {
2831 if (Fast)
2832 *Fast = isMemoryAccessFast(VT, Alignment);
2833 if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
2834 if (allowsMisalignedMemoryAccesses(VT, AddrSpace, Alignment, Flags,
2835 /*Fast=*/nullptr))
2836 return true;
2837 // NonTemporal vector memory ops are special, and must be aligned.
2838 if (!isBitAligned(Alignment, VT.getSizeInBits()))
2839 return false;
2840 switch (VT.getSizeInBits()) {
2841 case 128:
2842 if (!!(Flags & MachineMemOperand::MOLoad) && Subtarget.hasSSE41())
2843 return true;
2844 if (!!(Flags & MachineMemOperand::MOStore) && Subtarget.hasSSE2())
2845 return true;
2846 return false;
2847 case 256:
2848 if (!!(Flags & MachineMemOperand::MOLoad) && Subtarget.hasAVX2())
2849 return true;
2850 if (!!(Flags & MachineMemOperand::MOStore) && Subtarget.hasAVX())
2851 return true;
2852 return false;
2853 case 512:
2854 if (Subtarget.hasAVX512())
2855 return true;
2856 return false;
2857 default:
2858 return false; // Don't have NonTemporal vector memory ops of this size.
2859 }
2860 }
2861 return true;
2862}
2863
2864/// Return the entry encoding for a jump table in the
2865/// current function. The returned value is a member of the
2866/// MachineJumpTableInfo::JTEntryKind enum.
2867unsigned X86TargetLowering::getJumpTableEncoding() const {
2868 // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
2869 // symbol.
2870 if (isPositionIndependent() && Subtarget.isPICStyleGOT())
2871 return MachineJumpTableInfo::EK_Custom32;
2872
2873 // Otherwise, use the normal jump table encoding heuristics.
2874 return TargetLowering::getJumpTableEncoding();
2875}
2876
2877bool X86TargetLowering::splitValueIntoRegisterParts(
2878 SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
2879 unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const {
2880 bool IsABIRegCopy = CC.has_value();
2881 EVT ValueVT = Val.getValueType();
2882 if (IsABIRegCopy && ValueVT == MVT::bf16 && PartVT == MVT::f32) {
2883 unsigned ValueBits = ValueVT.getSizeInBits();
2884 unsigned PartBits = PartVT.getSizeInBits();
2885 Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(ValueBits), Val);
2886 Val = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::getIntegerVT(PartBits), Val);
2887 Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val);
2888 Parts[0] = Val;
2889 return true;
2890 }
2891 return false;
2892}
2893
2894SDValue X86TargetLowering::joinRegisterPartsIntoValue(
2895 SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts,
2896 MVT PartVT, EVT ValueVT, std::optional<CallingConv::ID> CC) const {
2897 bool IsABIRegCopy = CC.has_value();
2898 if (IsABIRegCopy && ValueVT == MVT::bf16 && PartVT == MVT::f32) {
2899 unsigned ValueBits = ValueVT.getSizeInBits();
2900 unsigned PartBits = PartVT.getSizeInBits();
2901 SDValue Val = Parts[0];
2902
2903 Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(PartBits), Val);
2904 Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::getIntegerVT(ValueBits), Val);
2905 Val = DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
2906 return Val;
2907 }
2908 return SDValue();
2909}
2910
2911bool X86TargetLowering::useSoftFloat() const {
2912 return Subtarget.useSoftFloat();
2913}
2914
2915void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
2916 ArgListTy &Args) const {
2917
2918 // Only relabel X86-32 for C / Stdcall CCs.
2919 if (Subtarget.is64Bit())
2920 return;
2921 if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
2922 return;
2923 unsigned ParamRegs = 0;
2924 if (auto *M = MF->getFunction().getParent())
2925 ParamRegs = M->getNumberRegisterParameters();
2926
2927 // Mark the first N integer arguments as being passed in registers.
2928 for (auto &Arg : Args) {
2929 Type *T = Arg.Ty;
2930 if (T->isIntOrPtrTy())
2931 if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
2932 unsigned numRegs = 1;
2933 if (MF->getDataLayout().getTypeAllocSize(T) > 4)
2934 numRegs = 2;
2935 if (ParamRegs < numRegs)
2936 return;
2937 ParamRegs -= numRegs;
2938 Arg.IsInReg = true;
2939 }
2940 }
2941}
2942
2943const MCExpr *
2944X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
2945 const MachineBasicBlock *MBB,
2946 unsigned uid,MCContext &Ctx) const{
2947 assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
2948 // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
2949 // entries.
2950 return MCSymbolRefExpr::create(MBB->getSymbol(),
2951 MCSymbolRefExpr::VK_GOTOFF, Ctx);
2952}
2953
2954/// Returns relocation base for the given PIC jumptable.
2955SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
2956 SelectionDAG &DAG) const {
2957 if (!Subtarget.is64Bit())
2958 // This doesn't have SDLoc associated with it, but is not really the
2959 // same as a Register.
2960 return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
2961 getPointerTy(DAG.getDataLayout()));
2962 return Table;
2963}
2964
2965/// This returns the relocation base for the given PIC jumptable,
2966/// the same as getPICJumpTableRelocBase, but as an MCExpr.
2967const MCExpr *X86TargetLowering::
2968getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
2969 MCContext &Ctx) const {
2970 // X86-64 uses RIP relative addressing based on the jump table label.
2971 if (Subtarget.isPICStyleRIPRel())
2972 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
2973
2974 // Otherwise, the reference is relative to the PIC base.
2975 return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
2976}
2977
2978std::pair<const TargetRegisterClass *, uint8_t>
2979X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
2980 MVT VT) const {
2981 const TargetRegisterClass *RRC = nullptr;
2982 uint8_t Cost = 1;
2983 switch (VT.SimpleTy) {
2984 default:
2985 return TargetLowering::findRepresentativeClass(TRI, VT);
2986 case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
2987 RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
2988 break;
2989 case MVT::x86mmx:
2990 RRC = &X86::VR64RegClass;
2991 break;
2992 case MVT::f32: case MVT::f64:
2993 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
2994 case MVT::v4f32: case MVT::v2f64:
2995 case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
2996 case MVT::v8f32: case MVT::v4f64:
2997 case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
2998 case MVT::v16f32: case MVT::v8f64:
2999 RRC = &X86::VR128XRegClass;
3000 break;
3001 }
3002 return std::make_pair(RRC, Cost);
3003}
3004
3005unsigned X86TargetLowering::getAddressSpace() const {
3006 if (Subtarget.is64Bit())
3007 return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
3008 return 256;
3009}
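// These values are the pseudo address spaces X86 uses for segment-relative
// addressing (256 == %gs, 257 == %fs): 64-bit user code uses %fs, while the
// kernel code model and 32-bit code use %gs. They correspond to the X86AS::FS
// and X86AS::GS constants used in getIRStackGuard below.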
3010
3011static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
3012 return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
3013 (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
3014}
3015
3016static Constant* SegmentOffset(IRBuilderBase &IRB,
3017 int Offset, unsigned AddressSpace) {
3018 return ConstantExpr::getIntToPtr(
3019 ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
3020 Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
3021}
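// For example, SegmentOffset(IRB, 0x28, 257 /* %fs */) produces roughly
//   inttoptr (i32 40 to <pointer in addrspace(257)>)
// and a load through that constant is selected as a %fs:0x28 access, which is
// how the glibc stack-guard slot is read on x86-64 (see getIRStackGuard below).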
3022
3023Value *X86TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const {
3024 // glibc, bionic, and Fuchsia have a special slot for the stack guard in
3025 // tcbhead_t; use it instead of the usual global variable (see
3026 // sysdeps/{i386,x86_64}/nptl/tls.h)
3027 if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
3028 if (Subtarget.isTargetFuchsia()) {
3029 // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
3030 return SegmentOffset(IRB, 0x10, getAddressSpace());
3031 } else {
3032 unsigned AddressSpace = getAddressSpace();
3033 Module *M = IRB.GetInsertBlock()->getParent()->getParent();
3034 // Note that some users may customize the base register and offset.
3035 int Offset = M->getStackProtectorGuardOffset();
3036 // If we don't set -stack-protector-guard-offset value:
3037 // %fs:0x28, unless we're using a Kernel code model, in which case
3038 // it's %gs:0x28. gs:0x14 on i386.
3039 if (Offset == INT_MAX)
3040 Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
3041
3042 StringRef GuardReg = M->getStackProtectorGuardReg();
3043 if (GuardReg == "fs")
3044 AddressSpace = X86AS::FS;
3045 else if (GuardReg == "gs")
3046 AddressSpace = X86AS::GS;
3047
3048 // Use the symbol guard if the user specified one.
3049 StringRef GuardSymb = M->getStackProtectorGuardSymbol();
3050 if (!GuardSymb.empty()) {
3051 GlobalVariable *GV = M->getGlobalVariable(GuardSymb);
3052 if (!GV) {
3053 Type *Ty = Subtarget.is64Bit() ? Type::getInt64Ty(M->getContext())
3054 : Type::getInt32Ty(M->getContext());
3055 GV = new GlobalVariable(*M, Ty, false, GlobalValue::ExternalLinkage,
3056 nullptr, GuardSymb, nullptr,
3057 GlobalValue::NotThreadLocal, AddressSpace);
3058 }
3059 return GV;
3060 }
3061
3062 return SegmentOffset(IRB, Offset, AddressSpace);
3063 }
3064 }
3065 return TargetLowering::getIRStackGuard(IRB);
3066}
3067
3068void X86TargetLowering::insertSSPDeclarations(Module &M) const {
3069 // MSVC CRT provides functionalities for stack protection.
3070 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
3071 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
3072 // MSVC CRT has a global variable holding security cookie.
3073 M.getOrInsertGlobal("__security_cookie",
3074 Type::getInt8PtrTy(M.getContext()));
3075
3076 // MSVC CRT has a function to validate security cookie.
3077 FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
3078 "__security_check_cookie", Type::getVoidTy(M.getContext()),
3079 Type::getInt8PtrTy(M.getContext()));
3080 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
3081 F->setCallingConv(CallingConv::X86_FastCall);
3082 F->addParamAttr(0, Attribute::AttrKind::InReg);
3083 }
3084 return;
3085 }
3086
3087 StringRef GuardMode = M.getStackProtectorGuard();
3088
3089 // glibc, bionic, and Fuchsia have a special slot for the stack guard.
3090 if ((GuardMode == "tls" || GuardMode.empty()) &&
3091 hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
3092 return;
3093 TargetLowering::insertSSPDeclarations(M);
3094}
3095
3096Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
3097 // MSVC CRT has a global variable holding security cookie.
3098 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
3099 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
3100 return M.getGlobalVariable("__security_cookie");
3101 }
3102 return TargetLowering::getSDagStackGuard(M);
3103}
3104
3105Function *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
3106 // MSVC CRT has a function to validate security cookie.
3107 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
3108 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
3109 return M.getFunction("__security_check_cookie");
3110 }
3111 return TargetLowering::getSSPStackGuardCheck(M);
3112}
3113
3114Value *
3115X86TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
3116 if (Subtarget.getTargetTriple().isOSContiki())
3117 return getDefaultSafeStackPointerLocation(IRB, false);
3118
3119 // Android provides a fixed TLS slot for the SafeStack pointer. See the
3120 // definition of TLS_SLOT_SAFESTACK in
3121 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
3122 if (Subtarget.isTargetAndroid()) {
3123 // %fs:0x48, unless we're using a Kernel code model, in which case it's
3124 // %gs:0x48; %gs:0x24 on i386.
3125 int Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
3126 return SegmentOffset(IRB, Offset, getAddressSpace());
3127 }
3128
3129 // Fuchsia is similar.
3130 if (Subtarget.isTargetFuchsia()) {
3131 // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
3132 return SegmentOffset(IRB, 0x18, getAddressSpace());
3133 }
3134
3135 return TargetLowering::getSafeStackPointerLocation(IRB);
3136}
3137
3138//===----------------------------------------------------------------------===//
3139// Return Value Calling Convention Implementation
3140//===----------------------------------------------------------------------===//
3141
3142bool X86TargetLowering::CanLowerReturn(
3143 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
3144 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
3145 SmallVector<CCValAssign, 16> RVLocs;
3146 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
3147 return CCInfo.CheckReturn(Outs, RetCC_X86);
3148}
3149
3150const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
3151 static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
3152 return ScratchRegs;
3153}
3154
3155ArrayRef<MCPhysReg> X86TargetLowering::getRoundingControlRegisters() const {
3156 // FIXME: We should def X86::FPCW for x87 as well. But it affects a lot of lit
3157 // tests at the moment, which is not what we expected.
3158 static const MCPhysReg RCRegs[] = {X86::MXCSR};
3159 return RCRegs;
3160}
3161
3162 /// Lowers mask values (v*i1) to the local register values.
3163 /// \returns the DAG node after lowering to the register type.
3164static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
3165 const SDLoc &Dl, SelectionDAG &DAG) {
3166 EVT ValVT = ValArg.getValueType();
3167
3168 if (ValVT == MVT::v1i1)
3169 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Dl, ValLoc, ValArg,
3170 DAG.getIntPtrConstant(0, Dl));
3171
3172 if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
3173 (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
3174 // Two stage lowering might be required
3175 // bitcast: v8i1 -> i8 / v16i1 -> i16
3176 // anyextend: i8 -> i32 / i16 -> i32
3177 EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
3178 SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
3179 if (ValLoc == MVT::i32)
3180 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
3181 return ValToCopy;
3182 }
3183
3184 if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
3185 (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
3186 // One stage lowering is required
3187 // bitcast: v32i1 -> i32 / v64i1 -> i64
3188 return DAG.getBitcast(ValLoc, ValArg);
3189 }
3190
3191 return DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValArg);
3192}
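// Concrete example of the two-stage path above (DAG-level, shown in IR-like
// notation): returning a v16i1 mask in an i32 location becomes
//   %m = bitcast <16 x i1> %mask to i16
//   %r = anyext i16 %m to i32
// whereas v32i1 -> i32 and v64i1 -> i64 are lowered with a single bitcast.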
3193
3194 /// Breaks a v64i1 value into two registers and adds the new node to the DAG
3195static void Passv64i1ArgInRegs(
3196 const SDLoc &Dl, SelectionDAG &DAG, SDValue &Arg,
3197 SmallVectorImpl<std::pair<Register, SDValue>> &RegsToPass, CCValAssign &VA,
3198 CCValAssign &NextVA, const X86Subtarget &Subtarget) {
3199 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
3200 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
3201 assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
3202 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
3203        "The value should reside in two registers");
3204
3205 // Before splitting the value we cast it to i64
3206 Arg = DAG.getBitcast(MVT::i64, Arg);
3207
3208 // Splitting the value into two i32 types
3209 SDValue Lo, Hi;
3210 std::tie(Lo, Hi) = DAG.SplitScalar(Arg, Dl, MVT::i32, MVT::i32);
3211
3212 // Attach the two i32 types into corresponding registers
3213 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
3214 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
3215}
3216
3217SDValue
3218X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
3219 bool isVarArg,
3220 const SmallVectorImpl<ISD::OutputArg> &Outs,
3221 const SmallVectorImpl<SDValue> &OutVals,
3222 const SDLoc &dl, SelectionDAG &DAG) const {
3223 MachineFunction &MF = DAG.getMachineFunction();
3224 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
3225
3226 // In some cases we need to disable registers from the default CSR list.
3227 // For example, when they are used as return registers (preserve_* and X86's
3228 // regcall) or for argument passing (X86's regcall).
3229 bool ShouldDisableCalleeSavedRegister =
3230 shouldDisableRetRegFromCSR(CallConv) ||
3231 MF.getFunction().hasFnAttribute("no_caller_saved_registers");
3232
3233 if (CallConv == CallingConv::X86_INTR && !Outs.empty())
3234 report_fatal_error("X86 interrupts may not return any value");
3235
3236 SmallVector<CCValAssign, 16> RVLocs;
3237 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
3238 CCInfo.AnalyzeReturn(Outs, RetCC_X86);
3239
3240 SmallVector<std::pair<Register, SDValue>, 4> RetVals;
3241 for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
3242 ++I, ++OutsIndex) {
3243 CCValAssign &VA = RVLocs[I];
3244 assert(VA.isRegLoc() && "Can only return in registers!");
3245
3246 // Add the register to the CalleeSaveDisableRegs list.
3247 if (ShouldDisableCalleeSavedRegister)
3248 MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());
3249
3250 SDValue ValToCopy = OutVals[OutsIndex];
3251 EVT ValVT = ValToCopy.getValueType();
3252
3253 // Promote values to the appropriate types.
3254 if (VA.getLocInfo() == CCValAssign::SExt)
3255 ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
3256 else if (VA.getLocInfo() == CCValAssign::ZExt)
3257 ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
3258 else if (VA.getLocInfo() == CCValAssign::AExt) {
3259 if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
3260 ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
3261 else
3262 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
3263 }
3264 else if (VA.getLocInfo() == CCValAssign::BCvt)
3265 ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
3266
3267 assert(VA.getLocInfo() != CCValAssign::FPExt &&
3268        "Unexpected FP-extend for return value.");
3269
3270 // Report an error if we have attempted to return a value via an XMM
3271 // register and SSE was disabled.
3272 if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
3273 errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
3274 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
3275 } else if (!Subtarget.hasSSE2() &&
3276 X86::FR64XRegClass.contains(VA.getLocReg()) &&
3277 ValVT == MVT::f64) {
3278 // When returning a double via an XMM register, report an error if SSE2 is
3279 // not enabled.
3280 errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
3281 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
3282 }
3283
3284 // Returns in ST0/ST1 are handled specially: these are pushed as operands to
3285 // the RET instruction and handled by the FP Stackifier.
3286 if (VA.getLocReg() == X86::FP0 ||
3287 VA.getLocReg() == X86::FP1) {
3288 // If this is a copy from an xmm register to ST(0), use an FPExtend to
3289 // change the value to the FP stack register class.
3290 if (isScalarFPTypeInSSEReg(VA.getValVT()))
3291 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
3292 RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
3293 // Don't emit a copytoreg.
3294 continue;
3295 }
3296
3297 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
3298 // which is returned in RAX / RDX.
3299 if (Subtarget.is64Bit()) {
3300 if (ValVT == MVT::x86mmx) {
3301 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
3302 ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
3303 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
3304 ValToCopy);
3305 // If we don't have SSE2 available, convert to v4f32 so the generated
3306 // register is legal.
3307 if (!Subtarget.hasSSE2())
3308 ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
3309 }
3310 }
3311 }
3312
3313 if (VA.needsCustom()) {
3314 assert(VA.getValVT() == MVT::v64i1 &&
3315        "Currently the only custom case is when we split v64i1 to 2 regs");
3316
3317 Passv64i1ArgInRegs(dl, DAG, ValToCopy, RetVals, VA, RVLocs[++I],
3318 Subtarget);
3319
3320 // Add the second register to the CalleeSaveDisableRegs list.
3321 if (ShouldDisableCalleeSavedRegister)
3322 MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
3323 } else {
3324 RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
3325 }
3326 }
3327
3328 SDValue Glue;
3329 SmallVector<SDValue, 6> RetOps;
3330 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3331 // Operand #1 = Bytes To Pop
3332 RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
3333 MVT::i32));
3334
3335 // Copy the result values into the output registers.
3336 for (auto &RetVal : RetVals) {
3337 if (RetVal.first == X86::FP0 || RetVal.first == X86::FP1) {
3338 RetOps.push_back(RetVal.second);
3339 continue; // Don't emit a copytoreg.
3340 }
3341
3342 Chain = DAG.getCopyToReg(Chain, dl, RetVal.first, RetVal.second, Glue);
3343 Glue = Chain.getValue(1);
3344 RetOps.push_back(
3345 DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
3346 }
3347
3348 // Swift calling convention does not require we copy the sret argument
3349 // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
3350
3351 // All x86 ABIs require that for returning structs by value we copy
3352 // the sret argument into %rax/%eax (depending on ABI) for the return.
3353 // We saved the argument into a virtual register in the entry block,
3354 // so now we copy the value out and into %rax/%eax.
3355 //
3356 // Checking Function.hasStructRetAttr() here is insufficient because the IR
3357 // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
3358 // false, then an sret argument may be implicitly inserted in the SelDAG. In
3359 // either case FuncInfo->setSRetReturnReg() will have been called.
3360 if (Register SRetReg = FuncInfo->getSRetReturnReg()) {
3361 // When we have both sret and another return value, we should use the
3362 // original Chain stored in RetOps[0], instead of the current Chain updated
3363 // in the above loop. If we only have sret, RetOps[0] equals to Chain.
3364
3365 // For the case of sret and another return value, we have
3366 // Chain_0 at the function entry
3367 // Chain_1 = getCopyToReg(Chain_0) in the above loop
3368 // If we use Chain_1 in getCopyFromReg, we will have
3369 // Val = getCopyFromReg(Chain_1)
3370 // Chain_2 = getCopyToReg(Chain_1, Val) from below
3371
3372 // getCopyToReg(Chain_0) will be glued together with
3373 // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
3374 // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
3375 // Data dependency from Unit B to Unit A due to usage of Val in
3376 // getCopyToReg(Chain_1, Val)
3377 // Chain dependency from Unit A to Unit B
3378
3379 // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
3380 SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
3381 getPointerTy(MF.getDataLayout()));
3382
3383 Register RetValReg
3384 = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
3385 X86::RAX : X86::EAX;
3386 Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Glue);
3387 Glue = Chain.getValue(1);
3388
3389 // RAX/EAX now acts like a return value.
3390 RetOps.push_back(
3391 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
3392
3393 // Add the returned register to the CalleeSaveDisableRegs list. Don't do
3394 // this however for preserve_most/preserve_all to minimize the number of
3395 // callee-saved registers for these CCs.
3396 if (ShouldDisableCalleeSavedRegister &&
3397 CallConv != CallingConv::PreserveAll &&
3398 CallConv != CallingConv::PreserveMost)
3399 MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
3400 }
3401
3402 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
3403 const MCPhysReg *I =
3404 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3405 if (I) {
3406 for (; *I; ++I) {
3407 if (X86::GR64RegClass.contains(*I))
3408 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3409 else
3410 llvm_unreachable("Unexpected register class in CSRsViaCopy!")::llvm::llvm_unreachable_internal("Unexpected register class in CSRsViaCopy!"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 3410)
;
3411 }
3412 }
3413
3414 RetOps[0] = Chain; // Update chain.
3415
3416 // Add the glue if we have it.
3417 if (Glue.getNode())
3418 RetOps.push_back(Glue);
3419
3420 X86ISD::NodeType opcode = X86ISD::RET_GLUE;
3421 if (CallConv == CallingConv::X86_INTR)
3422 opcode = X86ISD::IRET;
3423 return DAG.getNode(opcode, dl, MVT::Other, RetOps);
3424}
3425
3426bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
3427 if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
3428 return false;
3429
3430 SDValue TCChain = Chain;
3431 SDNode *Copy = *N->use_begin();
3432 if (Copy->getOpcode() == ISD::CopyToReg) {
3433 // If the copy has a glue operand, we conservatively assume it isn't safe to
3434 // perform a tail call.
3435 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
3436 return false;
3437 TCChain = Copy->getOperand(0);
3438 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
3439 return false;
3440
3441 bool HasRet = false;
3442 for (const SDNode *U : Copy->uses()) {
3443 if (U->getOpcode() != X86ISD::RET_GLUE)
3444 return false;
3445 // If we are returning more than one value, we can definitely
3446 // not make a tail call; see PR19530.
3447 if (U->getNumOperands() > 4)
3448 return false;
3449 if (U->getNumOperands() == 4 &&
3450 U->getOperand(U->getNumOperands() - 1).getValueType() != MVT::Glue)
3451 return false;
3452 HasRet = true;
3453 }
3454
3455 if (!HasRet)
3456 return false;
3457
3458 Chain = TCChain;
3459 return true;
3460}
3461
3462EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
3463 ISD::NodeType ExtendKind) const {
3464 MVT ReturnMVT = MVT::i32;
3465
3466 bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
3467 if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
3468 // The ABI does not require i1, i8 or i16 to be extended.
3469 //
3470 // On Darwin, there is code in the wild relying on Clang's old behaviour of
3471 // always extending i8/i16 return values, so keep doing that for now.
3472 // (PR26665).
3473 ReturnMVT = MVT::i8;
3474 }
3475
3476 EVT MinVT = getRegisterType(Context, ReturnMVT);
3477 return VT.bitsLT(MinVT) ? MinVT : VT;
3478}
3479
3480/// Reads two 32 bit registers and creates a 64 bit mask value.
3481 /// \param VA The current 32 bit value that needs to be assigned.
3482 /// \param NextVA The next 32 bit value that needs to be assigned.
3483 /// \param Root The parent DAG node.
3484 /// \param [in,out] InGlue Represents the SDValue in the parent DAG node used
3485 ///                        for glue purposes. If the DAG is already using a
3486 ///                        physical register instead of a virtual one, we
3487 ///                        should glue our new SDValue to the InGlue SDValue.
3488 /// \return a new SDValue that is 64 bits wide.
3489static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
3490 SDValue &Root, SelectionDAG &DAG,
3491 const SDLoc &Dl, const X86Subtarget &Subtarget,
3492 SDValue *InGlue = nullptr) {
3493 assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
3494 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
3495 assert(VA.getValVT() == MVT::v64i1 &&
3496        "Expecting first location of 64 bit width type");
3497 assert(NextVA.getValVT() == VA.getValVT() &&
3498        "The locations should have the same type");
3499 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
3500        "The values should reside in two registers");
3501
3502 SDValue Lo, Hi;
3503 SDValue ArgValueLo, ArgValueHi;
3504
3505 MachineFunction &MF = DAG.getMachineFunction();
3506 const TargetRegisterClass *RC = &X86::GR32RegClass;
3507
3508 // Read a 32 bit value from the registers.
3509 if (nullptr == InGlue) {
3510 // When no physical register is present,
3511 // create an intermediate virtual register.
3512 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
3513 ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
3514 Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
3515 ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
3516 } else {
3517 // When a physical register is available read the value from it and glue
3518 // the reads together.
3519 ArgValueLo =
3520 DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InGlue);
3521 *InGlue = ArgValueLo.getValue(2);
3522 ArgValueHi =
3523 DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InGlue);
3524 *InGlue = ArgValueHi.getValue(2);
3525 }
3526
3527 // Convert the i32 type into v32i1 type.
3528 Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);
3529
3530 // Convert the i32 type into v32i1 type.
3531 Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
3532
3533 // Concatenate the two values together.
3534 return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
3535}
3536
3537/// The function will lower a register of various sizes (8/16/32/64)
3538/// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1)
3539 /// \returns a DAG node containing the operand after lowering to the mask type.
3540static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
3541 const EVT &ValLoc, const SDLoc &Dl,
3542 SelectionDAG &DAG) {
3543 SDValue ValReturned = ValArg;
3544
3545 if (ValVT == MVT::v1i1)
3546 return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned);
3547
3548 if (ValVT == MVT::v64i1) {
3549 // On 32-bit targets, this case is handled by getv64i1Argument.
3550 assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
3551 // On 64-bit targets there is no need to truncate the value, only bitcast it.
3552 } else {
3553 MVT maskLen;
3554 switch (ValVT.getSimpleVT().SimpleTy) {
3555 case MVT::v8i1:
3556 maskLen = MVT::i8;
3557 break;
3558 case MVT::v16i1:
3559 maskLen = MVT::i16;
3560 break;
3561 case MVT::v32i1:
3562 maskLen = MVT::i32;
3563 break;
3564 default:
3565 llvm_unreachable("Expecting a vector of i1 types")::llvm::llvm_unreachable_internal("Expecting a vector of i1 types"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 3565)
;
3566 }
3567
3568 ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
3569 }
3570 return DAG.getBitcast(ValVT, ValReturned);
3571}
3572
3573/// Lower the result values of a call into the
3574 /// appropriate copies out of the corresponding physical registers.
3575///
3576SDValue X86TargetLowering::LowerCallResult(
3577 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
3578 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
3579 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
3580 uint32_t *RegMask) const {
3581
3582 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
3583 // Assign locations to each value returned by this call.
3584 SmallVector<CCValAssign, 16> RVLocs;
3585 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3586 *DAG.getContext());
3587 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
3588
3589 // Copy all of the result registers out of their specified physreg.
3590 for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
3591 ++I, ++InsIndex) {
3592 CCValAssign &VA = RVLocs[I];
3593 EVT CopyVT = VA.getLocVT();
3594
3595 // In some calling conventions we need to remove the used registers
3596 // from the register mask.
3597 if (RegMask) {
3598 for (MCPhysReg SubReg : TRI->subregs_inclusive(VA.getLocReg()))
3599 RegMask[SubReg / 32] &= ~(1u << (SubReg % 32));
3600 }
3601
3602 // Report an error if there was an attempt to return FP values via XMM
3603 // registers.
3604 if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
3605 errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
3606 if (VA.getLocReg() == X86::XMM1)
3607 VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
3608 else
3609 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
3610 } else if (!Subtarget.hasSSE2() &&
3611 X86::FR64XRegClass.contains(VA.getLocReg()) &&
3612 CopyVT == MVT::f64) {
3613 errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
3614 if (VA.getLocReg() == X86::XMM1)
3615 VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
3616 else
3617 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
3618 }
3619
3620 // If we prefer to use the value in xmm registers, copy it out as f80 and
3621 // use a truncate to move it from fp stack reg to xmm reg.
3622 bool RoundAfterCopy = false;
3623 if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
3624 isScalarFPTypeInSSEReg(VA.getValVT())) {
3625 if (!Subtarget.hasX87())
3626 report_fatal_error("X87 register return with X87 disabled");
3627 CopyVT = MVT::f80;
3628 RoundAfterCopy = (CopyVT != VA.getLocVT());
3629 }
3630
3631 SDValue Val;
3632 if (VA.needsCustom()) {
3633 assert(VA.getValVT() == MVT::v64i1 &&
3634        "Currently the only custom case is when we split v64i1 to 2 regs");
3635 Val =
3636 getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InGlue);
3637 } else {
3638 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InGlue)
3639 .getValue(1);
3640 Val = Chain.getValue(0);
3641 InGlue = Chain.getValue(2);
3642 }
3643
3644 if (RoundAfterCopy)
3645 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
3646 // This truncation won't change the value.
3647 DAG.getIntPtrConstant(1, dl, /*isTarget=*/true));
3648
3649 if (VA.isExtInLoc()) {
3650 if (VA.getValVT().isVector() &&
3651 VA.getValVT().getScalarType() == MVT::i1 &&
3652 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
3653 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
3654 // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
3655 Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
3656 } else
3657 Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
3658 }
3659
3660 if (VA.getLocInfo() == CCValAssign::BCvt)
3661 Val = DAG.getBitcast(VA.getValVT(), Val);
3662
3663 InVals.push_back(Val);
3664 }
3665
3666 return Chain;
3667}
3668
3669//===----------------------------------------------------------------------===//
3670// C & StdCall & Fast Calling Convention implementation
3671//===----------------------------------------------------------------------===//
3672// The StdCall calling convention is the standard for many Windows API
3673// routines and the like. It differs from the C calling convention only a
3674// little: the callee cleans up the stack, not the caller. Symbols are also
3675// decorated in some fancy way :) It doesn't support any vector arguments.
3676// For info on the fast calling convention see the Fast Calling Convention
3677// (tail call) implementation in LowerX86_32FastCCCallTo.
3678
3679/// Determines whether Args, either a set of outgoing arguments to a call, or a
3680/// set of incoming args of a call, contains an sret pointer that the callee
3681/// pops
3682template <typename T>
3683static bool hasCalleePopSRet(const SmallVectorImpl<T> &Args,
3684 const X86Subtarget &Subtarget) {
3685 // Not C++20 (yet), so no concepts available.
3686 static_assert(std::is_same_v<T, ISD::OutputArg> ||
3687 std::is_same_v<T, ISD::InputArg>,
3688 "requires ISD::OutputArg or ISD::InputArg");
3689
3690 // Only 32-bit targets pop the sret. It's a 64-bit world these days, so
3691 // early-out for most compilations.
3692 if (!Subtarget.is32Bit())
3693 return false;
3694
3695 if (Args.empty())
3696 return false;
3697
3698 // Most calls do not have an sret argument; check the first arg next.
3699 const ISD::ArgFlagsTy &Flags = Args[0].Flags;
3700 if (!Flags.isSRet() || Flags.isInReg())
3701 return false;
3702
3703 // The MSVC ABI does not pop the sret.
3704 if (Subtarget.getTargetTriple().isOSMSVCRT())
3705 return false;
3706
3707 // MCUs don't pop the sret
3708 if (Subtarget.isTargetMCU())
3709 return false;
3710
3711 // Callee pops argument
3712 return true;
3713}
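
Concretely, a 32-bit SysV function returning a large aggregate by value receives a hidden sret pointer as its first stack argument and, on the targets this helper accepts, pops those 4 bytes itself (roughly a "ret $4"); MSVC and MCU targets leave the pointer for the caller to clean up. A hedged C++ sketch of the kind of function that takes this path, purely illustrative and not from this file:

// Returning an aggregate by value on i386 System V passes a hidden
// sret pointer; the callee pops it, so the caller must not.
struct Big { int Data[8]; };

Big makeBig() {            // lowered roughly as "... ; ret $4" on i386 SysV
  Big B{};
  B.Data[0] = 42;
  return B;                // stored through the hidden sret pointer
}

int useBig() { return makeBig().Data[0]; }
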
3714
3715/// Make a copy of an aggregate at the address specified by "Src" to the address
3716/// "Dst" with the size and alignment information specified by the corresponding
3717/// parameter attribute. The copy will be passed as a byval function parameter.
3718static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
3719 SDValue Chain, ISD::ArgFlagsTy Flags,
3720 SelectionDAG &DAG, const SDLoc &dl) {
3721 SDValue SizeNode = DAG.getIntPtrConstant(Flags.getByValSize(), dl);
3722
3723 return DAG.getMemcpy(
3724 Chain, dl, Dst, Src, SizeNode, Flags.getNonZeroByValAlign(),
3725 /*isVolatile*/ false, /*AlwaysInline=*/true,
3726 /*isTailCall*/ false, MachinePointerInfo(), MachinePointerInfo());
3727}
3728
3729/// Return true if the calling convention is one that we can guarantee TCO for.
3730static bool canGuaranteeTCO(CallingConv::ID CC) {
3731 return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
3732 CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
3733 CC == CallingConv::Tail || CC == CallingConv::SwiftTail);
3734}
3735
3736/// Return true if we might ever do TCO for calls with this calling convention.
3737static bool mayTailCallThisCC(CallingConv::ID CC) {
3738 switch (CC) {
3739 // C calling conventions:
3740 case CallingConv::C:
3741 case CallingConv::Win64:
3742 case CallingConv::X86_64_SysV:
3743 // Callee pop conventions:
3744 case CallingConv::X86_ThisCall:
3745 case CallingConv::X86_StdCall:
3746 case CallingConv::X86_VectorCall:
3747 case CallingConv::X86_FastCall:
3748 // Swift:
3749 case CallingConv::Swift:
3750 return true;
3751 default:
3752 return canGuaranteeTCO(CC);
3753 }
3754}
3755
3756/// Return true if the function is being made into a tailcall target by
3757/// changing its ABI.
3758static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
3759 return (GuaranteedTailCallOpt && canGuaranteeTCO(CC)) ||
3760 CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
3761}
3762
3763bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
3764 if (!CI->isTailCall())
3765 return false;
3766
3767 CallingConv::ID CalleeCC = CI->getCallingConv();
3768 if (!mayTailCallThisCC(CalleeCC))
3769 return false;
3770
3771 return true;
3772}
3773
3774SDValue
3775X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
3776 const SmallVectorImpl<ISD::InputArg> &Ins,
3777 const SDLoc &dl, SelectionDAG &DAG,
3778 const CCValAssign &VA,
3779 MachineFrameInfo &MFI, unsigned i) const {
3780 // Create the nodes corresponding to a load from this parameter slot.
3781 ISD::ArgFlagsTy Flags = Ins[i].Flags;
3782 bool AlwaysUseMutable = shouldGuaranteeTCO(
3783 CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
3784 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
3785 EVT ValVT;
3786 MVT PtrVT = getPointerTy(DAG.getDataLayout());
3787
3788 // If the value is passed by pointer, we have the address passed instead of
3789 // the value itself. There is no need to extend if the mask value and its
3790 // location share the same size.
3791 bool ExtendedInMem =
3792 VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
3793 VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();
3794
3795 if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
3796 ValVT = VA.getLocVT();
3797 else
3798 ValVT = VA.getValVT();
3799
3800 // FIXME: For now, all byval parameter objects are marked mutable. This can be
3801 // changed with more analysis.
3802 // In the case of tail call optimization, mark all arguments mutable, since
3803 // they could be overwritten by the lowering of arguments for a tail call.
3804 if (Flags.isByVal()) {
3805 unsigned Bytes = Flags.getByValSize();
3806 if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
3807
3808 // FIXME: For now, all byval parameter objects are marked as aliasing. This
3809 // can be improved with deeper analysis.
3810 int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable,
3811 /*isAliased=*/true);
3812 return DAG.getFrameIndex(FI, PtrVT);
3813 }
3814
3815 EVT ArgVT = Ins[i].ArgVT;
3816
3817 // If this is a vector that has been split into multiple parts, and the
3818 // scalar size of the parts doesn't match the vector element size, then we
3819 // can't elide the copy. The parts will have padding between them instead of
3820 // being packed like a vector.
3821 bool ScalarizedAndExtendedVector =
3822 ArgVT.isVector() && !VA.getLocVT().isVector() &&
3823 VA.getLocVT().getSizeInBits() != ArgVT.getScalarSizeInBits();
3824
3825 // This is an argument in memory. We might be able to perform copy elision:
3826 // if the argument is passed directly in memory without any extension, the
3827 // copy can be elided. Large vector types, for example, may instead be passed
3828 // indirectly by pointer.
3829 if (Flags.isCopyElisionCandidate() &&
3830 VA.getLocInfo() != CCValAssign::Indirect && !ExtendedInMem &&
3831 !ScalarizedAndExtendedVector) {
3832 SDValue PartAddr;
3833 if (Ins[i].PartOffset == 0) {
3834 // If this is a one-part value or the first part of a multi-part value,
3835 // create a stack object for the entire argument value type and return a
3836 // load from our portion of it. This assumes that if the first part of an
3837 // argument is in memory, the rest will also be in memory.
3838 int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
3839 /*IsImmutable=*/false);
3840 PartAddr = DAG.getFrameIndex(FI, PtrVT);
3841 return DAG.getLoad(
3842 ValVT, dl, Chain, PartAddr,
3843 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
3844 } else {
3845 // This is not the first piece of an argument in memory. See if there is
3846 // already a fixed stack object including this offset. If so, assume it
3847 // was created by the PartOffset == 0 branch above and create a load from
3848 // the appropriate offset into it.
3849 int64_t PartBegin = VA.getLocMemOffset();
3850 int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
3851 int FI = MFI.getObjectIndexBegin();
3852 for (; MFI.isFixedObjectIndex(FI); ++FI) {
3853 int64_t ObjBegin = MFI.getObjectOffset(FI);
3854 int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
3855 if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
3856 break;
3857 }
3858 if (MFI.isFixedObjectIndex(FI)) {
3859 SDValue Addr =
3860 DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
3861 DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
3862 return DAG.getLoad(
3863 ValVT, dl, Chain, Addr,
3864 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
3865 Ins[i].PartOffset));
3866 }
3867 }
3868 }
3869
3870 int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
3871 VA.getLocMemOffset(), isImmutable);
3872
3873 // Set SExt or ZExt flag.
3874 if (VA.getLocInfo() == CCValAssign::ZExt) {
3875 MFI.setObjectZExt(FI, true);
3876 } else if (VA.getLocInfo() == CCValAssign::SExt) {
3877 MFI.setObjectSExt(FI, true);
3878 }
3879
3880 MaybeAlign Alignment;
3881 if (Subtarget.isTargetWindowsMSVC() && !Subtarget.is64Bit() &&
3882 ValVT != MVT::f80)
3883 Alignment = MaybeAlign(4);
3884 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
3885 SDValue Val = DAG.getLoad(
3886 ValVT, dl, Chain, FIN,
3887 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
3888 Alignment);
3889 return ExtendedInMem
3890 ? (VA.getValVT().isVector()
3891 ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
3892 : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
3893 : Val;
3894}
3895
3896// FIXME: Get this from tablegen.
3897static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
3898 const X86Subtarget &Subtarget) {
3899 assert(Subtarget.is64Bit());
3900
3901 if (Subtarget.isCallingConvWin64(CallConv)) {
3902 static const MCPhysReg GPR64ArgRegsWin64[] = {
3903 X86::RCX, X86::RDX, X86::R8, X86::R9
3904 };
3905 return ArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
3906 }
3907
3908 static const MCPhysReg GPR64ArgRegs64Bit[] = {
3909 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
3910 };
3911 return ArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
3912}
3913
3914// FIXME: Get this from tablegen.
3915static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
3916 CallingConv::ID CallConv,
3917 const X86Subtarget &Subtarget) {
3918 assert(Subtarget.is64Bit());
3919 if (Subtarget.isCallingConvWin64(CallConv)) {
3920 // The XMM registers which might contain var arg parameters are shadowed
3921 // in their paired GPRs, so we only need to save the GPRs to their home
3922 // slots.
3923 // TODO: __vectorcall will change this.
3924 return std::nullopt;
3925 }
3926
3927 bool isSoftFloat = Subtarget.useSoftFloat();
3928 if (isSoftFloat || !Subtarget.hasSSE1())
3929 // Kernel mode asks for SSE to be disabled, so there are no XMM argument
3930 // registers.
3931 return std::nullopt;
3932
3933 static const MCPhysReg XMMArgRegs64Bit[] = {
3934 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3935 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3936 };
3937 return ArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
3938}
3939
3940#ifndef NDEBUG
3941static bool isSortedByValueNo(ArrayRef<CCValAssign> ArgLocs) {
3942 return llvm::is_sorted(
3943 ArgLocs, [](const CCValAssign &A, const CCValAssign &B) -> bool {
3944 return A.getValNo() < B.getValNo();
3945 });
3946}
3947#endif
3948
3949namespace {
3950/// This is a helper class for lowering variadic function parameters.
3951class VarArgsLoweringHelper {
3952public:
3953 VarArgsLoweringHelper(X86MachineFunctionInfo *FuncInfo, const SDLoc &Loc,
3954 SelectionDAG &DAG, const X86Subtarget &Subtarget,
3955 CallingConv::ID CallConv, CCState &CCInfo)
3956 : FuncInfo(FuncInfo), DL(Loc), DAG(DAG), Subtarget(Subtarget),
3957 TheMachineFunction(DAG.getMachineFunction()),
3958 TheFunction(TheMachineFunction.getFunction()),
3959 FrameInfo(TheMachineFunction.getFrameInfo()),
3960 FrameLowering(*Subtarget.getFrameLowering()),
3961 TargLowering(DAG.getTargetLoweringInfo()), CallConv(CallConv),
3962 CCInfo(CCInfo) {}
3963
3964 // Lower variadic function parameters.
3965 void lowerVarArgsParameters(SDValue &Chain, unsigned StackSize);
3966
3967private:
3968 void createVarArgAreaAndStoreRegisters(SDValue &Chain, unsigned StackSize);
3969
3970 void forwardMustTailParameters(SDValue &Chain);
3971
3972 bool is64Bit() const { return Subtarget.is64Bit(); }
3973 bool isWin64() const { return Subtarget.isCallingConvWin64(CallConv); }
3974
3975 X86MachineFunctionInfo *FuncInfo;
3976 const SDLoc &DL;
3977 SelectionDAG &DAG;
3978 const X86Subtarget &Subtarget;
3979 MachineFunction &TheMachineFunction;
3980 const Function &TheFunction;
3981 MachineFrameInfo &FrameInfo;
3982 const TargetFrameLowering &FrameLowering;
3983 const TargetLowering &TargLowering;
3984 CallingConv::ID CallConv;
3985 CCState &CCInfo;
3986};
3987} // namespace
3988
3989void VarArgsLoweringHelper::createVarArgAreaAndStoreRegisters(
3990 SDValue &Chain, unsigned StackSize) {
3991 // If the function takes a variable number of arguments, make a frame index
3992 // for the start of the first vararg value... for expansion of llvm.va_start.
3993 // We can skip this if there are no va_start calls.
3994 if (is64Bit() || (CallConv != CallingConv::X86_FastCall &&
3995 CallConv != CallingConv::X86_ThisCall)) {
3996 FuncInfo->setVarArgsFrameIndex(
3997 FrameInfo.CreateFixedObject(1, StackSize, true));
3998 }
3999
4000 // 64-bit calling conventions support varargs and register parameters, so we
4001 // have to do extra work to spill them in the prologue.
4002 if (is64Bit()) {
4003 // Find the first unallocated argument registers.
4004 ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
4005 ArrayRef<MCPhysReg> ArgXMMs =
4006 get64BitArgumentXMMs(TheMachineFunction, CallConv, Subtarget);
4007 unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
4008 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
4009
4010 assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
4011 "SSE register cannot be used when SSE is disabled!");
4012
4013 if (isWin64()) {
4014 // Get to the caller-allocated home save location. Add 8 to account
4015 // for the return address.
4016 int HomeOffset = FrameLowering.getOffsetOfLocalArea() + 8;
4017 FuncInfo->setRegSaveFrameIndex(
4018 FrameInfo.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
4019 // Fix up the vararg frame index to point at the shadow area (4 x i64).
4020 if (NumIntRegs < 4)
4021 FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
4022 } else {
4023 // For X86-64, if there are vararg parameters that are passed via
4024 // registers, then we must store them to their spots on the stack so
4025 // they may be loaded by dereferencing the result of va_next.
4026 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
4027 FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
4028 FuncInfo->setRegSaveFrameIndex(FrameInfo.CreateStackObject(
4029 ArgGPRs.size() * 8 + ArgXMMs.size() * 16, Align(16), false));
4030 }
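
The arithmetic above matches the SysV x86-64 va_list layout: the register save area holds all six argument GPRs plus eight XMM registers (6*8 + 8*16 = 176 bytes), gp_offset starts past the integer registers already consumed by named arguments, and fp_offset starts past the whole GPR block plus any XMM registers already consumed. A small worked sketch with illustrative counts, not code from this file:

#include <cstdio>

int main() {
  // Assume two named integer arguments and one named FP argument were
  // already passed in registers before the variadic part begins.
  const unsigned NumGPRs = 6, NumXMMs = 8;
  const unsigned NumIntRegs = 2, NumXMMRegs = 1;

  unsigned RegSaveAreaSize = NumGPRs * 8 + NumXMMs * 16; // 48 + 128 = 176
  unsigned GPOffset = NumIntRegs * 8;                    // 16
  unsigned FPOffset = NumGPRs * 8 + NumXMMRegs * 16;     // 48 + 16 = 64

  std::printf("save area %u, gp_offset %u, fp_offset %u\n",
              RegSaveAreaSize, GPOffset, FPOffset);      // 176, 16, 64
  return 0;
}
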
4031
4032 SmallVector<SDValue, 6>
4033 LiveGPRs; // SDValues for GPR registers holding live-in argument values
4034 SmallVector<SDValue, 8> LiveXMMRegs; // SDValues for XMM registers
4035 // holding live-in argument values
4036 SDValue ALVal; // if applicable, the SDValue for the %al register
4037
4038 // Gather all the live in physical registers.
4039 for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
4040 Register GPR = TheMachineFunction.addLiveIn(Reg, &X86::GR64RegClass);
4041 LiveGPRs.push_back(DAG.getCopyFromReg(Chain, DL, GPR, MVT::i64));
4042 }
4043 const auto &AvailableXmms = ArgXMMs.slice(NumXMMRegs);
4044 if (!AvailableXmms.empty()) {
4045 Register AL = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
4046 ALVal = DAG.getCopyFromReg(Chain, DL, AL, MVT::i8);
4047 for (MCPhysReg Reg : AvailableXmms) {
4048 // FastRegisterAllocator spills virtual registers at basic
4049 // block boundaries. That leads to uses of XMM registers
4050 // outside of the check for %al. Pass physical registers to
4051 // VASTART_SAVE_XMM_REGS to avoid unnecessary spilling.
4052 TheMachineFunction.getRegInfo().addLiveIn(Reg);
4053 LiveXMMRegs.push_back(DAG.getRegister(Reg, MVT::v4f32));
4054 }
4055 }
4056
4057 // Store the integer parameter registers.
4058 SmallVector<SDValue, 8> MemOps;
4059 SDValue RSFIN =
4060 DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
4061 TargLowering.getPointerTy(DAG.getDataLayout()));
4062 unsigned Offset = FuncInfo->getVarArgsGPOffset();
4063 for (SDValue Val : LiveGPRs) {
4064 SDValue FIN = DAG.getNode(ISD::ADD, DL,
4065 TargLowering.getPointerTy(DAG.getDataLayout()),
4066 RSFIN, DAG.getIntPtrConstant(Offset, DL));
4067 SDValue Store =
4068 DAG.getStore(Val.getValue(1), DL, Val, FIN,
4069 MachinePointerInfo::getFixedStack(
4070 DAG.getMachineFunction(),
4071 FuncInfo->getRegSaveFrameIndex(), Offset));
4072 MemOps.push_back(Store);
4073 Offset += 8;
4074 }
4075
4076 // Now store the XMM (fp + vector) parameter registers.
4077 if (!LiveXMMRegs.empty()) {
4078 SmallVector<SDValue, 12> SaveXMMOps;
4079 SaveXMMOps.push_back(Chain);
4080 SaveXMMOps.push_back(ALVal);
4081 SaveXMMOps.push_back(RSFIN);
4082 SaveXMMOps.push_back(
4083 DAG.getTargetConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32));
4084 llvm::append_range(SaveXMMOps, LiveXMMRegs);
4085 MachineMemOperand *StoreMMO =
4086 DAG.getMachineFunction().getMachineMemOperand(
4087 MachinePointerInfo::getFixedStack(
4088 DAG.getMachineFunction(), FuncInfo->getRegSaveFrameIndex(),
4089 Offset),
4090 MachineMemOperand::MOStore, 128, Align(16));
4091 MemOps.push_back(DAG.getMemIntrinsicNode(X86ISD::VASTART_SAVE_XMM_REGS,
4092 DL, DAG.getVTList(MVT::Other),
4093 SaveXMMOps, MVT::i8, StoreMMO));
4094 }
4095
4096 if (!MemOps.empty())
4097 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
4098 }
4099}
4100
4101void VarArgsLoweringHelper::forwardMustTailParameters(SDValue &Chain) {
4102 // Find the largest legal vector type.
4103 MVT VecVT = MVT::Other;
4104 // FIXME: Only some x86_32 calling conventions support AVX512.
4105 if (Subtarget.useAVX512Regs() &&
4106 (is64Bit() || (CallConv == CallingConv::X86_VectorCall ||
4107 CallConv == CallingConv::Intel_OCL_BI)))
4108 VecVT = MVT::v16f32;
4109 else if (Subtarget.hasAVX())
4110 VecVT = MVT::v8f32;
4111 else if (Subtarget.hasSSE2())
4112 VecVT = MVT::v4f32;
4113
4114 // We forward some GPRs and some vector types.
4115 SmallVector<MVT, 2> RegParmTypes;
4116 MVT IntVT = is64Bit() ? MVT::i64 : MVT::i32;
4117 RegParmTypes.push_back(IntVT);
4118 if (VecVT != MVT::Other)
4119 RegParmTypes.push_back(VecVT);
4120
4121 // Compute the set of forwarded registers. The rest are scratch.
4122 SmallVectorImpl<ForwardedRegister> &Forwards =
4123 FuncInfo->getForwardedMustTailRegParms();
4124 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
4125
4126 // Forward AL for SysV x86_64 targets, since it is used for varargs.
4127 if (is64Bit() && !isWin64() && !CCInfo.isAllocated(X86::AL)) {
4128 Register ALVReg = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
4129 Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
4130 }
4131
4132 // Copy all forwards from physical to virtual registers.
4133 for (ForwardedRegister &FR : Forwards) {
4134 // FIXME: Can we use a less constrained schedule?
4135 SDValue RegVal = DAG.getCopyFromReg(Chain, DL, FR.VReg, FR.VT);
4136 FR.VReg = TheMachineFunction.getRegInfo().createVirtualRegister(
4137 TargLowering.getRegClassFor(FR.VT));
4138 Chain = DAG.getCopyToReg(Chain, DL, FR.VReg, RegVal);
4139 }
4140}
4141
4142void VarArgsLoweringHelper::lowerVarArgsParameters(SDValue &Chain,
4143 unsigned StackSize) {
4144 // Set FrameIndex to the 0xAAAAAAA value to mark the unset state.
4145 // If necessary, it will be set to the correct value later.
4146 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
4147 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
4148
4149 if (FrameInfo.hasVAStart())
4150 createVarArgAreaAndStoreRegisters(Chain, StackSize);
4151
4152 if (FrameInfo.hasMustTailInVarArgFunc())
4153 forwardMustTailParameters(Chain);
4154}
4155
4156SDValue X86TargetLowering::LowerFormalArguments(
4157 SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
4158 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
4159 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
4160 MachineFunction &MF = DAG.getMachineFunction();
4161 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
4162
4163 const Function &F = MF.getFunction();
4164 if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() &&
4165 F.getName() == "main")
4166 FuncInfo->setForceFramePointer(true);
4167
4168 MachineFrameInfo &MFI = MF.getFrameInfo();
4169 bool Is64Bit = Subtarget.is64Bit();
4170 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
4171
4172 assert(
4173 !(IsVarArg && canGuaranteeTCO(CallConv)) &&
4174 "Var args not supported with calling conv' regcall, fastcc, ghc or hipe");
4175
4176 // Assign locations to all of the incoming arguments.
4177 SmallVector<CCValAssign, 16> ArgLocs;
4178 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
4179
4180 // Allocate shadow area for Win64.
4181 if (IsWin64)
4182 CCInfo.AllocateStack(32, Align(8));
4183
4184 CCInfo.AnalyzeArguments(Ins, CC_X86);
4185
4186 // In vectorcall calling convention a second pass is required for the HVA
4187 // types.
4188 if (CallingConv::X86_VectorCall == CallConv) {
4189 CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
4190 }
4191
4192 // The next loop assumes that the locations are in the same order as the
4193 // input arguments.
4194 assert(isSortedByValueNo(ArgLocs) &&
4195 "Argument Location list must be sorted before lowering");
4196
4197 SDValue ArgValue;
4198 for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
4199 ++I, ++InsIndex) {
4200 assert(InsIndex < Ins.size() && "Invalid Ins index");
4201 CCValAssign &VA = ArgLocs[I];
4202
4203 if (VA.isRegLoc()) {
4204 EVT RegVT = VA.getLocVT();
4205 if (VA.needsCustom()) {
4206 assert(
4207 VA.getValVT() == MVT::v64i1 &&
4208 "Currently the only custom case is when we split v64i1 to 2 regs");
4209
4210 // v64i1 values, in regcall calling convention, that are
4211 // compiled to 32 bit arch, are split up into two registers.
4212 ArgValue =
4213 getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
4214 } else {
4215 const TargetRegisterClass *RC;
4216 if (RegVT == MVT::i8)
4217 RC = &X86::GR8RegClass;
4218 else if (RegVT == MVT::i16)
4219 RC = &X86::GR16RegClass;
4220 else if (RegVT == MVT::i32)
4221 RC = &X86::GR32RegClass;
4222 else if (Is64Bit && RegVT == MVT::i64)
4223 RC = &X86::GR64RegClass;
4224 else if (RegVT == MVT::f16)
4225 RC = Subtarget.hasAVX512() ? &X86::FR16XRegClass : &X86::FR16RegClass;
4226 else if (RegVT == MVT::f32)
4227 RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
4228 else if (RegVT == MVT::f64)
4229 RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
4230 else if (RegVT == MVT::f80)
4231 RC = &X86::RFP80RegClass;
4232 else if (RegVT == MVT::f128)
4233 RC = &X86::VR128RegClass;
4234 else if (RegVT.is512BitVector())
4235 RC = &X86::VR512RegClass;
4236 else if (RegVT.is256BitVector())
4237 RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
4238 else if (RegVT.is128BitVector())
4239 RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
4240 else if (RegVT == MVT::x86mmx)
4241 RC = &X86::VR64RegClass;
4242 else if (RegVT == MVT::v1i1)
4243 RC = &X86::VK1RegClass;
4244 else if (RegVT == MVT::v8i1)
4245 RC = &X86::VK8RegClass;
4246 else if (RegVT == MVT::v16i1)
4247 RC = &X86::VK16RegClass;
4248 else if (RegVT == MVT::v32i1)
4249 RC = &X86::VK32RegClass;
4250 else if (RegVT == MVT::v64i1)
4251 RC = &X86::VK64RegClass;
4252 else
4253 llvm_unreachable("Unknown argument type!");
4254
4255 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
4256 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
4257 }
4258
4259 // If this is an 8 or 16-bit value, it is really passed promoted to 32
4260 // bits. Insert an assert[sz]ext to capture this, then truncate to the
4261 // right size.
4262 if (VA.getLocInfo() == CCValAssign::SExt)
4263 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
4264 DAG.getValueType(VA.getValVT()));
4265 else if (VA.getLocInfo() == CCValAssign::ZExt)
4266 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
4267 DAG.getValueType(VA.getValVT()));
4268 else if (VA.getLocInfo() == CCValAssign::BCvt)
4269 ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
4270
4271 if (VA.isExtInLoc()) {
4272 // Handle MMX values passed in XMM regs.
4273 if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
4274 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
4275 else if (VA.getValVT().isVector() &&
4276 VA.getValVT().getScalarType() == MVT::i1 &&
4277 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
4278 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
4279 // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
4280 ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
4281 } else
4282 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
4283 }
4284 } else {
4285 assert(VA.isMemLoc());
4286 ArgValue =
4287 LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
4288 }
4289
4290 // If the value is passed via a pointer, do a load.
4291 if (VA.getLocInfo() == CCValAssign::Indirect && !Ins[I].Flags.isByVal())
4292 ArgValue =
4293 DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
4294
4295 InVals.push_back(ArgValue);
4296 }
4297
4298 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
4299 if (Ins[I].Flags.isSwiftAsync()) {
4300 auto X86FI = MF.getInfo<X86MachineFunctionInfo>();
4301 if (Subtarget.is64Bit())
4302 X86FI->setHasSwiftAsyncContext(true);
4303 else {
4304 int FI = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
4305 X86FI->setSwiftAsyncContextFrameIdx(FI);
4306 SDValue St = DAG.getStore(DAG.getEntryNode(), dl, InVals[I],
4307 DAG.getFrameIndex(FI, MVT::i32),
4308 MachinePointerInfo::getFixedStack(MF, FI));
4309 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, St, Chain);
4310 }
4311 }
4312
4313 // Swift calling convention does not require we copy the sret argument
4314 // into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
4315 if (CallConv == CallingConv::Swift || CallConv == CallingConv::SwiftTail)
4316 continue;
4317
4318 // All x86 ABIs require that for returning structs by value we copy the
4319 // sret argument into %rax/%eax (depending on ABI) for the return. Save
4320 // the argument into a virtual register so that we can access it from the
4321 // return points.
4322 if (Ins[I].Flags.isSRet()) {
4323 assert(!FuncInfo->getSRetReturnReg() &&
4324 "SRet return has already been set");
4325 MVT PtrTy = getPointerTy(DAG.getDataLayout());
4326 Register Reg =
4327 MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
4328 FuncInfo->setSRetReturnReg(Reg);
4329 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
4330 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
4331 break;
4332 }
4333 }
4334
4335 unsigned StackSize = CCInfo.getNextStackOffset();
4336 // Align stack specially for tail calls.
4337 if (shouldGuaranteeTCO(CallConv,
4338 MF.getTarget().Options.GuaranteedTailCallOpt))
4339 StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
4340
4341 if (IsVarArg)
4342 VarArgsLoweringHelper(FuncInfo, dl, DAG, Subtarget, CallConv, CCInfo)
4343 .lowerVarArgsParameters(Chain, StackSize);
4344
4345 // Some CCs need callee pop.
4346 if (X86::isCalleePop(CallConv, Is64Bit, IsVarArg,
4347 MF.getTarget().Options.GuaranteedTailCallOpt)) {
4348 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
4349 } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
4350 // X86 interrupts must pop the error code (and the alignment padding) if
4351 // present.
4352 FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
4353 } else {
4354 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
4355 // If this is an sret function, the return should pop the hidden pointer.
4356 if (!canGuaranteeTCO(CallConv) && hasCalleePopSRet(Ins, Subtarget))
4357 FuncInfo->setBytesToPopOnReturn(4);
4358 }
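
StackSize here is the byte count of named stack arguments, so for a 32-bit stdcall callee taking two i32 parameters BytesToPopOnReturn becomes 8 and the return is emitted as "ret $8"; a plain cdecl callee pops nothing, and a 32-bit non-MSVC sret function pops only the 4-byte hidden pointer. A hedged sketch of a declaration that takes the callee-pop path, illustrative only:

// On 32-bit x86, a stdcall callee cleans up its own stack arguments:
// two i32 parameters -> BytesToPopOnReturn == 8 ("ret $8").
#if defined(__i386__)
__attribute__((stdcall)) int addPair(int A, int B) { return A + B; }
#else
int addPair(int A, int B) { return A + B; } // stdcall only matters on 32-bit
#endif
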
4359
4360 if (!Is64Bit) {
4361 // RegSaveFrameIndex is X86-64 only.
4362 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
4363 }
4364
4365 FuncInfo->setArgumentStackSize(StackSize);
4366
4367 if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
4368 EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn());
4369 if (Personality == EHPersonality::CoreCLR) {
4370 assert(Is64Bit);
4371 // TODO: Add a mechanism to frame lowering that will allow us to indicate
4372 // that we'd prefer this slot be allocated towards the bottom of the frame
4373 // (i.e. near the stack pointer after allocating the frame). Every
4374 // funclet needs a copy of this slot in its (mostly empty) frame, and the
4375 // offset from the bottom of this and each funclet's frame must be the
4376 // same, so the size of funclets' (mostly empty) frames is dictated by
4377 // how far this slot is from the bottom (since they allocate just enough
4378 // space to accommodate holding this slot at the correct offset).
4379 int PSPSymFI = MFI.CreateStackObject(8, Align(8), /*isSpillSlot=*/false);
4380 EHInfo->PSPSymFrameIdx = PSPSymFI;
4381 }
4382 }
4383
4384 if (shouldDisableArgRegFromCSR(CallConv) ||
4385 F.hasFnAttribute("no_caller_saved_registers")) {
4386 MachineRegisterInfo &MRI = MF.getRegInfo();
4387 for (std::pair<Register, Register> Pair : MRI.liveins())
4388 MRI.disableCalleeSavedRegister(Pair.first);
4389 }
4390
4391 return Chain;
4392}
4393
4394SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
4395 SDValue Arg, const SDLoc &dl,
4396 SelectionDAG &DAG,
4397 const CCValAssign &VA,
4398 ISD::ArgFlagsTy Flags,
4399 bool isByVal) const {
4400 unsigned LocMemOffset = VA.getLocMemOffset();
4401 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
4402 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
4403 StackPtr, PtrOff);
4404 if (isByVal)
4405 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
4406
4407 MaybeAlign Alignment;
4408 if (Subtarget.isTargetWindowsMSVC() && !Subtarget.is64Bit() &&
4409 Arg.getSimpleValueType() != MVT::f80)
4410 Alignment = MaybeAlign(4);
4411 return DAG.getStore(
4412 Chain, dl, Arg, PtrOff,
4413 MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset),
4414 Alignment);
4415}
4416
4417/// Emit a load of the return address if tail call
4418/// optimization is performed and it is required.
4419SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
4420 SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
4421 bool Is64Bit, int FPDiff, const SDLoc &dl) const {
4422 // Adjust the Return address stack slot.
4423 EVT VT = getPointerTy(DAG.getDataLayout());
4424 OutRetAddr = getReturnAddressFrameIndex(DAG);
4425
4426 // Load the "old" Return address.
4427 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
4428 return SDValue(OutRetAddr.getNode(), 1);
4429}
4430
4431/// Emit a store of the return address if tail call
4432/// optimization is performed and it is required (FPDiff!=0).
4433static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
4434 SDValue Chain, SDValue RetAddrFrIdx,
4435 EVT PtrVT, unsigned SlotSize,
4436 int FPDiff, const SDLoc &dl) {
4437 // Store the return address to the appropriate stack slot.
4438 if (!FPDiff) return Chain;
4439 // Calculate the new stack slot for the return address.
4440 int NewReturnAddrFI =
4441 MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
4442 false);
4443 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
4444 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
4445 MachinePointerInfo::getFixedStack(
4446 DAG.getMachineFunction(), NewReturnAddrFI));
4447 return Chain;
4448}
4449
4450/// Returns a vector_shuffle mask for a movs{s|d} or movd
4451/// operation of the specified width.
4452static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
4453 SDValue V2) {
4454 unsigned NumElems = VT.getVectorNumElements();
4455 SmallVector<int, 8> Mask;
4456 Mask.push_back(NumElems);
4457 for (unsigned i = 1; i != NumElems; ++i)
4458 Mask.push_back(i);
4459 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
4460}
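
For a 4-lane type the mask built above is {4, 1, 2, 3}: shuffle indices NumElems and higher address V2, so lane 0 is taken from V2 while lanes 1..3 keep V1, which is the movss/movsd "replace the low element, keep the high lanes" pattern. A standalone sketch of the same mask construction using plain integers, with no SelectionDAG involved:

#include <cassert>
#include <vector>

// Build the MOVL-style shuffle mask for a vector with NumElems lanes:
// lane 0 comes from the second operand, lanes 1..N-1 from the first.
static std::vector<int> getMOVLMask(unsigned NumElems) {
  std::vector<int> Mask;
  Mask.push_back(NumElems);    // lane 0 <- V2[0]
  for (unsigned i = 1; i != NumElems; ++i)
    Mask.push_back(i);         // lane i <- V1[i]
  return Mask;
}

int main() {
  assert(getMOVLMask(4) == (std::vector<int>{4, 1, 2, 3}));
  return 0;
}
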
4461
4462SDValue
4463X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
4464 SmallVectorImpl<SDValue> &InVals) const {
4465 SelectionDAG &DAG = CLI.DAG;
4466 SDLoc &dl = CLI.DL;
4467 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
4468 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
4469 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
4470 SDValue Chain = CLI.Chain;
4471 SDValue Callee = CLI.Callee;
4472 CallingConv::ID CallConv = CLI.CallConv;
4473 bool &isTailCall = CLI.IsTailCall;
4474 bool isVarArg = CLI.IsVarArg;
4475 const auto *CB = CLI.CB;
4476
4477 MachineFunction &MF = DAG.getMachineFunction();
4478 bool Is64Bit = Subtarget.is64Bit();
4479 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
4480 bool IsSibcall = false;
4481 bool IsGuaranteeTCO = MF.getTarget().Options.GuaranteedTailCallOpt ||
4482 CallConv == CallingConv::Tail || CallConv == CallingConv::SwiftTail;
4483 bool IsCalleePopSRet = !IsGuaranteeTCO && hasCalleePopSRet(Outs, Subtarget);
4484 X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
4485 bool HasNCSR = (CB && isa<CallInst>(CB) &&
4486 CB->hasFnAttr("no_caller_saved_registers"));
4487 bool HasNoCfCheck = (CB && CB->doesNoCfCheck());
4488 bool IsIndirectCall = (CB && isa<CallInst>(CB) && CB->isIndirectCall());
4489 bool IsCFICall = IsIndirectCall && CLI.CFIType;
4490 const Module *M = MF.getMMI().getModule();
4491 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
4492
4493 MachineFunction::CallSiteInfo CSInfo;
4494 if (CallConv == CallingConv::X86_INTR)
4495 report_fatal_error("X86 interrupts may not be called directly");
4496
4497 bool IsMustTail = CLI.CB && CLI.CB->isMustTailCall();
4498 if (Subtarget.isPICStyleGOT() && !IsGuaranteeTCO && !IsMustTail) {
4499 // If we are using a GOT, disable tail calls to external symbols with
4500 // default visibility. Tail calling such a symbol requires using a GOT
4501 // relocation, which forces early binding of the symbol. This breaks code
4502 // that requires lazy function symbol resolution. Using musttail or
4503 // GuaranteedTailCallOpt will override this.
4504 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
4505 if (!G || (!G->getGlobal()->hasLocalLinkage() &&
4506 G->getGlobal()->hasDefaultVisibility()))
4507 isTailCall = false;
4508 }
4509
4510 if (isTailCall && !IsMustTail) {
4511 // Check if it's really possible to do a tail call.
4512 isTailCall = IsEligibleForTailCallOptimization(
4513 Callee, CallConv, IsCalleePopSRet, isVarArg, CLI.RetTy, Outs, OutVals,
4514 Ins, DAG);
4515
4516 // Sibcalls are automatically detected tailcalls which do not require
4517 // ABI changes.
4518 if (!IsGuaranteeTCO && isTailCall)
4519 IsSibcall = true;
4520
4521 if (isTailCall)
4522 ++NumTailCalls;
4523 }
4524
4525 if (IsMustTail && !isTailCall)
4526 report_fatal_error("failed to perform tail call elimination on a call "
4527 "site marked musttail");
4528
4529 assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
4530 "Var args not supported with calling convention fastcc, ghc or hipe");
4531
4532 // Analyze operands of the call, assigning locations to each operand.
4533 SmallVector<CCValAssign, 16> ArgLocs;
4534 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
4535
4536 // Allocate shadow area for Win64.
4537 if (IsWin64)
4538 CCInfo.AllocateStack(32, Align(8));
4539
4540 CCInfo.AnalyzeArguments(Outs, CC_X86);
4541
4542 // In vectorcall calling convention a second pass is required for the HVA
4543 // types.
4544 if (CallingConv::X86_VectorCall == CallConv) {
4545 CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
4546 }
4547
4548 // Get a count of how many bytes are to be pushed on the stack.
4549 unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
4550 if (IsSibcall)
4551 // This is a sibcall. The memory operands are available in the caller's
4552 // own caller's stack.
4553 NumBytes = 0;
4554 else if (IsGuaranteeTCO && canGuaranteeTCO(CallConv))
4555 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
4556
4557 int FPDiff = 0;
4558 if (isTailCall &&
4559 shouldGuaranteeTCO(CallConv,
4560 MF.getTarget().Options.GuaranteedTailCallOpt)) {
4561 // Lower arguments at fp - stackoffset + fpdiff.
4562 unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
4563
4564 FPDiff = NumBytesCallerPushed - NumBytes;
4565
4566 // Set the delta of movement of the return address stack slot, but only
4567 // record it if the movement is larger (FPDiff is smaller) than before.
4568 if (FPDiff < X86Info->getTCReturnAddrDelta())
4569 X86Info->setTCReturnAddrDelta(FPDiff);
4570 }
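
FPDiff is the difference between the argument bytes this function was entered with (and will pop on return) and the bytes the tail callee needs, so a callee that needs a larger argument area produces a negative FPDiff and the return address slot has to move down. A small worked sketch with illustrative sizes:

#include <cstdio>

int main() {
  // Guaranteed-TCO tail call bookkeeping, with made-up sizes.
  int NumBytesCallerPushed = 16; // bytes this function was entered with
  int NumBytes = 32;             // bytes the tail callee needs
  int SlotSize = 8;              // return-address slot size on x86-64

  int FPDiff = NumBytesCallerPushed - NumBytes; // -16
  // EmitTailCallStoreRetAddr (defined earlier in this file) re-creates the
  // return-address slot as a fixed object at offset FPDiff - SlotSize.
  std::printf("FPDiff = %d, new ret-addr slot offset = %d\n",
              FPDiff, FPDiff - SlotSize);       // -16 and -24
  return 0;
}
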
4571
4572 unsigned NumBytesToPush = NumBytes;
4573 unsigned NumBytesToPop = NumBytes;
4574
4575 // If we have an inalloca argument, all stack space has already been
4576 // allocated for us and is right at the top of the stack. We don't support
4577 // multiple arguments passed in memory when using inalloca.
4578 if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
4579 NumBytesToPush = 0;
4580 if (!ArgLocs.back().isMemLoc())
4581 report_fatal_error("cannot use inalloca attribute on a register "
4582 "parameter");
4583 if (ArgLocs.back().getLocMemOffset() != 0)
4584 report_fatal_error("any parameter with the inalloca attribute must be "
4585 "the only memory argument");
4586 } else if (CLI.IsPreallocated) {
4587 assert(ArgLocs.back().isMemLoc() &&
4588 "cannot use preallocated attribute on a register "
4589 "parameter");
4590 SmallVector<size_t, 4> PreallocatedOffsets;
4591 for (size_t i = 0; i < CLI.OutVals.size(); ++i) {
4592 if (CLI.CB->paramHasAttr(i, Attribute::Preallocated)) {
4593 PreallocatedOffsets.push_back(ArgLocs[i].getLocMemOffset());
4594 }
4595 }
4596 auto *MFI = DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
4597 size_t PreallocatedId = MFI->getPreallocatedIdForCallSite(CLI.CB);
4598 MFI->setPreallocatedStackSize(PreallocatedId, NumBytes);
4599 MFI->setPreallocatedArgOffsets(PreallocatedId, PreallocatedOffsets);
4600 NumBytesToPush = 0;
4601 }
4602
4603 if (!IsSibcall && !IsMustTail)
4604 Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
4605 NumBytes - NumBytesToPush, dl);
4606
4607 SDValue RetAddrFrIdx;
4608 // Load return address for tail calls.
4609 if (isTailCall && FPDiff)
4610 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
4611 Is64Bit, FPDiff, dl);
4612
4613 SmallVector<std::pair<Register, SDValue>, 8> RegsToPass;
4614 SmallVector<SDValue, 8> MemOpChains;
4615 SDValue StackPtr;
4616
4617 // The next loop assumes that the locations are in the same order as the
4618 // input arguments.
4619 assert(isSortedByValueNo(ArgLocs) &&
4620 "Argument Location list must be sorted before lowering");
4621
4622 // Walk the register/memloc assignments, inserting copies/loads. In the case
4623 // of tail call optimization, arguments are handled later.
4624 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4625 for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
4626 ++I, ++OutIndex) {
4627 assert(OutIndex < Outs.size() && "Invalid Out index");
4628 // Skip inalloca/preallocated arguments, they have already been written.
4629 ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
4630 if (Flags.isInAlloca() || Flags.isPreallocated())
4631 continue;
4632
4633 CCValAssign &VA = ArgLocs[I];
4634 EVT RegVT = VA.getLocVT();
4635 SDValue Arg = OutVals[OutIndex];
4636 bool isByVal = Flags.isByVal();
4637
4638 // Promote the value if needed.
4639 switch (VA.getLocInfo()) {
4640 default: llvm_unreachable("Unknown loc info!");
4641 case CCValAssign::Full: break;
4642 case CCValAssign::SExt:
4643 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
4644 break;
4645 case CCValAssign::ZExt:
4646 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
4647 break;
4648 case CCValAssign::AExt:
4649 if (Arg.getValueType().isVector() &&
4650 Arg.getValueType().getVectorElementType() == MVT::i1)
4651 Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
4652 else if (RegVT.is128BitVector()) {
4653 // Special case: passing MMX values in XMM registers.
4654 Arg = DAG.getBitcast(MVT::i64, Arg);
4655 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
4656 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
4657 } else
4658 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
4659 break;
4660 case CCValAssign::BCvt:
4661 Arg = DAG.getBitcast(RegVT, Arg);
4662 break;
4663 case CCValAssign::Indirect: {
4664 if (isByVal) {
4665 // Memcpy the argument to a temporary stack slot to prevent
4666 // the caller from seeing any modifications the callee may make
4667 // as guaranteed by the `byval` attribute.
4668 int FrameIdx = MF.getFrameInfo().CreateStackObject(
4669 Flags.getByValSize(),
4670 std::max(Align(16), Flags.getNonZeroByValAlign()), false);
4671 SDValue StackSlot =
4672 DAG.getFrameIndex(FrameIdx, getPointerTy(DAG.getDataLayout()));
4673 Chain =
4674 CreateCopyOfByValArgument(Arg, StackSlot, Chain, Flags, DAG, dl);
4675 // From now on treat this as a regular pointer
4676 Arg = StackSlot;
4677 isByVal = false;
4678 } else {
4679 // Store the argument.
4680 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
4681 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
4682 Chain = DAG.getStore(
4683 Chain, dl, Arg, SpillSlot,
4684 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
4685 Arg = SpillSlot;
4686 }
4687 break;
4688 }
4689 }
4690
4691 if (VA.needsCustom()) {
4692 assert(VA.getValVT() == MVT::v64i1 &&
4693 "Currently the only custom case is when we split v64i1 to 2 regs");
4694 // Split v64i1 value into two registers
4695 Passv64i1ArgInRegs(dl, DAG, Arg, RegsToPass, VA, ArgLocs[++I], Subtarget);
4696 } else if (VA.isRegLoc()) {
4697 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
4698 const TargetOptions &Options = DAG.getTarget().Options;
4699 if (Options.EmitCallSiteInfo)
4700 CSInfo.emplace_back(VA.getLocReg(), I);
4701 if (isVarArg && IsWin64) {
4702 // The Win64 ABI requires an argument XMM reg to be copied to the
4703 // corresponding shadow reg if the callee is a varargs function.
4704 Register ShadowReg;
4705 switch (VA.getLocReg()) {
4706 case X86::XMM0: ShadowReg = X86::RCX; break;
4707 case X86::XMM1: ShadowReg = X86::RDX; break;
4708 case X86::XMM2: ShadowReg = X86::R8; break;
4709 case X86::XMM3: ShadowReg = X86::R9; break;
4710 }
4711 if (ShadowReg)
4712 RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
4713 }
4714 } else if (!IsSibcall && (!isTailCall || isByVal)) {
4715 assert(VA.isMemLoc());
4716 if (!StackPtr.getNode())
4717 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
4718 getPointerTy(DAG.getDataLayout()));
4719 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
4720 dl, DAG, VA, Flags, isByVal));
4721 }
4722 }
4723
4724 if (!MemOpChains.empty())
4725 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
4726
4727 if (Subtarget.isPICStyleGOT()) {
4728 // ELF / PIC requires the GOT pointer to be in the EBX register before
4729 // function calls via the PLT (except for regcall).
4730 if (!isTailCall) {
4731 // An indirect call with the RegCall calling convention may use up all the
4732 // general-purpose registers, so it is not suitable to bind the EBX register
4733 // to the GOT address; just let the register allocator handle it.
4734 if (CallConv != CallingConv::X86_RegCall)
4735 RegsToPass.push_back(std::make_pair(
4736 Register(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
4737 getPointerTy(DAG.getDataLayout()))));
4738 } else {
4739 // If we are tail calling and generating PIC/GOT style code, load the
4740 // address of the callee into ECX. The value in ecx is used as target of
4741 // the tail jump. This is done to circumvent the ebx/callee-saved problem
4742 // for tail calls on PIC/GOT architectures. Normally we would just put the
4743 // address of GOT into ebx and then call target@PLT. But for tail calls
4744 // ebx would be restored (since ebx is callee saved) before jumping to the
4745 // target@PLT.
4746
4747 // Note: The actual moving to ECX is done further down.
4748 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
4749 if (G && !G->getGlobal()->hasLocalLinkage() &&
4750 G->getGlobal()->hasDefaultVisibility())
4751 Callee = LowerGlobalAddress(Callee, DAG);
4752 else if (isa<ExternalSymbolSDNode>(Callee))
4753 Callee = LowerExternalSymbol(Callee, DAG);
4754 }
4755 }
4756
4757 if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail &&
4758 (Subtarget.hasSSE1() || !M->getModuleFlag("SkipRaxSetup"))) {
4759 // From AMD64 ABI document:
4760 // For calls that may call functions that use varargs or stdargs
4761 // (prototype-less calls or calls to functions containing ellipsis (...) in
4762 // the declaration) %al is used as a hidden argument to specify the number
4763 // of SSE registers used. The contents of %al do not need to match exactly
4764 // the number of registers, but must be an upper bound on the number of SSE
4765 // registers used and is in the range 0 - 8 inclusive.
4766
4767 // Count the number of XMM registers allocated.
4768 static const MCPhysReg XMMArgRegs[] = {
4769 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
4770 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
4771 };
4772 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
4773 assert((Subtarget.hasSSE1() || !NumXMMRegs)
4774 && "SSE registers cannot be used when SSE is disabled");
4775 RegsToPass.push_back(std::make_pair(Register(X86::AL),
4776 DAG.getConstant(NumXMMRegs, dl,
4777 MVT::i8)));
4778 }
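
On SysV x86-64 this shows up as a move into %al right before calling a variadic function (often "movl $1, %eax", or zeroing %eax when no vector registers carry arguments); the value only has to be an upper bound, between 0 and 8, on the SSE registers actually used. A hedged sketch of a call that needs %al = 1, using standard printf rather than code from this file:

#include <cstdio>

int main() {
  // One double argument travels in %xmm0, so the compiler typically
  // materializes %al = 1 (an upper bound on the SSE registers used)
  // before the variadic call.
  double Pi = 3.14159;
  std::printf("pi ~= %f\n", Pi);
  return 0;
}
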
4779
4780 if (isVarArg && IsMustTail) {
4781 const auto &Forwards = X86Info->getForwardedMustTailRegParms();
4782 for (const auto &F : Forwards) {
4783 SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
4784 RegsToPass.push_back(std::make_pair(F.PReg, Val));
4785 }
4786 }
4787
4788 // For tail calls lower the arguments to the 'real' stack slots. Sibcalls
4789 // don't need this because the eligibility check rejects calls that require
4790 // shuffling arguments passed in memory.
4791 if (!IsSibcall && isTailCall) {
4792 // Force all the incoming stack arguments to be loaded from the stack
4793 // before any new outgoing arguments are stored to the stack, because the
4794 // outgoing stack slots may alias the incoming argument stack slots, and
4795 // the alias isn't otherwise explicit. This is slightly more conservative
4796 // than necessary, because it means that each store effectively depends
4797 // on every argument instead of just those arguments it would clobber.
4798 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
4799
4800 SmallVector<SDValue, 8> MemOpChains2;
4801 SDValue FIN;
4802 int FI = 0;
4803 for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
4804 ++I, ++OutsIndex) {
4805 CCValAssign &VA = ArgLocs[I];
4806
4807 if (VA.isRegLoc()) {
4808 if (VA.needsCustom()) {
4809        assert((CallConv == CallingConv::X86_RegCall) &&
4810               "Expecting custom case only in regcall calling convention");
4811        // This means that we are in a special case where one argument was
4812        // passed through two register locations - skip the next location.
4813 ++I;
4814 }
4815
4816 continue;
4817 }
4818
4819      assert(VA.isMemLoc());
4820 SDValue Arg = OutVals[OutsIndex];
4821 ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
4822 // Skip inalloca/preallocated arguments. They don't require any work.
4823 if (Flags.isInAlloca() || Flags.isPreallocated())
4824 continue;
4825 // Create frame index.
4826 int32_t Offset = VA.getLocMemOffset()+FPDiff;
4827 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
4828 FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
4829 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
4830
4831 if (Flags.isByVal()) {
4832 // Copy relative to framepointer.
4833 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
4834 if (!StackPtr.getNode())
4835 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
4836 getPointerTy(DAG.getDataLayout()));
4837 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
4838 StackPtr, Source);
4839
4840 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
4841 ArgChain,
4842 Flags, DAG, dl));
4843 } else {
4844 // Store relative to framepointer.
4845 MemOpChains2.push_back(DAG.getStore(
4846 ArgChain, dl, Arg, FIN,
4847 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
4848 }
4849 }
4850
4851 if (!MemOpChains2.empty())
4852 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
4853
4854 // Store the return address to the appropriate stack slot.
4855 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
4856 getPointerTy(DAG.getDataLayout()),
4857 RegInfo->getSlotSize(), FPDiff, dl);
4858 }
4859
4860 // Build a sequence of copy-to-reg nodes chained together with token chain
4861 // and glue operands which copy the outgoing args into registers.
4862 SDValue InGlue;
4863 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
4864 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
4865 RegsToPass[i].second, InGlue);
4866 InGlue = Chain.getValue(1);
4867 }
4868
4869 if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
4870    assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
4871 // In the 64-bit large code model, we have to make all calls
4872 // through a register, since the call instruction's 32-bit
4873 // pc-relative offset may not be large enough to hold the whole
4874 // address.
4875 } else if (Callee->getOpcode() == ISD::GlobalAddress ||
4876 Callee->getOpcode() == ISD::ExternalSymbol) {
4877 // Lower direct calls to global addresses and external symbols. Setting
4878 // ForCall to true here has the effect of removing WrapperRIP when possible
4879 // to allow direct calls to be selected without first materializing the
4880 // address into a register.
4881 Callee = LowerGlobalOrExternal(Callee, DAG, /*ForCall=*/true);
4882 } else if (Subtarget.isTarget64BitILP32() &&
4883 Callee.getValueType() == MVT::i32) {
4884 // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
4885 Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
4886 }
4887
4888 // Returns a chain & a glue for retval copy to use.
4889 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
4890 SmallVector<SDValue, 8> Ops;
4891
4892 if (!IsSibcall && isTailCall && !IsMustTail) {
4893 Chain = DAG.getCALLSEQ_END(Chain, NumBytesToPop, 0, InGlue, dl);
4894 InGlue = Chain.getValue(1);
4895 }
4896
4897 Ops.push_back(Chain);
4898 Ops.push_back(Callee);
4899
4900 if (isTailCall)
4901 Ops.push_back(DAG.getTargetConstant(FPDiff, dl, MVT::i32));
4902
4903 // Add argument registers to the end of the list so that they are known live
4904 // into the call.
4905 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
4906 Ops.push_back(DAG.getRegister(RegsToPass[i].first,
4907 RegsToPass[i].second.getValueType()));
4908
4909 // Add a register mask operand representing the call-preserved registers.
4910 const uint32_t *Mask = [&]() {
4911 auto AdaptedCC = CallConv;
4912 // If HasNCSR is asserted (attribute NoCallerSavedRegisters exists),
4913 // use X86_INTR calling convention because it has the same CSR mask
4914 // (same preserved registers).
4915 if (HasNCSR)
4916 AdaptedCC = (CallingConv::ID)CallingConv::X86_INTR;
4917    // If NoCalleeSavedRegisters is requested, then use GHC since it happens
4918 // to use the CSR_NoRegs_RegMask.
4919 if (CB && CB->hasFnAttr("no_callee_saved_registers"))
4920 AdaptedCC = (CallingConv::ID)CallingConv::GHC;
4921 return RegInfo->getCallPreservedMask(MF, AdaptedCC);
4922 }();
4923  assert(Mask && "Missing call preserved mask for calling convention");
4924
4925 // If this is an invoke in a 32-bit function using a funclet-based
4926 // personality, assume the function clobbers all registers. If an exception
4927 // is thrown, the runtime will not restore CSRs.
4928 // FIXME: Model this more precisely so that we can register allocate across
4929 // the normal edge and spill and fill across the exceptional edge.
4930 if (!Is64Bit && CLI.CB && isa<InvokeInst>(CLI.CB)) {
4931 const Function &CallerFn = MF.getFunction();
4932 EHPersonality Pers =
4933 CallerFn.hasPersonalityFn()
4934 ? classifyEHPersonality(CallerFn.getPersonalityFn())
4935 : EHPersonality::Unknown;
4936 if (isFuncletEHPersonality(Pers))
4937 Mask = RegInfo->getNoPreservedMask();
4938 }
4939
4940 // Define a new register mask from the existing mask.
4941 uint32_t *RegMask = nullptr;
4942
4943 // In some calling conventions we need to remove the used physical registers
4944 // from the reg mask. Create a new RegMask for such calling conventions.
4945 // RegMask for calling conventions that disable only return registers (e.g.
4946 // preserve_most) will be modified later in LowerCallResult.
4947 bool ShouldDisableArgRegs = shouldDisableArgRegFromCSR(CallConv) || HasNCSR;
4948 if (ShouldDisableArgRegs || shouldDisableRetRegFromCSR(CallConv)) {
4949 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
4950
4951 // Allocate a new Reg Mask and copy Mask.
4952 RegMask = MF.allocateRegMask();
4953 unsigned RegMaskSize = MachineOperand::getRegMaskSize(TRI->getNumRegs());
4954 memcpy(RegMask, Mask, sizeof(RegMask[0]) * RegMaskSize);
4955
4956 // Make sure all sub registers of the argument registers are reset
4957 // in the RegMask.
4958 if (ShouldDisableArgRegs) {
4959 for (auto const &RegPair : RegsToPass)
4960 for (MCPhysReg SubReg : TRI->subregs_inclusive(RegPair.first))
4961 RegMask[SubReg / 32] &= ~(1u << (SubReg % 32));
4962 }
4963
4964 // Create the RegMask Operand according to our updated mask.
4965 Ops.push_back(DAG.getRegisterMask(RegMask));
4966 } else {
4967 // Create the RegMask Operand according to the static mask.
4968 Ops.push_back(DAG.getRegisterMask(Mask));
4969 }
4970
4971 if (InGlue.getNode())
4972 Ops.push_back(InGlue);
4973
4974 if (isTailCall) {
4975 // We used to do:
4976 //// If this is the first return lowered for this function, add the regs
4977 //// to the liveout set for the function.
4978 // This isn't right, although it's probably harmless on x86; liveouts
4979 // should be computed from returns not tail calls. Consider a void
4980 // function making a tail call to a function returning int.
4981 MF.getFrameInfo().setHasTailCall();
4982 SDValue Ret = DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
4983
4984 if (IsCFICall)
4985 Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue());
4986
4987 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
4988 return Ret;
4989 }
4990
4991 if (HasNoCfCheck && IsCFProtectionSupported && IsIndirectCall) {
4992 Chain = DAG.getNode(X86ISD::NT_CALL, dl, NodeTys, Ops);
4993 } else if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
4994 // Calls with a "clang.arc.attachedcall" bundle are special. They should be
4995 // expanded to the call, directly followed by a special marker sequence and
4996 // a call to a ObjC library function. Use the CALL_RVMARKER to do that.
4997    assert(!isTailCall &&
4998           "tail calls cannot be marked with clang.arc.attachedcall");
4999    assert(Is64Bit && "clang.arc.attachedcall is only supported in 64bit mode");
5000
5001 // Add a target global address for the retainRV/claimRV runtime function
5002 // just before the call target.
5003 Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB);
5004 auto PtrVT = getPointerTy(DAG.getDataLayout());
5005 auto GA = DAG.getTargetGlobalAddress(ARCFn, dl, PtrVT);
5006 Ops.insert(Ops.begin() + 1, GA);
5007 Chain = DAG.getNode(X86ISD::CALL_RVMARKER, dl, NodeTys, Ops);
5008 } else {
5009 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
5010 }
5011
5012 if (IsCFICall)
5013 Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());
5014
5015 InGlue = Chain.getValue(1);
5016 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
5017 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
5018
5019 // Save heapallocsite metadata.
5020 if (CLI.CB)
5021 if (MDNode *HeapAlloc = CLI.CB->getMetadata("heapallocsite"))
5022 DAG.addHeapAllocSite(Chain.getNode(), HeapAlloc);
5023
5024 // Create the CALLSEQ_END node.
5025 unsigned NumBytesForCalleeToPop = 0; // Callee pops nothing.
5026 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
5027 DAG.getTarget().Options.GuaranteedTailCallOpt))
5028 NumBytesForCalleeToPop = NumBytes; // Callee pops everything
5029 else if (!canGuaranteeTCO(CallConv) && IsCalleePopSRet)
5030 // If this call passes a struct-return pointer, the callee
5031 // pops that struct pointer.
5032 NumBytesForCalleeToPop = 4;
5033
5034 // Returns a glue for retval copy to use.
5035 if (!IsSibcall) {
5036 Chain = DAG.getCALLSEQ_END(Chain, NumBytesToPop, NumBytesForCalleeToPop,
5037 InGlue, dl);
5038 InGlue = Chain.getValue(1);
5039 }
5040
5041 // Handle result values, copying them out of physregs into vregs that we
5042 // return.
5043 return LowerCallResult(Chain, InGlue, CallConv, isVarArg, Ins, dl, DAG,
5044 InVals, RegMask);
5045}
5046
5047//===----------------------------------------------------------------------===//
5048// Fast Calling Convention (tail call) implementation
5049//===----------------------------------------------------------------------===//
5050
5051 // Like stdcall (the callee cleans up the arguments), except that ECX is
5052 // reserved for storing the tail-called function address. Only 2 registers are
5053 // free for argument passing (inreg). Tail call optimization is performed
5054 // provided:
5055 // * tailcallopt is enabled
5056 // * caller/callee are fastcc
5057 // On the X86_64 architecture with GOT-style position-independent code, only
5058 // local (within-module) calls are supported at the moment.
5059 // To keep the stack aligned according to the platform ABI, the function
5060 // GetAlignedArgumentStackSize ensures that the argument delta is always a
5061 // multiple of the stack alignment. (Dynamic linkers need this - Darwin's dyld
5062 // for example.) If a tail-called callee has more arguments than the caller,
5063 // the caller needs to make sure that there is room to move the RETADDR to.
5064 // This is achieved by reserving an area the size of the argument delta right
5065 // after the original RETADDR, but before the saved frame pointer or the
5066 // spilled registers, e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4)
5067// stack layout:
5068// arg1
5069// arg2
5070// RETADDR
5071// [ new RETADDR
5072// move area ]
5073// (possible EBP)
5074// ESI
5075// EDI
5076// local1 ..
5077
5078 /// Align the stack size, e.g. to 16n + 12 for a 16-byte alignment
5079 /// requirement.
5080unsigned
5081X86TargetLowering::GetAlignedArgumentStackSize(const unsigned StackSize,
5082 SelectionDAG &DAG) const {
5083 const Align StackAlignment = Subtarget.getFrameLowering()->getStackAlign();
5084 const uint64_t SlotSize = Subtarget.getRegisterInfo()->getSlotSize();
5085  assert(StackSize % SlotSize == 0 &&
5086         "StackSize must be a multiple of SlotSize");
5087 return alignTo(StackSize + SlotSize, StackAlignment) - SlotSize;
5088}
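
A minimal standalone sketch of the rounding formula above, assuming a 32-bit target (SlotSize == 4) and a 16-byte stack alignment; alignedArgStackSize and its local alignTo are illustrative names, not LLVM APIs:

    #include <cassert>
    #include <cstdint>

    static uint64_t alignedArgStackSize(uint64_t StackSize, uint64_t SlotSize,
                                        uint64_t StackAlignment) {
      // alignTo(x, a) rounds x up to the next multiple of a.
      auto alignTo = [](uint64_t X, uint64_t A) { return (X + A - 1) / A * A; };
      return alignTo(StackSize + SlotSize, StackAlignment) - SlotSize;
    }

    int main() {
      // With SlotSize == 4 and 16-byte alignment the result is always of the
      // form 16n + 12, matching the doc comment above.
      assert(alignedArgStackSize(0, 4, 16) == 12);
      assert(alignedArgStackSize(16, 4, 16) == 28);
      assert(alignedArgStackSize(20, 4, 16) == 28);
      return 0;
    }
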
5089
5090/// Return true if the given stack call argument is already available in the
5091/// same position (relatively) of the caller's incoming argument stack.
5092static
5093bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
5094 MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
5095 const X86InstrInfo *TII, const CCValAssign &VA) {
5096 unsigned Bytes = Arg.getValueSizeInBits() / 8;
5097
5098 for (;;) {
5099 // Look through nodes that don't alter the bits of the incoming value.
5100 unsigned Op = Arg.getOpcode();
5101 if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
5102 Arg = Arg.getOperand(0);
5103 continue;
5104 }
5105 if (Op == ISD::TRUNCATE) {
5106 const SDValue &TruncInput = Arg.getOperand(0);
5107 if (TruncInput.getOpcode() == ISD::AssertZext &&
5108 cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
5109 Arg.getValueType()) {
5110 Arg = TruncInput.getOperand(0);
5111 continue;
5112 }
5113 }
5114 break;
5115 }
5116
5117  int FI = INT_MAX;
5118 if (Arg.getOpcode() == ISD::CopyFromReg) {
5119 Register VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
5120 if (!VR.isVirtual())
5121 return false;
5122 MachineInstr *Def = MRI->getVRegDef(VR);
5123 if (!Def)
5124 return false;
5125 if (!Flags.isByVal()) {
5126 if (!TII->isLoadFromStackSlot(*Def, FI))
5127 return false;
5128 } else {
5129 unsigned Opcode = Def->getOpcode();
5130 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
5131 Opcode == X86::LEA64_32r) &&
5132 Def->getOperand(1).isFI()) {
5133 FI = Def->getOperand(1).getIndex();
5134 Bytes = Flags.getByValSize();
5135 } else
5136 return false;
5137 }
5138 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
5139 if (Flags.isByVal())
5140 // ByVal argument is passed in as a pointer but it's now being
5141 // dereferenced. e.g.
5142 // define @foo(%struct.X* %A) {
5143 // tail call @bar(%struct.X* byval %A)
5144 // }
5145 return false;
5146 SDValue Ptr = Ld->getBasePtr();
5147 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
5148 if (!FINode)
5149 return false;
5150 FI = FINode->getIndex();
5151 } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
5152 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
5153 FI = FINode->getIndex();
5154 Bytes = Flags.getByValSize();
5155 } else
5156 return false;
5157
5158  assert(FI != INT_MAX);
5159 if (!MFI.isFixedObjectIndex(FI))
5160 return false;
5161
5162 if (Offset != MFI.getObjectOffset(FI))
5163 return false;
5164
5165 // If this is not byval, check that the argument stack object is immutable.
5166 // inalloca and argument copy elision can create mutable argument stack
5167 // objects. Byval objects can be mutated, but a byval call intends to pass the
5168 // mutated memory.
5169 if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
5170 return false;
5171
5172 if (VA.getLocVT().getFixedSizeInBits() >
5173 Arg.getValueSizeInBits().getFixedValue()) {
5174 // If the argument location is wider than the argument type, check that any
5175 // extension flags match.
5176 if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
5177 Flags.isSExt() != MFI.isObjectSExt(FI)) {
5178 return false;
5179 }
5180 }
5181
5182 return Bytes == MFI.getObjectSize(FI);
5183}
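
A reduced sketch of the idea behind MatchingStackOffset, written over a hypothetical FixedStackObject record rather than the real MachineFrameInfo/SelectionDAG types: an outgoing stack argument needs no copy only if it already lives in a caller fixed stack object at the same offset, with the same size, and (for non-byval arguments) the object is immutable.

    #include <cassert>
    #include <cstdint>

    struct FixedStackObject {   // hypothetical stand-in for MachineFrameInfo data
      int64_t Offset;
      uint64_t SizeInBytes;
      bool IsImmutable;
    };

    static bool argAlreadyInPlace(const FixedStackObject &Obj, int64_t WantedOffset,
                                  uint64_t ArgBytes, bool IsByVal) {
      if (Obj.Offset != WantedOffset)
        return false;
      if (!IsByVal && !Obj.IsImmutable)   // mutable incoming slots may be clobbered
        return false;
      return Obj.SizeInBytes == ArgBytes; // size must match exactly
    }

    int main() {
      FixedStackObject Incoming{/*Offset=*/8, /*SizeInBytes=*/4, /*IsImmutable=*/true};
      assert(argAlreadyInPlace(Incoming, 8, 4, /*IsByVal=*/false));   // reusable as-is
      assert(!argAlreadyInPlace(Incoming, 12, 4, /*IsByVal=*/false)); // different slot
      return 0;
    }
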
5184
5185/// Check whether the call is eligible for tail call optimization. Targets
5186/// that want to do tail call optimization should implement this function.
5187bool X86TargetLowering::IsEligibleForTailCallOptimization(
5188 SDValue Callee, CallingConv::ID CalleeCC, bool IsCalleePopSRet,
5189 bool isVarArg, Type *RetTy, const SmallVectorImpl<ISD::OutputArg> &Outs,
5190 const SmallVectorImpl<SDValue> &OutVals,
5191 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
5192 if (!mayTailCallThisCC(CalleeCC))
5193 return false;
5194
5195 // If -tailcallopt is specified, make fastcc functions tail-callable.
5196 MachineFunction &MF = DAG.getMachineFunction();
5197 const Function &CallerF = MF.getFunction();
5198
5199 // If the function return type is x86_fp80 and the callee return type is not,
5200 // then the FP_EXTEND of the call result is not a nop. It's not safe to
5201 // perform a tailcall optimization here.
5202 if (CallerF.getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
5203 return false;
5204
5205 CallingConv::ID CallerCC = CallerF.getCallingConv();
5206 bool CCMatch = CallerCC == CalleeCC;
5207 bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
5208 bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
5209 bool IsGuaranteeTCO = DAG.getTarget().Options.GuaranteedTailCallOpt ||
5210 CalleeCC == CallingConv::Tail || CalleeCC == CallingConv::SwiftTail;
5211
5212 // Win64 functions have extra shadow space for argument homing. Don't do the
5213 // sibcall if the caller and callee have mismatched expectations for this
5214 // space.
5215 if (IsCalleeWin64 != IsCallerWin64)
5216 return false;
5217
5218 if (IsGuaranteeTCO) {
5219 if (canGuaranteeTCO(CalleeCC) && CCMatch)
5220 return true;
5221 return false;
5222 }
5223
5224 // Look for obvious safe cases to perform tail call optimization that do not
5225 // require ABI changes. This is what gcc calls sibcall.
5226
5227 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
5228 // emit a special epilogue.
5229 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
5230 if (RegInfo->hasStackRealignment(MF))
5231 return false;
5232
5233 // Also avoid sibcall optimization if we're an sret return fn and the callee
5234 // is incompatible. See comment in LowerReturn about why hasStructRetAttr is
5235 // insufficient.
5236 if (MF.getInfo<X86MachineFunctionInfo>()->getSRetReturnReg()) {
5237 // For a compatible tail call the callee must return our sret pointer. So it
5238 // needs to be (a) an sret function itself and (b) we pass our sret as its
5239 // sret. Condition #b is harder to determine.
5240 return false;
5241 } else if (IsCalleePopSRet)
5242 // The callee pops an sret, so we cannot tail-call, as our caller doesn't
5243 // expect that.
5244 return false;
5245
5246 // Do not sibcall optimize vararg calls unless all arguments are passed via
5247 // registers.
5248 LLVMContext &C = *DAG.getContext();
5249 if (isVarArg && !Outs.empty()) {
5250 // Optimizing for varargs on Win64 is unlikely to be safe without
5251 // additional testing.
5252 if (IsCalleeWin64 || IsCallerWin64)
5253 return false;
5254
5255 SmallVector<CCValAssign, 16> ArgLocs;
5256 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
5257
5258 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
5259 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
5260 if (!ArgLocs[i].isRegLoc())
5261 return false;
5262 }
5263
5264 // If the call result is in ST0 / ST1, it needs to be popped off the x87
5265 // stack. Therefore, if it's not used by the call it is not safe to optimize
5266 // this into a sibcall.
5267 bool Unused = false;
5268 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
5269 if (!Ins[i].Used) {
5270 Unused = true;
5271 break;
5272 }
5273 }
5274 if (Unused) {
5275 SmallVector<CCValAssign, 16> RVLocs;
5276 CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
5277 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
5278 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
5279 CCValAssign &VA = RVLocs[i];
5280 if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
5281 return false;
5282 }
5283 }
5284
5285 // Check that the call results are passed in the same way.
5286 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
5287 RetCC_X86, RetCC_X86))
5288 return false;
5289 // The callee has to preserve all registers the caller needs to preserve.
5290 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
5291 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
5292 if (!CCMatch) {
5293 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
5294 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
5295 return false;
5296 }
5297
5298 unsigned StackArgsSize = 0;
5299
5300 // If the callee takes no arguments then go on to check the results of the
5301 // call.
5302 if (!Outs.empty()) {
5303 // Check if stack adjustment is needed. For now, do not do this if any
5304 // argument is passed on the stack.
5305 SmallVector<CCValAssign, 16> ArgLocs;
5306 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
5307
5308 // Allocate shadow area for Win64
5309 if (IsCalleeWin64)
5310 CCInfo.AllocateStack(32, Align(8));
5311
5312 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
5313 StackArgsSize = CCInfo.getNextStackOffset();
5314
5315 if (CCInfo.getNextStackOffset()) {
5316 // Check if the arguments are already laid out in the right way as
5317 // the caller's fixed stack objects.
5318 MachineFrameInfo &MFI = MF.getFrameInfo();
5319 const MachineRegisterInfo *MRI = &MF.getRegInfo();
5320 const X86InstrInfo *TII = Subtarget.getInstrInfo();
5321 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
5322 CCValAssign &VA = ArgLocs[i];
5323 SDValue Arg = OutVals[i];
5324 ISD::ArgFlagsTy Flags = Outs[i].Flags;
5325 if (VA.getLocInfo() == CCValAssign::Indirect)
5326 return false;
5327 if (!VA.isRegLoc()) {
5328 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
5329 MFI, MRI, TII, VA))
5330 return false;
5331 }
5332 }
5333 }
5334
5335 bool PositionIndependent = isPositionIndependent();
5336 // If the tailcall address may be in a register, then make sure it's
5337 // possible to register allocate for it. In 32-bit, the call address can
5338 // only target EAX, EDX, or ECX since the tail call must be scheduled after
5339 // callee-saved registers are restored. These happen to be the same
5340 // registers used to pass 'inreg' arguments so watch out for those.
5341 if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
5342 !isa<ExternalSymbolSDNode>(Callee)) ||
5343 PositionIndependent)) {
5344 unsigned NumInRegs = 0;
5345 // In PIC we need an extra register to formulate the address computation
5346 // for the callee.
5347 unsigned MaxInRegs = PositionIndependent ? 2 : 3;
5348
5349 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
5350 CCValAssign &VA = ArgLocs[i];
5351 if (!VA.isRegLoc())
5352 continue;
5353 Register Reg = VA.getLocReg();
5354 switch (Reg) {
5355 default: break;
5356 case X86::EAX: case X86::EDX: case X86::ECX:
5357 if (++NumInRegs == MaxInRegs)
5358 return false;
5359 break;
5360 }
5361 }
5362 }
5363
5364 const MachineRegisterInfo &MRI = MF.getRegInfo();
5365 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
5366 return false;
5367 }
5368
5369 bool CalleeWillPop =
5370 X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
5371 MF.getTarget().Options.GuaranteedTailCallOpt);
5372
5373 if (unsigned BytesToPop =
5374 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
5375 // If we have bytes to pop, the callee must pop them.
5376 bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
5377 if (!CalleePopMatches)
5378 return false;
5379 } else if (CalleeWillPop && StackArgsSize > 0) {
5380 // If we don't have bytes to pop, make sure the callee doesn't pop any.
5381 return false;
5382 }
5383
5384 return true;
5385}
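
A reduced sketch of the final callee-pop compatibility check above, with illustrative parameter names rather than the LLVM ones: if the caller must pop bytes on return, the callee must pop exactly that many; if the caller pops nothing, a callee-pop callee must not pop anything either.

    #include <cassert>

    static bool calleePopCompatible(bool CalleeWillPop, unsigned BytesCallerPops,
                                    unsigned StackArgsSize) {
      if (BytesCallerPops != 0)                       // caller must pop on return
        return CalleeWillPop && BytesCallerPops == StackArgsSize;
      return !(CalleeWillPop && StackArgsSize > 0);   // otherwise callee must pop nothing
    }

    int main() {
      assert(calleePopCompatible(true, 8, 8));    // matching callee-pop amounts
      assert(!calleePopCompatible(false, 8, 8));  // caller pops, callee doesn't
      assert(!calleePopCompatible(true, 0, 4));   // callee would pop unexpectedly
      return 0;
    }
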
5386
5387FastISel *
5388X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
5389 const TargetLibraryInfo *libInfo) const {
5390 return X86::createFastISel(funcInfo, libInfo);
5391}
5392
5393//===----------------------------------------------------------------------===//
5394// Other Lowering Hooks
5395//===----------------------------------------------------------------------===//
5396
5397bool X86::mayFoldLoad(SDValue Op, const X86Subtarget &Subtarget,
5398 bool AssumeSingleUse) {
5399 if (!AssumeSingleUse && !Op.hasOneUse())
5400 return false;
5401 if (!ISD::isNormalLoad(Op.getNode()))
5402 return false;
5403
5404 // If this is an unaligned vector, make sure the target supports folding it.
5405 auto *Ld = cast<LoadSDNode>(Op.getNode());
5406 if (!Subtarget.hasAVX() && !Subtarget.hasSSEUnalignedMem() &&
5407 Ld->getValueSizeInBits(0) == 128 && Ld->getAlign() < Align(16))
5408 return false;
5409
5410 // TODO: If this is a non-temporal load and the target has an instruction
5411 // for it, it should not be folded. See "useNonTemporalLoad()".
5412
5413 return true;
5414}
5415
5416bool X86::mayFoldLoadIntoBroadcastFromMem(SDValue Op, MVT EltVT,
5417 const X86Subtarget &Subtarget,
5418 bool AssumeSingleUse) {
5419  assert(Subtarget.hasAVX() && "Expected AVX for broadcast from memory");
5420 if (!X86::mayFoldLoad(Op, Subtarget, AssumeSingleUse))
5421 return false;
5422
5423  // We cannot replace a wide volatile load with a broadcast-from-memory,
5424 // because that would narrow the load, which isn't legal for volatiles.
5425 auto *Ld = cast<LoadSDNode>(Op.getNode());
5426 return !Ld->isVolatile() ||
5427 Ld->getValueSizeInBits(0) == EltVT.getScalarSizeInBits();
5428}
5429
5430bool X86::mayFoldIntoStore(SDValue Op) {
5431 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
5432}
5433
5434bool X86::mayFoldIntoZeroExtend(SDValue Op) {
5435 if (Op.hasOneUse()) {
5436 unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
5437 return (ISD::ZERO_EXTEND == Opcode);
5438 }
5439 return false;
5440}
5441
5442static bool isTargetShuffle(unsigned Opcode) {
5443 switch(Opcode) {
5444 default: return false;
5445 case X86ISD::BLENDI:
5446 case X86ISD::PSHUFB:
5447 case X86ISD::PSHUFD:
5448 case X86ISD::PSHUFHW:
5449 case X86ISD::PSHUFLW:
5450 case X86ISD::SHUFP:
5451 case X86ISD::INSERTPS:
5452 case X86ISD::EXTRQI:
5453 case X86ISD::INSERTQI:
5454 case X86ISD::VALIGN:
5455 case X86ISD::PALIGNR:
5456 case X86ISD::VSHLDQ:
5457 case X86ISD::VSRLDQ:
5458 case X86ISD::MOVLHPS:
5459 case X86ISD::MOVHLPS:
5460 case X86ISD::MOVSHDUP:
5461 case X86ISD::MOVSLDUP:
5462 case X86ISD::MOVDDUP:
5463 case X86ISD::MOVSS:
5464 case X86ISD::MOVSD:
5465 case X86ISD::MOVSH:
5466 case X86ISD::UNPCKL:
5467 case X86ISD::UNPCKH:
5468 case X86ISD::VBROADCAST:
5469 case X86ISD::VPERMILPI:
5470 case X86ISD::VPERMILPV:
5471 case X86ISD::VPERM2X128:
5472 case X86ISD::SHUF128:
5473 case X86ISD::VPERMIL2:
5474 case X86ISD::VPERMI:
5475 case X86ISD::VPPERM:
5476 case X86ISD::VPERMV:
5477 case X86ISD::VPERMV3:
5478 case X86ISD::VZEXT_MOVL:
5479 return true;
5480 }
5481}
5482
5483static bool isTargetShuffleVariableMask(unsigned Opcode) {
5484 switch (Opcode) {
5485 default: return false;
5486 // Target Shuffles.
5487 case X86ISD::PSHUFB:
5488 case X86ISD::VPERMILPV:
5489 case X86ISD::VPERMIL2:
5490 case X86ISD::VPPERM:
5491 case X86ISD::VPERMV:
5492 case X86ISD::VPERMV3:
5493 return true;
5494 // 'Faux' Target Shuffles.
5495 case ISD::OR:
5496 case ISD::AND:
5497 case X86ISD::ANDNP:
5498 return true;
5499 }
5500}
5501
5502SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
5503 MachineFunction &MF = DAG.getMachineFunction();
5504 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
5505 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
5506 int ReturnAddrIndex = FuncInfo->getRAIndex();
5507
5508 if (ReturnAddrIndex == 0) {
5509 // Set up a frame object for the return address.
5510 unsigned SlotSize = RegInfo->getSlotSize();
5511 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
5512 -(int64_t)SlotSize,
5513 false);
5514 FuncInfo->setRAIndex(ReturnAddrIndex);
5515 }
5516
5517 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
5518}
5519
5520bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
5521 bool hasSymbolicDisplacement) {
5522 // Offset should fit into 32 bit immediate field.
5523 if (!isInt<32>(Offset))
5524 return false;
5525
5526 // If we don't have a symbolic displacement - we don't have any extra
5527 // restrictions.
5528 if (!hasSymbolicDisplacement)
5529 return true;
5530
5531 // FIXME: Some tweaks might be needed for medium code model.
5532 if (M != CodeModel::Small && M != CodeModel::Kernel)
5533 return false;
5534
5535  // For the small code model we assume that the latest object is 16MB before the
5536  // end of the 31-bit boundary. We may also accept pretty large negative constants,
5537  // knowing that all objects are in the positive half of the address space.
5538 if (M == CodeModel::Small && Offset < 16*1024*1024)
5539 return true;
5540
5541  // For the kernel code model we know that all objects reside in the negative half
5542  // of the 32-bit address space. We must not accept negative offsets, since they
5543  // may be just out of range, while we may accept pretty large positive ones.
5544 if (M == CodeModel::Kernel && Offset >= 0)
5545 return true;
5546
5547 return false;
5548}
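
A standalone sketch with worked values for the two code-model rules above, assuming a symbolic displacement is present; isInt<32> is re-implemented locally and the helper names and numbers are illustrative, not LLVM APIs:

    #include <cassert>
    #include <cstdint>

    static bool fitsSmallModel(int64_t Offset) {
      bool IsInt32 = Offset >= INT32_MIN && Offset <= INT32_MAX;
      return IsInt32 && Offset < 16 * 1024 * 1024;   // small code model rule
    }

    static bool fitsKernelModel(int64_t Offset) {
      bool IsInt32 = Offset >= INT32_MIN && Offset <= INT32_MAX;
      return IsInt32 && Offset >= 0;                 // kernel code model rule
    }

    int main() {
      assert(fitsSmallModel(15 * 1024 * 1024));      // below the 16MB guard band
      assert(!fitsSmallModel(32 * 1024 * 1024));     // at or above the 16MB limit
      assert(fitsKernelModel(0) && !fitsKernelModel(-8));
      return 0;
    }
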
5549
5550/// Determines whether the callee is required to pop its own arguments.
5551/// Callee pop is necessary to support tail calls.
5552bool X86::isCalleePop(CallingConv::ID CallingConv,
5553 bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
5554 // If GuaranteeTCO is true, we force some calls to be callee pop so that we
5555 // can guarantee TCO.
5556 if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
5557 return true;
5558
5559 switch (CallingConv) {
5560 default:
5561 return false;
5562 case CallingConv::X86_StdCall:
5563 case CallingConv::X86_FastCall:
5564 case CallingConv::X86_ThisCall:
5565 case CallingConv::X86_VectorCall:
5566 return !is64Bit;
5567 }
5568}
5569
5570 /// Return true if the condition is a signed comparison operation.
5571static bool isX86CCSigned(unsigned X86CC) {
5572 switch (X86CC) {
5573 default:
5574    llvm_unreachable("Invalid integer condition!");
5575 case X86::COND_E:
5576 case X86::COND_NE:
5577 case X86::COND_B:
5578 case X86::COND_A:
5579 case X86::COND_BE:
5580 case X86::COND_AE:
5581 return false;
5582 case X86::COND_G:
5583 case X86::COND_GE:
5584 case X86::COND_L:
5585 case X86::COND_LE:
5586 return true;
5587 }
5588}
5589
5590static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
5591 switch (SetCCOpcode) {
5592  default: llvm_unreachable("Invalid integer condition!");
5593 case ISD::SETEQ: return X86::COND_E;
5594 case ISD::SETGT: return X86::COND_G;
5595 case ISD::SETGE: return X86::COND_GE;
5596 case ISD::SETLT: return X86::COND_L;
5597 case ISD::SETLE: return X86::COND_LE;
5598 case ISD::SETNE: return X86::COND_NE;
5599 case ISD::SETULT: return X86::COND_B;
5600 case ISD::SETUGT: return X86::COND_A;
5601 case ISD::SETULE: return X86::COND_BE;
5602 case ISD::SETUGE: return X86::COND_AE;
5603 }
5604}
5605
5606/// Do a one-to-one translation of a ISD::CondCode to the X86-specific
5607/// condition code, returning the condition code and the LHS/RHS of the
5608/// comparison to make.
5609static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
5610 bool isFP, SDValue &LHS, SDValue &RHS,
5611 SelectionDAG &DAG) {
5612 if (!isFP) {
5613 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
5614 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnes()) {
5615 // X > -1 -> X == 0, jump !sign.
5616 RHS = DAG.getConstant(0, DL, RHS.getValueType());
5617 return X86::COND_NS;
5618 }
5619 if (SetCCOpcode == ISD::SETLT && RHSC->isZero()) {
5620 // X < 0 -> X == 0, jump on sign.
5621 return X86::COND_S;
5622 }
5623 if (SetCCOpcode == ISD::SETGE && RHSC->isZero()) {
5624 // X >= 0 -> X == 0, jump on !sign.
5625 return X86::COND_NS;
5626 }
5627 if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
5628 // X < 1 -> X <= 0
5629 RHS = DAG.getConstant(0, DL, RHS.getValueType());
5630 return X86::COND_LE;
5631 }
5632 }
5633
5634 return TranslateIntegerX86CC(SetCCOpcode);
5635 }
5636
5637 // First determine if it is required or is profitable to flip the operands.
5638
5639 // If LHS is a foldable load, but RHS is not, flip the condition.
5640 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
5641 !ISD::isNON_EXTLoad(RHS.getNode())) {
5642 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
5643 std::swap(LHS, RHS);
5644 }
5645
5646 switch (SetCCOpcode) {
5647 default: break;
5648 case ISD::SETOLT:
5649 case ISD::SETOLE:
5650 case ISD::SETUGT:
5651 case ISD::SETUGE:
5652 std::swap(LHS, RHS);
5653 break;
5654 }
5655
5656 // On a floating point condition, the flags are set as follows:
5657 // ZF PF CF op
5658 // 0 | 0 | 0 | X > Y
5659 // 0 | 0 | 1 | X < Y
5660 // 1 | 0 | 0 | X == Y
5661 // 1 | 1 | 1 | unordered
5662 switch (SetCCOpcode) {
5663  default: llvm_unreachable("Condcode should be pre-legalized away");
5664 case ISD::SETUEQ:
5665 case ISD::SETEQ: return X86::COND_E;
5666 case ISD::SETOLT: // flipped
5667 case ISD::SETOGT:
5668 case ISD::SETGT: return X86::COND_A;
5669 case ISD::SETOLE: // flipped
5670 case ISD::SETOGE:
5671 case ISD::SETGE: return X86::COND_AE;
5672 case ISD::SETUGT: // flipped
5673 case ISD::SETULT:
5674 case ISD::SETLT: return X86::COND_B;
5675 case ISD::SETUGE: // flipped
5676 case ISD::SETULE:
5677 case ISD::SETLE: return X86::COND_BE;
5678 case ISD::SETONE:
5679 case ISD::SETNE: return X86::COND_NE;
5680 case ISD::SETUO: return X86::COND_P;
5681 case ISD::SETO: return X86::COND_NP;
5682 case ISD::SETOEQ:
5683 case ISD::SETUNE: return X86::COND_INVALID;
5684 }
5685}
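
A standalone restatement of the four scalar-integer peepholes above; SetCC, X86Cond and foldIntSetCC are illustrative names, not LLVM types:

    #include <cassert>
    #include <cstdint>

    enum class SetCC { GT, GE, LT, Other };
    enum class X86Cond { NS, S, LE, NotSpecial };

    // Returns the special-cased X86 condition for "x <cc> RHSConst" and rewrites
    // RHSConst to 0 where the lowering above does the same.
    static X86Cond foldIntSetCC(SetCC CC, int64_t &RHSConst) {
      if (CC == SetCC::GT && RHSConst == -1) { RHSConst = 0; return X86Cond::NS; } // x > -1
      if (CC == SetCC::LT && RHSConst == 0)  {               return X86Cond::S;  } // x < 0
      if (CC == SetCC::GE && RHSConst == 0)  {               return X86Cond::NS; } // x >= 0
      if (CC == SetCC::LT && RHSConst == 1)  { RHSConst = 0; return X86Cond::LE; } // x < 1 -> x <= 0
      return X86Cond::NotSpecial;
    }

    int main() {
      int64_t RHS = -1;
      assert(foldIntSetCC(SetCC::GT, RHS) == X86Cond::NS && RHS == 0);
      RHS = 1;
      assert(foldIntSetCC(SetCC::LT, RHS) == X86Cond::LE && RHS == 0);
      return 0;
    }
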
5686
5687/// Is there a floating point cmov for the specific X86 condition code?
5688 /// The current x86 ISA includes the following FP cmov instructions:
5689 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
5690static bool hasFPCMov(unsigned X86CC) {
5691 switch (X86CC) {
5692 default:
5693 return false;
5694 case X86::COND_B:
5695 case X86::COND_BE:
5696 case X86::COND_E:
5697 case X86::COND_P:
5698 case X86::COND_A:
5699 case X86::COND_AE:
5700 case X86::COND_NE:
5701 case X86::COND_NP:
5702 return true;
5703 }
5704}
5705
5706static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT) {
5707 return Subtarget.hasVLX() || Subtarget.canExtendTo512DQ() ||
5708 VT.is512BitVector();
5709}
5710
5711bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
5712 const CallInst &I,
5713 MachineFunction &MF,
5714 unsigned Intrinsic) const {
5715 Info.flags = MachineMemOperand::MONone;
5716 Info.offset = 0;
5717
5718 const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
5719 if (!IntrData) {
5720 switch (Intrinsic) {
5721 case Intrinsic::x86_aesenc128kl:
5722 case Intrinsic::x86_aesdec128kl:
5723 Info.opc = ISD::INTRINSIC_W_CHAIN;
5724 Info.ptrVal = I.getArgOperand(1);
5725 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
5726 Info.align = Align(1);
5727 Info.flags |= MachineMemOperand::MOLoad;
5728 return true;
5729 case Intrinsic::x86_aesenc256kl:
5730 case Intrinsic::x86_aesdec256kl:
5731 Info.opc = ISD::INTRINSIC_W_CHAIN;
5732 Info.ptrVal = I.getArgOperand(1);
5733 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
5734 Info.align = Align(1);
5735 Info.flags |= MachineMemOperand::MOLoad;
5736 return true;
5737 case Intrinsic::x86_aesencwide128kl:
5738 case Intrinsic::x86_aesdecwide128kl:
5739 Info.opc = ISD::INTRINSIC_W_CHAIN;
5740 Info.ptrVal = I.getArgOperand(0);
5741 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
5742 Info.align = Align(1);
5743 Info.flags |= MachineMemOperand::MOLoad;
5744 return true;
5745 case Intrinsic::x86_aesencwide256kl:
5746 case Intrinsic::x86_aesdecwide256kl:
5747 Info.opc = ISD::INTRINSIC_W_CHAIN;
5748 Info.ptrVal = I.getArgOperand(0);
5749 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
5750 Info.align = Align(1);
5751 Info.flags |= MachineMemOperand::MOLoad;
5752 return true;
5753 case Intrinsic::x86_cmpccxadd32:
5754 case Intrinsic::x86_cmpccxadd64:
5755 case Intrinsic::x86_atomic_bts:
5756 case Intrinsic::x86_atomic_btc:
5757 case Intrinsic::x86_atomic_btr: {
5758 Info.opc = ISD::INTRINSIC_W_CHAIN;
5759 Info.ptrVal = I.getArgOperand(0);
5760 unsigned Size = I.getType()->getScalarSizeInBits();
5761 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
5762 Info.align = Align(Size);
5763 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
5764 MachineMemOperand::MOVolatile;
5765 return true;
5766 }
5767 case Intrinsic::x86_atomic_bts_rm:
5768 case Intrinsic::x86_atomic_btc_rm:
5769 case Intrinsic::x86_atomic_btr_rm: {
5770 Info.opc = ISD::INTRINSIC_W_CHAIN;
5771 Info.ptrVal = I.getArgOperand(0);
5772 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
5773 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
5774 Info.align = Align(Size);
5775 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
5776 MachineMemOperand::MOVolatile;
5777 return true;
5778 }
5779 case Intrinsic::x86_aadd32:
5780 case Intrinsic::x86_aadd64:
5781 case Intrinsic::x86_aand32:
5782 case Intrinsic::x86_aand64:
5783 case Intrinsic::x86_aor32:
5784 case Intrinsic::x86_aor64:
5785 case Intrinsic::x86_axor32:
5786 case Intrinsic::x86_axor64:
5787 case Intrinsic::x86_atomic_add_cc:
5788 case Intrinsic::x86_atomic_sub_cc:
5789 case Intrinsic::x86_atomic_or_cc:
5790 case Intrinsic::x86_atomic_and_cc:
5791 case Intrinsic::x86_atomic_xor_cc: {
5792 Info.opc = ISD::INTRINSIC_W_CHAIN;
5793 Info.ptrVal = I.getArgOperand(0);
5794 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
5795 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
5796 Info.align = Align(Size);
5797 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
5798 MachineMemOperand::MOVolatile;
5799 return true;
5800 }
5801 }
5802 return false;
5803 }
5804
5805 switch (IntrData->Type) {
5806 case TRUNCATE_TO_MEM_VI8:
5807 case TRUNCATE_TO_MEM_VI16:
5808 case TRUNCATE_TO_MEM_VI32: {
5809 Info.opc = ISD::INTRINSIC_VOID;
5810 Info.ptrVal = I.getArgOperand(0);
5811 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
5812 MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
5813 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
5814 ScalarVT = MVT::i8;
5815 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
5816 ScalarVT = MVT::i16;
5817 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
5818 ScalarVT = MVT::i32;
5819
5820 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
5821 Info.align = Align(1);
5822 Info.flags |= MachineMemOperand::MOStore;
5823 break;
5824 }
5825 case GATHER:
5826 case GATHER_AVX2: {
5827 Info.opc = ISD::INTRINSIC_W_CHAIN;
5828 Info.ptrVal = nullptr;
5829 MVT DataVT = MVT::getVT(I.getType());
5830 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
5831 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
5832 IndexVT.getVectorNumElements());
5833 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
5834 Info.align = Align(1);
5835 Info.flags |= MachineMemOperand::MOLoad;
5836 break;
5837 }
5838 case SCATTER: {
5839 Info.opc = ISD::INTRINSIC_VOID;
5840 Info.ptrVal = nullptr;
5841 MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
5842 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
5843 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
5844 IndexVT.getVectorNumElements());
5845 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
5846 Info.align = Align(1);
5847 Info.flags |= MachineMemOperand::MOStore;
5848 break;
5849 }
5850 default:
5851 return false;
5852 }
5853
5854 return true;
5855}
5856
5857/// Returns true if the target can instruction select the
5858/// specified FP immediate natively. If false, the legalizer will
5859/// materialize the FP immediate as a load from a constant pool.
5860bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
5861 bool ForCodeSize) const {
5862 for (const APFloat &FPImm : LegalFPImmediates)
5863 if (Imm.bitwiseIsEqual(FPImm))
5864 return true;
5865 return false;
5866}
5867
5868bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
5869 ISD::LoadExtType ExtTy,
5870 EVT NewVT) const {
5871  assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");
5872
5873 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
5874  // relocation must target a movq or addq instruction: don't let the load shrink.
5875 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
5876 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
5877 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
5878 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
5879
5880 // If this is an (1) AVX vector load with (2) multiple uses and (3) all of
5881 // those uses are extracted directly into a store, then the extract + store
5882 // can be store-folded. Therefore, it's probably not worth splitting the load.
5883 EVT VT = Load->getValueType(0);
5884 if ((VT.is256BitVector() || VT.is512BitVector()) && !Load->hasOneUse()) {
5885 for (auto UI = Load->use_begin(), UE = Load->use_end(); UI != UE; ++UI) {
5886 // Skip uses of the chain value. Result 0 of the node is the load value.
5887 if (UI.getUse().getResNo() != 0)
5888 continue;
5889
5890 // If this use is not an extract + store, it's probably worth splitting.
5891 if (UI->getOpcode() != ISD::EXTRACT_SUBVECTOR || !UI->hasOneUse() ||
5892 UI->use_begin()->getOpcode() != ISD::STORE)
5893 return true;
5894 }
5895 // All non-chain uses are extract + store.
5896 return false;
5897 }
5898
5899 return true;
5900}
5901
5902/// Returns true if it is beneficial to convert a load of a constant
5903/// to just the constant itself.
5904bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
5905 Type *Ty) const {
5906  assert(Ty->isIntegerTy());
5907
5908 unsigned BitSize = Ty->getPrimitiveSizeInBits();
5909 if (BitSize == 0 || BitSize > 64)
5910 return false;
5911 return true;
5912}
5913
5914bool X86TargetLowering::reduceSelectOfFPConstantLoads(EVT CmpOpVT) const {
5915 // If we are using XMM registers in the ABI and the condition of the select is
5916 // a floating-point compare and we have blendv or conditional move, then it is
5917 // cheaper to select instead of doing a cross-register move and creating a
5918 // load that depends on the compare result.
5919 bool IsFPSetCC = CmpOpVT.isFloatingPoint() && CmpOpVT != MVT::f128;
5920 return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX();
5921}
5922
5923bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const {
5924 // TODO: It might be a win to ease or lift this restriction, but the generic
5925 // folds in DAGCombiner conflict with vector folds for an AVX512 target.
5926 if (VT.isVector() && Subtarget.hasAVX512())
5927 return false;
5928
5929 return true;
5930}
5931
5932bool X86TargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
5933 SDValue C) const {
5934 // TODO: We handle scalars using custom code, but generic combining could make
5935 // that unnecessary.
5936 APInt MulC;
5937 if (!ISD::isConstantSplatVector(C.getNode(), MulC))
5938 return false;
5939
5940  // Find the type this will be legalized to. Otherwise we might prematurely
5941 // convert this to shl+add/sub and then still have to type legalize those ops.
5942 // Another choice would be to defer the decision for illegal types until
5943 // after type legalization. But constant splat vectors of i64 can't make it
5944 // through type legalization on 32-bit targets so we would need to special
5945 // case vXi64.
5946 while (getTypeAction(Context, VT) != TypeLegal)
5947 VT = getTypeToTransformTo(Context, VT);
5948
5949 // If vector multiply is legal, assume that's faster than shl + add/sub.
5950  // Multiply is a complex op with higher latency and lower throughput in
5951  // most implementations; sub-vXi32 vector multiplies are always fast,
5952  // vXi32 must not have a SlowMULLD implementation, and anything larger (vXi64)
5953  // is always going to be slow.
5954 unsigned EltSizeInBits = VT.getScalarSizeInBits();
5955 if (isOperationLegal(ISD::MUL, VT) && EltSizeInBits <= 32 &&
5956 (EltSizeInBits != 32 || !Subtarget.isPMULLDSlow()))
5957 return false;
5958
5959 // shl+add, shl+sub, shl+add+neg
5960 return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
5961 (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
5962}
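
Concrete instances of the "shl+add / shl+sub / shl+add+neg" decompositions the hook above is willing to trade a multiply for, checked with plain wrapping unsigned arithmetic (a standalone sketch, not LLVM code):

    #include <cassert>
    #include <cstdint>

    int main() {
      uint32_t x = 37;
      assert(x * 9u == (x << 3) + x);                            // MulC - 1 is a power of 2: shl+add
      assert(x * 7u == (x << 3) - x);                            // MulC + 1 is a power of 2: shl+sub
      assert(x * uint32_t(-7) == x - (x << 3));                  // 1 - MulC is a power of 2
      assert(x * uint32_t(-9) == uint32_t(0) - ((x << 3) + x));  // -(MulC + 1) is a power of 2: shl+add+neg
      return 0;
    }
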
5963
5964bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
5965 unsigned Index) const {
5966 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
5967 return false;
5968
5969 // Mask vectors support all subregister combinations and operations that
5970 // extract half of vector.
5971 if (ResVT.getVectorElementType() == MVT::i1)
5972 return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
5973 (Index == ResVT.getVectorNumElements()));
5974
5975 return (Index % ResVT.getVectorNumElements()) == 0;
5976}
5977
5978bool X86TargetLowering::shouldScalarizeBinop(SDValue VecOp) const {
5979 unsigned Opc = VecOp.getOpcode();
5980
5981 // Assume target opcodes can't be scalarized.
5982 // TODO - do we have any exceptions?
5983 if (Opc >= ISD::BUILTIN_OP_END)
5984 return false;
5985
5986 // If the vector op is not supported, try to convert to scalar.
5987 EVT VecVT = VecOp.getValueType();
5988 if (!isOperationLegalOrCustomOrPromote(Opc, VecVT))
5989 return true;
5990
5991 // If the vector op is supported, but the scalar op is not, the transform may
5992 // not be worthwhile.
5993 EVT ScalarVT = VecVT.getScalarType();
5994 return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
5995}
5996
5997bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT,
5998 bool) const {
5999 // TODO: Allow vectors?
6000 if (VT.isVector())
6001 return false;
6002 return VT.isSimple() || !isOperationExpand(Opcode, VT);
6003}
6004
6005bool X86TargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
6006 // Speculate cttz only if we can directly use TZCNT or can promote to i32.
6007 return Subtarget.hasBMI() ||
6008 (!Ty->isVectorTy() && Ty->getScalarSizeInBits() < 32);
6009}
6010
6011bool X86TargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
6012 // Speculate ctlz only if we can directly use LZCNT.
6013 return Subtarget.hasLZCNT();
6014}
6015
6016bool X86TargetLowering::ShouldShrinkFPConstant(EVT VT) const {
6017 // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
6018 // expensive than a straight movsd. On the other hand, it's important to
6019 // shrink long double fp constant since fldt is very slow.
6020 return !Subtarget.hasSSE2() || VT == MVT::f80;
6021}
6022
6023bool X86TargetLowering::isScalarFPTypeInSSEReg(EVT VT) const {
6024 return (VT == MVT::f64 && Subtarget.hasSSE2()) ||
6025 (VT == MVT::f32 && Subtarget.hasSSE1()) || VT == MVT::f16;
6026}
6027
6028bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
6029 const SelectionDAG &DAG,
6030 const MachineMemOperand &MMO) const {
6031 if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
6032 BitcastVT.getVectorElementType() == MVT::i1)
6033 return false;
6034
6035 if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
6036 return false;
6037
6038 // If both types are legal vectors, it's always ok to convert them.
6039 if (LoadVT.isVector() && BitcastVT.isVector() &&
6040 isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
6041 return true;
6042
6043 return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
6044}
6045
6046bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
6047 const MachineFunction &MF) const {
6048  // Do not merge to float value size (128 bits) if no implicit
6049 // float attribute is set.
6050 bool NoFloat = MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
6051
6052 if (NoFloat) {
6053 unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
6054 return (MemVT.getSizeInBits() <= MaxIntSize);
6055 }
6056 // Make sure we don't merge greater than our preferred vector
6057 // width.
6058 if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
6059 return false;
6060
6061 return true;
6062}
6063
6064bool X86TargetLowering::isCtlzFast() const {
6065 return Subtarget.hasFastLZCNT();
6066}
6067
6068bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
6069 const Instruction &AndI) const {
6070 return true;
6071}
6072
6073bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
6074 EVT VT = Y.getValueType();
6075
6076 if (VT.isVector())
6077 return false;
6078
6079 if (!Subtarget.hasBMI())
6080 return false;
6081
6082 // There are only 32-bit and 64-bit forms for 'andn'.
6083 if (VT != MVT::i32 && VT != MVT::i64)
6084 return false;
6085
6086 return !isa<ConstantSDNode>(Y);
6087}
6088
6089bool X86TargetLowering::hasAndNot(SDValue Y) const {
6090 EVT VT = Y.getValueType();
6091
6092 if (!VT.isVector())
6093 return hasAndNotCompare(Y);
6094
6095 // Vector.
6096
6097 if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
6098 return false;
6099
6100 if (VT == MVT::v4i32)
6101 return true;
6102
6103 return Subtarget.hasSSE2();
6104}
6105
6106bool X86TargetLowering::hasBitTest(SDValue X, SDValue Y) const {
6107 return X.getValueType().isScalarInteger(); // 'bt'
6108}
6109
6110bool X86TargetLowering::
6111 shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
6112 SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
6113 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
6114 SelectionDAG &DAG) const {
6115 // Does baseline recommend not to perform the fold by default?
6116 if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
6117 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
6118 return false;
6119 // For scalars this transform is always beneficial.
6120 if (X.getValueType().isScalarInteger())
6121 return true;
6122 // If all the shift amounts are identical, then transform is beneficial even
6123 // with rudimentary SSE2 shifts.
6124 if (DAG.isSplatValue(Y, /*AllowUndefs=*/true))
6125 return true;
6126  // If we have AVX2 with its powerful shift operations, then it's also good.
6127 if (Subtarget.hasAVX2())
6128 return true;
6129 // Pre-AVX2 vector codegen for this pattern is best for variant with 'shl'.
6130 return NewShiftOpcode == ISD::SHL;
6131}
6132
6133bool X86TargetLowering::preferScalarizeSplat(SDNode *N) const {
6134 return N->getOpcode() != ISD::FP_EXTEND;
6135}
6136
6137bool X86TargetLowering::shouldFoldConstantShiftPairToMask(
6138 const SDNode *N, CombineLevel Level) const {
6139  assert(((N->getOpcode() == ISD::SHL &&
6140           N->getOperand(0).getOpcode() == ISD::SRL) ||
6141          (N->getOpcode() == ISD::SRL &&
6142           N->getOperand(0).getOpcode() == ISD::SHL)) &&
6143         "Expected shift-shift mask");
6144 // TODO: Should we always create i64 masks? Or only folded immediates?
6145 EVT VT = N->getValueType(0);
6146 if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
6147 (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
6148 // Only fold if the shift values are equal - so it folds to AND.
6149 // TODO - we should fold if either is a non-uniform vector but we don't do
6150 // the fold for non-splats yet.
6151 return N->getOperand(1) == N->getOperand(0).getOperand(1);
6152 }
6153 return TargetLoweringBase::shouldFoldConstantShiftPairToMask(N, Level);
6154}
6155
6156bool X86TargetLowering::shouldFoldMaskToVariableShiftPair(SDValue Y) const {
6157 EVT VT = Y.getValueType();
6158
6159 // For vectors, we don't have a preference, but we probably want a mask.
6160 if (VT.isVector())
6161 return false;
6162
6163 // 64-bit shifts on 32-bit targets produce really bad bloated code.
6164 if (VT == MVT::i64 && !Subtarget.is64Bit())
6165 return false;
6166
6167 return true;
6168}
6169
6170TargetLowering::ShiftLegalizationStrategy
6171X86TargetLowering::preferredShiftLegalizationStrategy(
6172 SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const {
6173 if (DAG.getMachineFunction().getFunction().hasMinSize() &&
6174 !Subtarget.isOSWindows())
6175 return ShiftLegalizationStrategy::LowerToLibcall;
6176 return TargetLowering::preferredShiftLegalizationStrategy(DAG, N,
6177 ExpansionFactor);
6178}
6179
6180bool X86TargetLowering::shouldSplatInsEltVarIndex(EVT VT) const {
6181 // Any legal vector type can be splatted more efficiently than
6182 // loading/spilling from memory.
6183 return isTypeLegal(VT);
6184}
6185
6186MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
6187 MVT VT = MVT::getIntegerVT(NumBits);
6188 if (isTypeLegal(VT))
6189 return VT;
6190
6191 // PMOVMSKB can handle this.
6192 if (NumBits == 128 && isTypeLegal(MVT::v16i8))
6193 return MVT::v16i8;
6194
6195 // VPMOVMSKB can handle this.
6196 if (NumBits == 256 && isTypeLegal(MVT::v32i8))
6197 return MVT::v32i8;
6198
6199 // TODO: Allow 64-bit type for 32-bit target.
6200 // TODO: 512-bit types should be allowed, but make sure that those
6201 // cases are handled in combineVectorSizedSetCCEquality().
6202
6203 return MVT::INVALID_SIMPLE_VALUE_TYPE;
6204}
6205
6206/// Val is the undef sentinel value or equal to the specified value.
6207static bool isUndefOrEqual(int Val, int CmpVal) {
6208 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
6209}
6210
6211/// Return true if every element in Mask is the undef sentinel value or equal to
6212/// the specified value.
6213static bool isUndefOrEqual(ArrayRef<int> Mask, int CmpVal) {
6214 return llvm::all_of(Mask, [CmpVal](int M) {
6215 return (M == SM_SentinelUndef) || (M == CmpVal);
6216 });
6217}
6218
6219/// Val is either the undef or zero sentinel value.
6220static bool isUndefOrZero(int Val) {
6221 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
6222}
6223
6224/// Return true if every element in Mask, beginning from position Pos and ending
6225/// in Pos+Size, is the undef sentinel value.
6226static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
6227 return llvm::all_of(Mask.slice(Pos, Size),
6228 [](int M) { return M == SM_SentinelUndef; });
6229}
6230
6231/// Return true if the mask creates a vector whose lower half is undefined.
6232static bool isUndefLowerHalf(ArrayRef<int> Mask) {
6233 unsigned NumElts = Mask.size();
6234 return isUndefInRange(Mask, 0, NumElts / 2);
6235}
6236
6237/// Return true if the mask creates a vector whose upper half is undefined.
6238static bool isUndefUpperHalf(ArrayRef<int> Mask) {
6239 unsigned NumElts = Mask.size();
6240 return isUndefInRange(Mask, NumElts / 2, NumElts / 2);
6241}
6242
6243/// Return true if Val falls within the specified half-open range [Low, Hi).
6244static bool isInRange(int Val, int Low, int Hi) {
6245 return (Val >= Low && Val < Hi);
6246}
6247
6248/// Return true if the value of any element in Mask falls within the specified
6249/// half-open range [Low, Hi).
6250static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
6251 return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); });
6252}
6253
6254/// Return true if the value of any element in Mask is the zero sentinel value.
6255static bool isAnyZero(ArrayRef<int> Mask) {
6256 return llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
6257}
6258
6259/// Return true if the value of any element in Mask is the zero or undef
6260/// sentinel values.
6261static bool isAnyZeroOrUndef(ArrayRef<int> Mask) {
6262 return llvm::any_of(Mask, [](int M) {
6263 return M == SM_SentinelZero || M == SM_SentinelUndef;
6264 });
6265}
6266
6267/// Return true if Val is undef or if its value falls within the
6268/// specified half-open range [Low, Hi).
6269static bool isUndefOrInRange(int Val, int Low, int Hi) {
6270 return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
6271}
6272
6273/// Return true if every element in Mask is undef or if its value
6274/// falls within the specified half-open range [Low, Hi).
6275static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
6276 return llvm::all_of(
6277 Mask, [Low, Hi](int M) { return isUndefOrInRange(M, Low, Hi); });
6278}
6279
6280/// Return true if Val is undef, zero or if its value falls within the
6281/// specified half-open range [Low, Hi).
6282static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
6283 return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
6284}
6285
6286/// Return true if every element in Mask is undef, zero or if its value
6287/// falls within the specified half-open range [Low, Hi).
6288static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
6289 return llvm::all_of(
6290 Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); });
6291}
6292
6293/// Return true if every element in Mask, beginning
6294/// from position Pos and ending in Pos + Size, falls within the specified
6295/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
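/// For example, with Pos = 0, Size = 4, Low = 4 and Step = 1, the mask
/// fragment <4, -1, 6, 7> matches (an illustrative case; -1 is the undef
/// sentinel used by these masks).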
6296static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
6297 unsigned Size, int Low, int Step = 1) {
6298 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
6299 if (!isUndefOrEqual(Mask[i], Low))
6300 return false;
6301 return true;
6302}
6303
6304/// Return true if every element in Mask, beginning
6305/// from position Pos and ending in Pos+Size, falls within the specified
6306/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step), or is undef or zero.
6307static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
6308 unsigned Size, int Low,
6309 int Step = 1) {
6310 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
6311 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
6312 return false;
6313 return true;
6314}
6315
6316/// Return true if every element in Mask, beginning
6317/// from position Pos and ending in Pos+Size, is undef or is zero.
6318static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
6319 unsigned Size) {
6320 return llvm::all_of(Mask.slice(Pos, Size), isUndefOrZero);
6321}
6322
6323/// Helper function to test whether a shuffle mask could be
6324/// simplified by widening the elements being shuffled.
6325///
6326/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
6327/// leaves it in an unspecified state.
6328///
6329/// NOTE: This must handle normal vector shuffle masks and *target* vector
6330/// shuffle masks. The latter have the special property of a '-2' representing
6331/// a zeroed lane of a vector.
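/// For example, the mask <-1, 3, 4, 5> widens to <1, 2> (taking -1 as the
/// undef sentinel), while <0, 2, 4, 6> cannot be widened because no pair of
/// adjacent elements forms an aligned (even, even + 1) pair.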
6332static bool canWidenShuffleElements(ArrayRef<int> Mask,
6333 SmallVectorImpl<int> &WidenedMask) {
6334 WidenedMask.assign(Mask.size() / 2, 0);
6335 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
6336 int M0 = Mask[i];
6337 int M1 = Mask[i + 1];
6338
6339 // If both elements are undef, it's trivial.
6340 if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
6341 WidenedMask[i / 2] = SM_SentinelUndef;
6342 continue;
6343 }
6344
6345 // Check for an undef mask and a mask value properly aligned to fit with
6346 // a pair of values. If we find such a case, use the non-undef mask's value.
6347 if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
6348 WidenedMask[i / 2] = M1 / 2;
6349 continue;
6350 }
6351 if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
6352 WidenedMask[i / 2] = M0 / 2;
6353 continue;
6354 }
6355
6356 // When zeroing, we need to spread the zeroing across both lanes to widen.
6357 if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
6358 if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
6359 (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
6360 WidenedMask[i / 2] = SM_SentinelZero;
6361 continue;
6362 }
6363 return false;
6364 }
6365
6366 // Finally check if the two mask values are adjacent and aligned with
6367 // a pair.
6368 if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
6369 WidenedMask[i / 2] = M0 / 2;
6370 continue;
6371 }
6372
6373 // Otherwise we can't safely widen the elements used in this shuffle.
6374 return false;
6375 }
6376 assert(WidenedMask.size() == Mask.size() / 2 &&
6377 "Incorrect size of mask after widening the elements!");
6378
6379 return true;
6380}
6381
6382static bool canWidenShuffleElements(ArrayRef<int> Mask,
6383 const APInt &Zeroable,
6384 bool V2IsZero,
6385 SmallVectorImpl<int> &WidenedMask) {
6386 // Create an alternative mask with info about zeroable elements.
6387 // Here we do not set undef elements as zeroable.
6388 SmallVector<int, 64> ZeroableMask(Mask);
6389 if (V2IsZero) {
6390 assert(!Zeroable.isZero() && "V2's non-undef elements are used?!");
6391 for (int i = 0, Size = Mask.size(); i != Size; ++i)
6392 if (Mask[i] != SM_SentinelUndef && Zeroable[i])
6393 ZeroableMask[i] = SM_SentinelZero;
6394 }
6395 return canWidenShuffleElements(ZeroableMask, WidenedMask);
6396}
6397
6398static bool canWidenShuffleElements(ArrayRef<int> Mask) {
6399 SmallVector<int, 32> WidenedMask;
6400 return canWidenShuffleElements(Mask, WidenedMask);
6401}
6402
6403// Attempt to narrow/widen the shuffle mask until it matches the target number of
6404// elements.
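// For example, a 4-element mask <0, 1, 2, 3> scales to <0, 1> for
// NumDstElts == 2 and to <0, 1, 2, 3, 4, 5, 6, 7> for NumDstElts == 8
// (an illustrative sketch of the two directions).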
6405static bool scaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts,
6406 SmallVectorImpl<int> &ScaledMask) {
6407 unsigned NumSrcElts = Mask.size();
6408 assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&
6409 "Illegal shuffle scale factor");
6410
6411 // Narrowing is guaranteed to work.
6412 if (NumDstElts >= NumSrcElts) {
6413 int Scale = NumDstElts / NumSrcElts;
6414 llvm::narrowShuffleMaskElts(Scale, Mask, ScaledMask);
6415 return true;
6416 }
6417
6418 // We have to repeat the widening until we reach the target size, but we can
6419 // split out the first widening as it sets up ScaledMask for us.
6420 if (canWidenShuffleElements(Mask, ScaledMask)) {
6421 while (ScaledMask.size() > NumDstElts) {
6422 SmallVector<int, 16> WidenedMask;
6423 if (!canWidenShuffleElements(ScaledMask, WidenedMask))
6424 return false;
6425 ScaledMask = std::move(WidenedMask);
6426 }
6427 return true;
6428 }
6429
6430 return false;
6431}
6432
6433/// Returns true if Elt is a constant zero or a floating point constant +0.0.
6434bool X86::isZeroNode(SDValue Elt) {
6435 return isNullConstant(Elt) || isNullFPConstant(Elt);
6436}
6437
6438// Build a vector of constants.
6439// Use an UNDEF node if MaskElt == -1.
6440// Split 64-bit constants in 32-bit mode.
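// For example, on a 32-bit target a v2i64 constant vector is emitted as a
// v4i32 build_vector and then bitcast back to v2i64 (illustrative sketch).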
6441static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
6442 const SDLoc &dl, bool IsMask = false) {
6443
6444 SmallVector<SDValue, 32> Ops;
6445 bool Split = false;
6446
6447 MVT ConstVecVT = VT;
6448 unsigned NumElts = VT.getVectorNumElements();
6449 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
6450 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
6451 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
6452 Split = true;
6453 }
6454
6455 MVT EltVT = ConstVecVT.getVectorElementType();
6456 for (unsigned i = 0; i < NumElts; ++i) {
6457 bool IsUndef = Values[i] < 0 && IsMask;
6458 SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
6459 DAG.getConstant(Values[i], dl, EltVT);
6460 Ops.push_back(OpNode);
6461 if (Split)
6462 Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
6463 DAG.getConstant(0, dl, EltVT));
6464 }
6465 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
6466 if (Split)
6467 ConstsNode = DAG.getBitcast(VT, ConstsNode);
6468 return ConstsNode;
6469}
6470
6471static SDValue getConstVector(ArrayRef<APInt> Bits, const APInt &Undefs,
6472 MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
6473 assert(Bits.size() == Undefs.getBitWidth() &&
6474 "Unequal constant and undef arrays");
6475 SmallVector<SDValue, 32> Ops;
6476 bool Split = false;
6477
6478 MVT ConstVecVT = VT;
6479 unsigned NumElts = VT.getVectorNumElements();
6480 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
6481 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
6482 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
6483 Split = true;
6484 }
6485
6486 MVT EltVT = ConstVecVT.getVectorElementType();
6487 for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
6488 if (Undefs[i]) {
6489 Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
6490 continue;
6491 }
6492 const APInt &V = Bits[i];
6493 assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
6494 if (Split) {
6495 Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
6496 Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
6497 } else if (EltVT == MVT::f32) {
6498 APFloat FV(APFloat::IEEEsingle(), V);
6499 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
6500 } else if (EltVT == MVT::f64) {
6501 APFloat FV(APFloat::IEEEdouble(), V);
6502 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
6503 } else {
6504 Ops.push_back(DAG.getConstant(V, dl, EltVT));
6505 }
6506 }
6507
6508 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
6509 return DAG.getBitcast(VT, ConstsNode);
6510}
6511
6512static SDValue getConstVector(ArrayRef<APInt> Bits, MVT VT,
6513 SelectionDAG &DAG, const SDLoc &dl) {
6514 APInt Undefs = APInt::getZero(Bits.size());
6515 return getConstVector(Bits, Undefs, VT, DAG, dl);
6516}
6517
6518/// Returns a vector of specified type with all zero elements.
6519static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
6520 SelectionDAG &DAG, const SDLoc &dl) {
6521 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
6522 VT.getVectorElementType() == MVT::i1) &&
6523 "Unexpected vector type");
6524
6525 // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
6526 // type. This ensures they get CSE'd. But if the integer type is not
6527 // available, use a floating-point +0.0 instead.
6528 SDValue Vec;
6529 if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
6530 Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
6531 } else if (VT.isFloatingPoint()) {
6532 Vec = DAG.getConstantFP(+0.0, dl, VT);
6533 } else if (VT.getVectorElementType() == MVT::i1) {
6534 assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
6535 "Unexpected vector type");
6536 Vec = DAG.getConstant(0, dl, VT);
6537 } else {
6538 unsigned Num32BitElts = VT.getSizeInBits() / 32;
6539 Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
6540 }
6541 return DAG.getBitcast(VT, Vec);
6542}
6543
6544// Helper to determine whether both ops are subvectors extracted from a
6545// single source. If commuting is allowed they don't have to be in order (Lo/Hi).
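// For example, given extract_subvector(X, 0) and extract_subvector(X, N) of a
// 2N-element source X, this returns X (and likewise for the swapped order when
// AllowCommute is set).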
6546static SDValue getSplitVectorSrc(SDValue LHS, SDValue RHS, bool AllowCommute) {
6547 if (LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
6548 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
6549 LHS.getValueType() != RHS.getValueType() ||
6550 LHS.getOperand(0) != RHS.getOperand(0))
6551 return SDValue();
6552
6553 SDValue Src = LHS.getOperand(0);
6554 if (Src.getValueSizeInBits() != (LHS.getValueSizeInBits() * 2))
6555 return SDValue();
6556
6557 unsigned NumElts = LHS.getValueType().getVectorNumElements();
6558 if ((LHS.getConstantOperandAPInt(1) == 0 &&
6559 RHS.getConstantOperandAPInt(1) == NumElts) ||
6560 (AllowCommute && RHS.getConstantOperandAPInt(1) == 0 &&
6561 LHS.getConstantOperandAPInt(1) == NumElts))
6562 return Src;
6563
6564 return SDValue();
6565}
6566
6567static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
6568 const SDLoc &dl, unsigned vectorWidth) {
6569 EVT VT = Vec.getValueType();
6570 EVT ElVT = VT.getVectorElementType();
6571 unsigned Factor = VT.getSizeInBits() / vectorWidth;
6572 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
6573 VT.getVectorNumElements() / Factor);
6574
6575 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
6576 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
6577 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
6578
6579 // This is the index of the first element of the vectorWidth-bit chunk
6580 // we want. Since ElemsPerChunk is a power of 2 we just need to clear the low bits.
6581 IdxVal &= ~(ElemsPerChunk - 1);
6582
6583 // If the input is a buildvector just emit a smaller one.
6584 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
6585 return DAG.getBuildVector(ResultVT, dl,
6586 Vec->ops().slice(IdxVal, ElemsPerChunk));
6587
6588 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
6589 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
6590}
6591
6592/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
6593/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
6594/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
6595/// instructions or a simple subregister reference. Idx is an index in the
6596/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
6597/// lowering EXTRACT_VECTOR_ELT operations easier.
6598static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
6599 SelectionDAG &DAG, const SDLoc &dl) {
6600 assert((Vec.getValueType().is256BitVector() ||
6601 Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
6602 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
6603}
6604
6605/// Generate a DAG to grab 256-bits from a 512-bit vector.
6606static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
6607 SelectionDAG &DAG, const SDLoc &dl) {
6608 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
6609 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
6610}
6611
6612static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
6613 SelectionDAG &DAG, const SDLoc &dl,
6614 unsigned vectorWidth) {
6615 assert((vectorWidth == 128 || vectorWidth == 256) &&
6616 "Unsupported vector width");
6617 // Inserting UNDEF just returns Result.
6618 if (Vec.isUndef())
6619 return Result;
6620 EVT VT = Vec.getValueType();
6621 EVT ElVT = VT.getVectorElementType();
6622 EVT ResultVT = Result.getValueType();
6623
6624 // Insert the relevant vectorWidth bits.
6625 unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
6626 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
6627
6628 // This is the index of the first element of the vectorWidth-bit chunk
6629 // we want. Since ElemsPerChunk is a power of 2 we just need to clear the low bits.
6630 IdxVal &= ~(ElemsPerChunk - 1);
6631
6632 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
6633 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
6634}
6635
6636/// Generate a DAG to put 128-bits into a vector > 128 bits. This
6637/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
6638/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
6639/// simple superregister reference. Idx is an index in the 128 bits
6640/// we want. It need not be aligned to a 128-bit boundary. That makes
6641/// lowering INSERT_VECTOR_ELT operations easier.
6642static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
6643 SelectionDAG &DAG, const SDLoc &dl) {
6644 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
6645 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
6646}
6647
6648/// Widen a vector to a larger size with the same scalar type, with the new
6649/// elements either zero or undef.
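// For example, widening a v4i32 value to v8i32 inserts it at index 0 of an
// all-zero (or undef) v8i32 vector.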
6650static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
6651 const X86Subtarget &Subtarget, SelectionDAG &DAG,
6652 const SDLoc &dl) {
6653 assert(Vec.getValueSizeInBits().getFixedValue() < VT.getFixedSizeInBits() &&
6654 Vec.getValueType().getScalarType() == VT.getScalarType() &&
6655 "Unsupported vector widening type");
6656 SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
6657 : DAG.getUNDEF(VT);
6658 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, Vec,
6659 DAG.getIntPtrConstant(0, dl));
6660}
6661
6662/// Widen a vector to a larger size with the same scalar type, with the new
6663/// elements either zero or undef.
6664static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
6665 const X86Subtarget &Subtarget, SelectionDAG &DAG,
6666 const SDLoc &dl, unsigned WideSizeInBits) {
6667 assert(Vec.getValueSizeInBits() < WideSizeInBits &&
6668 (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 &&
6669 "Unsupported vector widening type");
6670 unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits();
6671 MVT SVT = Vec.getSimpleValueType().getScalarType();
6672 MVT VT = MVT::getVectorVT(SVT, WideNumElts);
6673 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
6674}
6675
6676// Helper function to collect subvector ops that are concatenated together,
6677// either by ISD::CONCAT_VECTORS or an ISD::INSERT_SUBVECTOR series.
6678// The subvectors in Ops are guaranteed to be the same type.
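// For example, for a 2N-element vector built as
// insert_subvector(insert_subvector(undef, X, 0), Y, N) with N-element
// subvectors X and Y, this collects {X, Y}; concat_vectors(X, Y) yields the
// same pair directly.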
6679static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops,
6680 SelectionDAG &DAG) {
6681 assert(Ops.empty() && "Expected an empty ops vector");
6682
6683 if (N->getOpcode() == ISD::CONCAT_VECTORS) {
6684 Ops.append(N->op_begin(), N->op_end());
6685 return true;
6686 }
6687
6688 if (N->getOpcode() == ISD::INSERT_SUBVECTOR) {
6689 SDValue Src = N->getOperand(0);
6690 SDValue Sub = N->getOperand(1);
6691 const APInt &Idx = N->getConstantOperandAPInt(2);
6692 EVT VT = Src.getValueType();
6693 EVT SubVT = Sub.getValueType();
6694
6695 // TODO - Handle more general insert_subvector chains.
6696 if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2)) {
6697 // insert_subvector(undef, x, lo)
6698 if (Idx == 0 && Src.isUndef()) {
6699 Ops.push_back(Sub);
6700 Ops.push_back(DAG.getUNDEF(SubVT));
6701 return true;
6702 }
6703 if (Idx == (VT.getVectorNumElements() / 2)) {
6704 // insert_subvector(insert_subvector(undef, x, lo), y, hi)
6705 if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
6706 Src.getOperand(1).getValueType() == SubVT &&
6707 isNullConstant(Src.getOperand(2))) {
6708 Ops.push_back(Src.getOperand(1));
6709 Ops.push_back(Sub);
6710 return true;
6711 }
6712 // insert_subvector(x, extract_subvector(x, lo), hi)
6713 if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
6714 Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) {
6715 Ops.append(2, Sub);
6716 return true;
6717 }
6718 // insert_subvector(undef, x, hi)
6719 if (Src.isUndef()) {
6720 Ops.push_back(DAG.getUNDEF(SubVT));
6721 Ops.push_back(Sub);
6722 return true;
6723 }
6724 }
6725 }
6726 }
6727
6728 return false;
6729}
6730
6731static std::pair<SDValue, SDValue> splitVector(SDValue Op, SelectionDAG &DAG,
6732 const SDLoc &dl) {
6733 EVT VT = Op.getValueType();
6734 unsigned NumElems = VT.getVectorNumElements();
6735 unsigned SizeInBits = VT.getSizeInBits();
6736 assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 &&
6737 "Can't split odd sized vector");
6738
6739 // If this is a splat value (with no-undefs) then use the lower subvector,
6740 // which should be a free extraction.
6741 SDValue Lo = extractSubVector(Op, 0, DAG, dl, SizeInBits / 2);
6742 if (DAG.isSplatValue(Op, /*AllowUndefs*/ false))
6743 return std::make_pair(Lo, Lo);
6744
6745 SDValue Hi = extractSubVector(Op, NumElems / 2, DAG, dl, SizeInBits / 2);
6746 return std::make_pair(Lo, Hi);
6747}
6748
6749/// Break an operation into 2 half-sized ops and then concatenate the results.
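/// For example, a v32i16 operation is lowered as two v16i16 operations whose
/// results are rejoined with concat_vectors.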
6750static SDValue splitVectorOp(SDValue Op, SelectionDAG &DAG) {
6751 unsigned NumOps = Op.getNumOperands();
6752 EVT VT = Op.getValueType();
6753 SDLoc dl(Op);
6754
6755 // Extract the LHS Lo/Hi vectors
6756 SmallVector<SDValue> LoOps(NumOps, SDValue());
6757 SmallVector<SDValue> HiOps(NumOps, SDValue());
6758 for (unsigned I = 0; I != NumOps; ++I) {
6759 SDValue SrcOp = Op.getOperand(I);
6760 if (!SrcOp.getValueType().isVector()) {
6761 LoOps[I] = HiOps[I] = SrcOp;
6762 continue;
6763 }
6764 std::tie(LoOps[I], HiOps[I]) = splitVector(SrcOp, DAG, dl);
6765 }
6766
6767 EVT LoVT, HiVT;
6768 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
6769 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
6770 DAG.getNode(Op.getOpcode(), dl, LoVT, LoOps),
6771 DAG.getNode(Op.getOpcode(), dl, HiVT, HiOps));
6772}
6773
6774/// Break a unary integer operation into 2 half-sized ops and then
6775/// concatenate the result back.
6776static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
6777 // Make sure we only try to split 256/512-bit types to avoid creating
6778 // narrow vectors.
6779 EVT VT = Op.getValueType();
6780 (void)VT;
6781 assert((Op.getOperand(0).getValueType().is256BitVector() ||
6782 Op.getOperand(0).getValueType().is512BitVector()) &&
6783 (VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
6784 assert(Op.getOperand(0).getValueType().getVectorNumElements() ==
6785 VT.getVectorNumElements() &&
6786 "Unexpected VTs!");
6787 return splitVectorOp(Op, DAG);
6788}
6789
6790/// Break a binary integer operation into 2 half-sized ops and then
6791/// concatenate the result back.
6792static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG) {
6793 // Assert that all the types match.
6794 EVT VT = Op.getValueType();
6795 (void)VT;
6796 assert(Op.getOperand(0).getValueType() == VT &&
6797 Op.getOperand(1).getValueType() == VT && "Unexpected VTs!");
6798 assert((VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
6799 return splitVectorOp(Op, DAG);
6800}
6801
6802// Helper for splitting the operands of an operation to a legal target size and
6803// applying a function on each part.
6804// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
6805// 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
6806// deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
6807// The argument Builder is a function that will be applied on each split part:
6808// SDValue Builder(SelectionDAG&G, SDLoc, ArrayRef<SDValue>)
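// For example, a 512-bit VT runs as a single 512-bit op when 512-bit (BWI)
// registers are usable, as two 256-bit parts on AVX2, or as four 128-bit
// parts on plain SSE2 (a sketch of the splitting policy described above).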
6809template <typename F>
6810SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
6811 const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
6812 F Builder, bool CheckBWI = true) {
6813 assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
6814 unsigned NumSubs = 1;
6815 if ((CheckBWI && Subtarget.useBWIRegs()) ||
6816 (!CheckBWI && Subtarget.useAVX512Regs())) {
6817 if (VT.getSizeInBits() > 512) {
6818 NumSubs = VT.getSizeInBits() / 512;
6819 assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
6820 }
6821 } else if (Subtarget.hasAVX2()) {
6822 if (VT.getSizeInBits() > 256) {
6823 NumSubs = VT.getSizeInBits() / 256;
6824 assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
6825 }
6826 } else {
6827 if (VT.getSizeInBits() > 128) {
6828 NumSubs = VT.getSizeInBits() / 128;
6829 assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");
6830 }
6831 }
6832
6833 if (NumSubs == 1)
6834 return Builder(DAG, DL, Ops);
6835
6836 SmallVector<SDValue, 4> Subs;
6837 for (unsigned i = 0; i != NumSubs; ++i) {
6838 SmallVector<SDValue, 2> SubOps;
6839 for (SDValue Op : Ops) {
6840 EVT OpVT = Op.getValueType();
6841 unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
6842 unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
6843 SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
6844 }
6845 Subs.push_back(Builder(DAG, DL, SubOps));
6846 }
6847 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
6848}
6849
6850// Helper function that extends a non-512-bit vector op to 512 bits on non-VLX
6851// targets.
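// For example, a v8i32 opcode on an AVX512 target without VLX is widened to
// v16i32, executed as a 512-bit node, and the low 256 bits are extracted.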
6852static SDValue getAVX512Node(unsigned Opcode, const SDLoc &DL, MVT VT,
6853 ArrayRef<SDValue> Ops, SelectionDAG &DAG,
6854 const X86Subtarget &Subtarget) {
6855 assert(Subtarget.hasAVX512() && "AVX512 target expected");
6856 MVT SVT = VT.getScalarType();
6857
6858 // If we have a splatted 32/64-bit constant, splat it to DstTy to
6859 // encourage a foldable broadcast'd operand.
6860 auto MakeBroadcastOp = [&](SDValue Op, MVT OpVT, MVT DstVT) {
6861 unsigned OpEltSizeInBits = OpVT.getScalarSizeInBits();
6862 // AVX512 broadcasts 32/64-bit operands.
6863 // TODO: Support float once getAVX512Node is used by fp-ops.
6864 if (!OpVT.isInteger() || OpEltSizeInBits < 32 ||
6865 !DAG.getTargetLoweringInfo().isTypeLegal(SVT))
6866 return SDValue();
6867 // If we're not widening, don't bother if we're not bitcasting.
6868 if (OpVT == DstVT && Op.getOpcode() != ISD::BITCAST)
6869 return SDValue();
6870 if (auto *BV = dyn_cast<BuildVectorSDNode>(peekThroughBitcasts(Op))) {
6871 APInt SplatValue, SplatUndef;
6872 unsigned SplatBitSize;
6873 bool HasAnyUndefs;
6874 if (BV->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
6875 HasAnyUndefs, OpEltSizeInBits) &&
6876 !HasAnyUndefs && SplatValue.getBitWidth() == OpEltSizeInBits)
6877 return DAG.getConstant(SplatValue, DL, DstVT);
6878 }
6879 return SDValue();
6880 };
6881
6882 bool Widen = !(Subtarget.hasVLX() || VT.is512BitVector());
6883
6884 MVT DstVT = VT;
6885 if (Widen)
6886 DstVT = MVT::getVectorVT(SVT, 512 / SVT.getSizeInBits());
6887
6888 // Canonicalize src operands.
6889 SmallVector<SDValue> SrcOps(Ops.begin(), Ops.end());
6890 for (SDValue &Op : SrcOps) {
6891 MVT OpVT = Op.getSimpleValueType();
6892 // Just pass through scalar operands.
6893 if (!OpVT.isVector())
6894 continue;
6895 assert(OpVT == VT && "Vector type mismatch");
6896
6897 if (SDValue BroadcastOp = MakeBroadcastOp(Op, OpVT, DstVT)) {
6898 Op = BroadcastOp;
6899 continue;
6900 }
6901
6902 // Just widen the subvector by inserting into an undef wide vector.
6903 if (Widen)
6904 Op = widenSubVector(Op, false, Subtarget, DAG, DL, 512);
6905 }
6906
6907 SDValue Res = DAG.getNode(Opcode, DL, DstVT, SrcOps);
6908
6909 // Perform the 512-bit op then extract the bottom subvector.
6910 if (Widen)
6911 Res = extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
6912 return Res;
6913}
6914
6915/// Insert i1-subvector to i1-vector.
6916static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
6917 const X86Subtarget &Subtarget) {
6918
6919 SDLoc dl(Op);
6920 SDValue Vec = Op.getOperand(0);
6921 SDValue SubVec = Op.getOperand(1);
6922 SDValue Idx = Op.getOperand(2);
6923 unsigned IdxVal = Op.getConstantOperandVal(2);
6924
6925 // Inserting undef is a nop. We can just return the original vector.
6926 if (SubVec.isUndef())
6927 return Vec;
6928
6929 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
6930 return Op;
6931
6932 MVT OpVT = Op.getSimpleValueType();
6933 unsigned NumElems = OpVT.getVectorNumElements();
6934 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
6935
6936 // Extend to natively supported kshift.
6937 MVT WideOpVT = OpVT;
6938 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
6939 WideOpVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
6940
6941 // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
6942 // if necessary.
6943 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
6944 // May need to promote to a legal type.
6945 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6946 DAG.getConstant(0, dl, WideOpVT),
6947 SubVec, Idx);
6948 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
6949 }
6950
6951 MVT SubVecVT = SubVec.getSimpleValueType();
6952 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
6953 assert(IdxVal + SubVecNumElems <= NumElems &&
6954 IdxVal % SubVecVT.getSizeInBits() == 0 &&
6955 "Unexpected index value in INSERT_SUBVECTOR");
6956
6957 SDValue Undef = DAG.getUNDEF(WideOpVT);
6958
6959 if (IdxVal == 0) {
6960 // Zero lower bits of the Vec
6961 SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8);
6962 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
6963 ZeroIdx);
6964 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
6965 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
6966 // Merge them together; SubVec should be zero extended.
6967 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6968 DAG.getConstant(0, dl, WideOpVT),
6969 SubVec, ZeroIdx);
6970 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
6971 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
6972 }
6973
6974 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6975 Undef, SubVec, ZeroIdx);
6976
6977 if (Vec.isUndef()) {
6978 assert(IdxVal != 0 && "Unexpected index");
6979 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6980 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
6981 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
6982 }
6983
6984 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
6985 assert(IdxVal != 0 && "Unexpected index");
6986 // If upper elements of Vec are known undef, then just shift into place.
6987 if (llvm::all_of(Vec->ops().slice(IdxVal + SubVecNumElems),
6988 [](SDValue V) { return V.isUndef(); })) {
6989 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6990 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
6991 } else {
6992 NumElems = WideOpVT.getVectorNumElements();
6993 unsigned ShiftLeft = NumElems - SubVecNumElems;
6994 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
6995 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6996 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
6997 if (ShiftRight != 0)
6998 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
6999 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
7000 }
7001 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
7002 }
7003
7004 // Simple case when we put the subvector in the upper part.
7005 if (IdxVal + SubVecNumElems == NumElems) {
7006 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
7007 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
7008 if (SubVecNumElems * 2 == NumElems) {
7009 // Special case, use legal zero extending insert_subvector. This allows
7010 // isel to optimize when bits are known zero.
7011 Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
7012 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
7013 DAG.getConstant(0, dl, WideOpVT),
7014 Vec, ZeroIdx);
7015 } else {
7016 // Otherwise use explicit shifts to zero the bits.
7017 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
7018 Undef, Vec, ZeroIdx);
7019 NumElems = WideOpVT.getVectorNumElements();
7020 SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
7021 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
7022 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
7023 }
7024 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
7025 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
7026 }
7027
7028 // Inserting into the middle is more complicated.
7029
7030 NumElems = WideOpVT.getVectorNumElements();
7031
7032 // Widen the vector if needed.
7033 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
7034
7035 unsigned ShiftLeft = NumElems - SubVecNumElems;
7036 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
7037
7038 // Do an optimization for the most frequently used types.
7039 if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
7040 APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems);
7041 Mask0.flipAllBits();
7042 SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems));
7043 SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0);
7044 Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0);
7045 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
7046 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
7047 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
7048 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
7049 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
7050
7051 // Reduce to original width if needed.
7052 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
7053 }
7054
7055 // Clear the upper bits of the subvector and move it to its insert position.
7056 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
7057 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
7058 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
7059 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
7060
7061 // Isolate the bits below the insertion point.
7062 unsigned LowShift = NumElems - IdxVal;
7063 SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec,
7064 DAG.getTargetConstant(LowShift, dl, MVT::i8));
7065 Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low,
7066 DAG.getTargetConstant(LowShift, dl, MVT::i8));
7067
7068 // Isolate the bits after the last inserted bit.
7069 unsigned HighShift = IdxVal + SubVecNumElems;
7070 SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
7071 DAG.getTargetConstant(HighShift, dl, MVT::i8));
7072 High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
7073 DAG.getTargetConstant(HighShift, dl, MVT::i8));
7074
7075 // Now OR all 3 pieces together.
7076 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
7077 SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec);
7078
7079 // Reduce to original width if needed.
7080 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
7081}
7082
7083static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG,
7084 const SDLoc &dl) {
7085 assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch");
7086 EVT SubVT = V1.getValueType();
7087 EVT SubSVT = SubVT.getScalarType();
7088 unsigned SubNumElts = SubVT.getVectorNumElements();
7089 unsigned SubVectorWidth = SubVT.getSizeInBits();
7090 EVT VT = EVT::getVectorVT(*DAG.getContext(), SubSVT, 2 * SubNumElts);
7091 SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, SubVectorWidth);
7092 return insertSubVector(V, V2, SubNumElts, DAG, dl, SubVectorWidth);
7093}
7094
7095/// Returns a vector of specified type with all bits set.
7096/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
7097/// Then bitcast to their original type, ensuring they get CSE'd.
7098static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
7099 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&(static_cast <bool> ((VT.is128BitVector() || VT.is256BitVector
() || VT.is512BitVector()) && "Expected a 128/256/512-bit vector type"
) ? void (0) : __assert_fail ("(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) && \"Expected a 128/256/512-bit vector type\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 7100, __extension__
__PRETTY_FUNCTION__))
7100 "Expected a 128/256/512-bit vector type")(static_cast <bool> ((VT.is128BitVector() || VT.is256BitVector
() || VT.is512BitVector()) && "Expected a 128/256/512-bit vector type"
) ? void (0) : __assert_fail ("(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) && \"Expected a 128/256/512-bit vector type\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 7100, __extension__
__PRETTY_FUNCTION__))
;
7101
7102 APInt Ones = APInt::getAllOnes(32);
7103 unsigned NumElts = VT.getSizeInBits() / 32;
7104 SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
7105 return DAG.getBitcast(VT, Vec);
7106}
7107
7108static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT,
7109 SDValue In, SelectionDAG &DAG) {
7110 EVT InVT = In.getValueType();
7111 assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.");
7112 assert((ISD::ANY_EXTEND == Opcode || ISD::SIGN_EXTEND == Opcode ||
7113         ISD::ZERO_EXTEND == Opcode) &&
7114        "Unknown extension opcode");
7115
7116 // For 256-bit vectors, we only need the lower (128-bit) input half.
7117 // For 512-bit vectors, we only need the lower input half or quarter.
7118 if (InVT.getSizeInBits() > 128) {
7119 assert(VT.getSizeInBits() == InVT.getSizeInBits() &&
7120        "Expected VTs to be the same size!");
7121 unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
7122 In = extractSubVector(In, 0, DAG, DL,
7123 std::max(128U, (unsigned)VT.getSizeInBits() / Scale));
7124 InVT = In.getValueType();
7125 }
7126
7127 if (VT.getVectorNumElements() != InVT.getVectorNumElements())
7128 Opcode = DAG.getOpcode_EXTEND_VECTOR_INREG(Opcode);
7129
7130 return DAG.getNode(Opcode, DL, VT, In);
7131}
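The "lower half or quarter" selection above is pure width arithmetic; a minimal standalone restatement (plain integers, assuming matching total sizes as the assert requires; names are illustrative):

  #include <algorithm>
  #include <cstdio>

  // Only VTBits / Scale input bits (but never fewer than 128) feed the
  // extension, where Scale is the ratio of output to input element width.
  unsigned neededInputBits(unsigned VTBits, unsigned VTScalarBits,
                           unsigned InScalarBits) {
    unsigned Scale = VTScalarBits / InScalarBits;
    return std::max(128u, VTBits / Scale);
  }

  int main() {
    // 512-bit v16i32 extended from v64i8: Scale = 4 -> lower quarter (128 bits).
    std::printf("%u\n", neededInputBits(512, 32, 8));
    // 512-bit v8i64 extended from v16i32: Scale = 2 -> lower half (256 bits).
    std::printf("%u\n", neededInputBits(512, 64, 32));
    return 0;
  }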
7132
7133// Match (xor X, -1) -> X.
7134// Match extract_subvector(xor X, -1) -> extract_subvector(X).
7135// Match concat_vectors(xor X, -1, xor Y, -1) -> concat_vectors(X, Y).
7136static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
7137 V = peekThroughBitcasts(V);
7138 if (V.getOpcode() == ISD::XOR &&
7139 (ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()) ||
7140 isAllOnesConstant(V.getOperand(1))))
7141 return V.getOperand(0);
7142 if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
7143 (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
7144 if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
7145 Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
7146 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), V.getValueType(),
7147 Not, V.getOperand(1));
7148 }
7149 }
7150 SmallVector<SDValue, 2> CatOps;
7151 if (collectConcatOps(V.getNode(), CatOps, DAG)) {
7152 for (SDValue &CatOp : CatOps) {
7153 SDValue NotCat = IsNOT(CatOp, DAG);
7154 if (!NotCat) return SDValue();
7155 CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
7156 }
7157 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), V.getValueType(), CatOps);
7158 }
7159 return SDValue();
7160}
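Every fold in IsNOT rests on the scalar identity ~X == X ^ all-ones, which also distributes over concatenation; a tiny standalone check of those two facts (plain C++ integers, not DAG nodes):

  #include <cassert>
  #include <cstdint>

  int main() {
    uint32_t X = 0x12345678u;
    // xor with an all-ones constant is bitwise NOT ...
    assert((X ^ 0xFFFFFFFFu) == ~X);
    // ... and NOT distributes over concatenation: ~(hi:lo) == (~hi):(~lo),
    // which is why a concat of xor-with-all-ones operands folds to the NOT
    // of the concatenated sources.
    uint64_t Cat = (uint64_t(0xAAAAAAAAu) << 32) | 0x55555555u;
    uint64_t NotCat = (uint64_t(~0xAAAAAAAAu) << 32) | uint64_t(uint32_t(~0x55555555u));
    assert(~Cat == NotCat);
    return 0;
  }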
7161
7162void llvm::createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask,
7163 bool Lo, bool Unary) {
7164 assert(VT.getScalarType().isSimple() && (VT.getSizeInBits() % 128) == 0 &&
7165        "Illegal vector type to unpack");
7166 assert(Mask.empty() && "Expected an empty shuffle mask vector");
7167 int NumElts = VT.getVectorNumElements();
7168 int NumEltsInLane = 128 / VT.getScalarSizeInBits();
7169 for (int i = 0; i < NumElts; ++i) {
7170 unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
7171 int Pos = (i % NumEltsInLane) / 2 + LaneStart;
7172 Pos += (Unary ? 0 : NumElts * (i % 2));
7173 Pos += (Lo ? 0 : NumEltsInLane / 2);
7174 Mask.push_back(Pos);
7175 }
7176}
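To make the index arithmetic above concrete, here is a standalone restatement of the same loop (illustrative only, no LLVM types); for a 128-bit vector of eight 16-bit elements the binary lo mask comes out as <0, 8, 1, 9, 2, 10, 3, 11>, i.e. the unpcklwd interleave:

  #include <cstdio>
  #include <vector>

  // Same per-element arithmetic as createUnpackShuffleMask, on plain ints.
  std::vector<int> unpackMask(int NumElts, int EltBits, bool Lo, bool Unary) {
    int NumEltsInLane = 128 / EltBits;
    std::vector<int> Mask;
    for (int i = 0; i < NumElts; ++i) {
      int LaneStart = (i / NumEltsInLane) * NumEltsInLane;
      int Pos = (i % NumEltsInLane) / 2 + LaneStart;
      Pos += (Unary ? 0 : NumElts * (i % 2));
      Pos += (Lo ? 0 : NumEltsInLane / 2);
      Mask.push_back(Pos);
    }
    return Mask;
  }

  int main() {
    for (int M : unpackMask(/*NumElts=*/8, /*EltBits=*/16, /*Lo=*/true, /*Unary=*/false))
      std::printf("%d ", M);   // 0 8 1 9 2 10 3 11
    std::printf("\n");
    return 0;
  }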
7177
7178/// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
7179/// imposed by AVX and specific to the unary pattern. Example:
7180/// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
7181/// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
7182void llvm::createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
7183 bool Lo) {
7184 assert(Mask.empty() && "Expected an empty shuffle mask vector");
7185 int NumElts = VT.getVectorNumElements();
7186 for (int i = 0; i < NumElts; ++i) {
7187 int Pos = i / 2;
7188 Pos += (Lo ? 0 : NumElts / 2);
7189 Mask.push_back(Pos);
7190 }
7191}
7192
7193// Attempt to constant fold, else just create a VECTOR_SHUFFLE.
7194static SDValue getVectorShuffle(SelectionDAG &DAG, EVT VT, const SDLoc &dl,
7195 SDValue V1, SDValue V2, ArrayRef<int> Mask) {
7196 if ((ISD::isBuildVectorOfConstantSDNodes(V1.getNode()) || V1.isUndef()) &&
7197 (ISD::isBuildVectorOfConstantSDNodes(V2.getNode()) || V2.isUndef())) {
7198 SmallVector<SDValue> Ops(Mask.size(), DAG.getUNDEF(VT.getScalarType()));
7199 for (int I = 0, NumElts = Mask.size(); I != NumElts; ++I) {
7200 int M = Mask[I];
7201 if (M < 0)
7202 continue;
7203 SDValue V = (M < NumElts) ? V1 : V2;
7204 if (V.isUndef())
7205 continue;
7206 Ops[I] = V.getOperand(M % NumElts);
7207 }
7208 return DAG.getBuildVector(VT, dl, Ops);
7209 }
7210
7211 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
7212}
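A sketch of the constant fold above on plain arrays (illustrative only): when both inputs are constant build vectors, the shuffle is just an element gather, so the helper can emit a new build vector instead of a shuffle node.

  #include <cstdio>
  #include <vector>

  // Mask entries below NumElts select from V1, the rest from V2; negative
  // entries stay "undef" (represented here as -1).
  std::vector<int> foldShuffle(const std::vector<int> &V1,
                               const std::vector<int> &V2,
                               const std::vector<int> &Mask) {
    int NumElts = (int)Mask.size();
    std::vector<int> Out(NumElts, -1);
    for (int I = 0; I != NumElts; ++I) {
      int M = Mask[I];
      if (M < 0)
        continue;
      Out[I] = (M < NumElts) ? V1[M % NumElts] : V2[M % NumElts];
    }
    return Out;
  }

  int main() {
    std::vector<int> V1 = {10, 11, 12, 13}, V2 = {20, 21, 22, 23};
    for (int E : foldShuffle(V1, V2, {0, 4, 1, 5}))  // unpacklo -> 10 20 11 21
      std::printf("%d ", E);
    std::printf("\n");
    return 0;
  }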
7213
7214/// Returns a vector_shuffle node for an unpackl operation.
7215static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
7216 SDValue V1, SDValue V2) {
7217 SmallVector<int, 8> Mask;
7218 createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
7219 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
7220}
7221
7222/// Returns a vector_shuffle node for an unpackh operation.
7223static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
7224 SDValue V1, SDValue V2) {
7225 SmallVector<int, 8> Mask;
7226 createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
7227 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
7228}
7229
7230/// Returns a node that packs the LHS + RHS nodes together at half width.
7231/// May return X86ISD::PACKSS/PACKUS, packing the top/bottom half.
7232/// TODO: Add subvector splitting if/when we have a need for it.
7233static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget,
7234 const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS,
7235 bool PackHiHalf = false) {
7236 MVT OpVT = LHS.getSimpleValueType();
7237 unsigned EltSizeInBits = VT.getScalarSizeInBits();
7238 bool UsePackUS = Subtarget.hasSSE41() || EltSizeInBits == 8;
7239 assert(OpVT == RHS.getSimpleValueType() &&
7240        VT.getSizeInBits() == OpVT.getSizeInBits() &&
7241        (EltSizeInBits * 2) == OpVT.getScalarSizeInBits() &&
7242        "Unexpected PACK operand types");
7243 assert((EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) &&
7244        "Unexpected PACK result type");
7245
7246 // Rely on vector shuffles for vXi64 -> vXi32 packing.
7247 if (EltSizeInBits == 32) {
7248 SmallVector<int> PackMask;
7249 int Offset = PackHiHalf ? 1 : 0;
7250 int NumElts = VT.getVectorNumElements();
7251 for (int I = 0; I != NumElts; I += 4) {
7252 PackMask.push_back(I + Offset);
7253 PackMask.push_back(I + Offset + 2);
7254 PackMask.push_back(I + Offset + NumElts);
7255 PackMask.push_back(I + Offset + NumElts + 2);
7256 }
7257 return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, LHS),
7258 DAG.getBitcast(VT, RHS), PackMask);
7259 }
7260
7261 // See if we already have sufficient leading bits for PACKSS/PACKUS.
7262 if (!PackHiHalf) {
7263 if (UsePackUS &&
7264 DAG.computeKnownBits(LHS).countMaxActiveBits() <= EltSizeInBits &&
7265 DAG.computeKnownBits(RHS).countMaxActiveBits() <= EltSizeInBits)
7266 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
7267
7268 if (DAG.ComputeMaxSignificantBits(LHS) <= EltSizeInBits &&
7269 DAG.ComputeMaxSignificantBits(RHS) <= EltSizeInBits)
7270 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
7271 }
7272
7273 // Fallback to sign/zero extending the requested half and pack.
7274 SDValue Amt = DAG.getTargetConstant(EltSizeInBits, dl, MVT::i8);
7275 if (UsePackUS) {
7276 if (PackHiHalf) {
7277 LHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, LHS, Amt);
7278 RHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, RHS, Amt);
7279 } else {
7280 SDValue Mask = DAG.getConstant((1ULL << EltSizeInBits) - 1, dl, OpVT);
7281 LHS = DAG.getNode(ISD::AND, dl, OpVT, LHS, Mask);
7282 RHS = DAG.getNode(ISD::AND, dl, OpVT, RHS, Mask);
7283 };
7284 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
7285 };
7286
7287 if (!PackHiHalf) {
7288 LHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, LHS, Amt);
7289 RHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, RHS, Amt);
7290 }
7291 LHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, LHS, Amt);
7292 RHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, RHS, Amt);
7293 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
7294}
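The fallback at the end of getPack is easiest to see on a single element; a scalar sketch for one 32-bit element packed down to 16 bits (saturation ignored, exactly as the helper assumes; two's-complement narrowing and arithmetic shifts assumed, as on every target LLVM supports):

  #include <cassert>
  #include <cstdint>

  int main() {
    uint32_t Elt = 0xDEADBEEFu;
    // PACKUS, low half:  mask with (1 << EltSizeInBits) - 1 first.
    assert((Elt & 0xFFFFu) == 0xBEEFu);
    // PACKUS, high half: logical shift right by the element width.
    assert((Elt >> 16) == 0xDEADu);
    // PACKSS, low half:  shift left then arithmetic shift right, so the low
    // half is sign-extended and PACKSS saturation reproduces it exactly.
    int32_t Sext = (int32_t)(Elt << 16) >> 16;
    assert((uint32_t)Sext == 0xFFFFBEEFu);
    return 0;
  }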
7295
7296 /// Return a vector_shuffle of the specified vector blended with a zero or undef vector.
7297/// This produces a shuffle where the low element of V2 is swizzled into the
7298/// zero/undef vector, landing at element Idx.
7299/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
7300static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
7301 bool IsZero,
7302 const X86Subtarget &Subtarget,
7303 SelectionDAG &DAG) {
7304 MVT VT = V2.getSimpleValueType();
7305 SDValue V1 = IsZero
7306 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
7307 int NumElems = VT.getVectorNumElements();
7308 SmallVector<int, 16> MaskVec(NumElems);
7309 for (int i = 0; i != NumElems; ++i)
7310 // If this is the insertion idx, put the low elt of V2 here.
7311 MaskVec[i] = (i == Idx) ? NumElems : i;
7312 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
7313}
7314
7315static const Constant *getTargetConstantFromBasePtr(SDValue Ptr) {
7316 if (Ptr.getOpcode() == X86ISD::Wrapper ||
7317 Ptr.getOpcode() == X86ISD::WrapperRIP)
7318 Ptr = Ptr.getOperand(0);
7319
7320 auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
7321 if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
7322 return nullptr;
7323
7324 return CNode->getConstVal();
7325}
7326
7327static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
7328 if (!Load || !ISD::isNormalLoad(Load))
7329 return nullptr;
7330 return getTargetConstantFromBasePtr(Load->getBasePtr());
7331}
7332
7333static const Constant *getTargetConstantFromNode(SDValue Op) {
7334 Op = peekThroughBitcasts(Op);
7335 return getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op));
7336}
7337
7338const Constant *
7339X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const {
7340 assert(LD && "Unexpected null LoadSDNode");
7341 return getTargetConstantFromNode(LD);
7342}
7343
7344// Extract raw constant bits from constant pools.
7345static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
7346 APInt &UndefElts,
7347 SmallVectorImpl<APInt> &EltBits,
7348 bool AllowWholeUndefs = true,
7349 bool AllowPartialUndefs = true) {
7350 assert(EltBits.empty() && "Expected an empty EltBits vector");
7351
7352 Op = peekThroughBitcasts(Op);
7353
7354 EVT VT = Op.getValueType();
7355 unsigned SizeInBits = VT.getSizeInBits();
7356 assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
7357 unsigned NumElts = SizeInBits / EltSizeInBits;
7358
7359 // Bitcast a source array of element bits to the target size.
7360 auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
7361 unsigned NumSrcElts = UndefSrcElts.getBitWidth();
7362 unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
7363 assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
7364        "Constant bit sizes don't match");
7365
7366 // Don't split if we don't allow undef bits.
7367 bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
7368 if (UndefSrcElts.getBoolValue() && !AllowUndefs)
7369 return false;
7370
7371 // If we're already the right size, don't bother bitcasting.
7372 if (NumSrcElts == NumElts) {
7373 UndefElts = UndefSrcElts;
7374 EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
7375 return true;
7376 }
7377
7378 // Extract all the undef/constant element data and pack into single bitsets.
7379 APInt UndefBits(SizeInBits, 0);
7380 APInt MaskBits(SizeInBits, 0);
7381
7382 for (unsigned i = 0; i != NumSrcElts; ++i) {
7383 unsigned BitOffset = i * SrcEltSizeInBits;
7384 if (UndefSrcElts[i])
7385 UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
7386 MaskBits.insertBits(SrcEltBits[i], BitOffset);
7387 }
7388
7389 // Split the undef/constant single bitset data into the target elements.
7390 UndefElts = APInt(NumElts, 0);
7391 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
7392
7393 for (unsigned i = 0; i != NumElts; ++i) {
7394 unsigned BitOffset = i * EltSizeInBits;
7395 APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
7396
7397 // Only treat an element as UNDEF if all bits are UNDEF.
7398 if (UndefEltBits.isAllOnes()) {
7399 if (!AllowWholeUndefs)
7400 return false;
7401 UndefElts.setBit(i);
7402 continue;
7403 }
7404
7405 // If only some bits are UNDEF then treat them as zero (or bail if not
7406 // supported).
7407 if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
7408 return false;
7409
7410 EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
7411 }
7412 return true;
7413 };
7414
7415 // Collect constant bits and insert into mask/undef bit masks.
7416 auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
7417 unsigned UndefBitIndex) {
7418 if (!Cst)
7419 return false;
7420 if (isa<UndefValue>(Cst)) {
7421 Undefs.setBit(UndefBitIndex);
7422 return true;
7423 }
7424 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
7425 Mask = CInt->getValue();
7426 return true;
7427 }
7428 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
7429 Mask = CFP->getValueAPF().bitcastToAPInt();
7430 return true;
7431 }
7432 return false;
7433 };
7434
7435 // Handle UNDEFs.
7436 if (Op.isUndef()) {
7437 APInt UndefSrcElts = APInt::getAllOnes(NumElts);
7438 SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
7439 return CastBitData(UndefSrcElts, SrcEltBits);
7440 }
7441
7442 // Extract scalar constant bits.
7443 if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
7444 APInt UndefSrcElts = APInt::getZero(1);
7445 SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
7446 return CastBitData(UndefSrcElts, SrcEltBits);
7447 }
7448 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
7449 APInt UndefSrcElts = APInt::getZero(1);
7450 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
7451 SmallVector<APInt, 64> SrcEltBits(1, RawBits);
7452 return CastBitData(UndefSrcElts, SrcEltBits);
7453 }
7454
7455 // Extract constant bits from build vector.
7456 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op)) {
7457 BitVector Undefs;
7458 SmallVector<APInt> SrcEltBits;
7459 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
7460 if (BV->getConstantRawBits(true, SrcEltSizeInBits, SrcEltBits, Undefs)) {
7461 APInt UndefSrcElts = APInt::getZero(SrcEltBits.size());
7462 for (unsigned I = 0, E = SrcEltBits.size(); I != E; ++I)
7463 if (Undefs[I])
7464 UndefSrcElts.setBit(I);
7465 return CastBitData(UndefSrcElts, SrcEltBits);
7466 }
7467 }
7468
7469 // Extract constant bits from constant pool vector.
7470 if (auto *Cst = getTargetConstantFromNode(Op)) {
7471 Type *CstTy = Cst->getType();
7472 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
7473 if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
7474 return false;
7475
7476 unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
7477 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
7478
7479 APInt UndefSrcElts(NumSrcElts, 0);
7480 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
7481 for (unsigned i = 0; i != NumSrcElts; ++i)
7482 if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
7483 UndefSrcElts, i))
7484 return false;
7485
7486 return CastBitData(UndefSrcElts, SrcEltBits);
7487 }
7488
7489 // Extract constant bits from a broadcasted constant pool scalar.
7490 if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
7491 EltSizeInBits <= VT.getScalarSizeInBits()) {
7492 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
7493 if (MemIntr->getMemoryVT().getScalarSizeInBits() != VT.getScalarSizeInBits())
7494 return false;
7495
7496 SDValue Ptr = MemIntr->getBasePtr();
7497 if (const Constant *C = getTargetConstantFromBasePtr(Ptr)) {
7498 unsigned SrcEltSizeInBits = C->getType()->getScalarSizeInBits();
7499 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
7500
7501 APInt UndefSrcElts(NumSrcElts, 0);
7502 SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
7503 if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) {
7504 if (UndefSrcElts[0])
7505 UndefSrcElts.setBits(0, NumSrcElts);
7506 SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
7507 return CastBitData(UndefSrcElts, SrcEltBits);
7508 }
7509 }
7510 }
7511
7512 // Extract constant bits from a subvector broadcast.
7513 if (Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
7514 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
7515 SDValue Ptr = MemIntr->getBasePtr();
7516 // The source constant may be larger than the subvector broadcast, so
7517 // ensure we extract the correct subvector constants.
7518 if (const Constant *Cst = getTargetConstantFromBasePtr(Ptr)) {
7519 Type *CstTy = Cst->getType();
7520 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
7521 unsigned SubVecSizeInBits = MemIntr->getMemoryVT().getStoreSizeInBits();
7522 if (!CstTy->isVectorTy() || (CstSizeInBits % SubVecSizeInBits) != 0 ||
7523 (SizeInBits % SubVecSizeInBits) != 0)
7524 return false;
7525 unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
7526 unsigned NumSubElts = SubVecSizeInBits / CstEltSizeInBits;
7527 unsigned NumSubVecs = SizeInBits / SubVecSizeInBits;
7528 APInt UndefSubElts(NumSubElts, 0);
7529 SmallVector<APInt, 64> SubEltBits(NumSubElts * NumSubVecs,
7530 APInt(CstEltSizeInBits, 0));
7531 for (unsigned i = 0; i != NumSubElts; ++i) {
7532 if (!CollectConstantBits(Cst->getAggregateElement(i), SubEltBits[i],
7533 UndefSubElts, i))
7534 return false;
7535 for (unsigned j = 1; j != NumSubVecs; ++j)
7536 SubEltBits[i + (j * NumSubElts)] = SubEltBits[i];
7537 }
7538 UndefSubElts = APInt::getSplat(NumSubVecs * UndefSubElts.getBitWidth(),
7539 UndefSubElts);
7540 return CastBitData(UndefSubElts, SubEltBits);
7541 }
7542 }
7543
7544 // Extract a rematerialized scalar constant insertion.
7545 if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
7546 Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
7547 isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
7548 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
7549 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
7550
7551 APInt UndefSrcElts(NumSrcElts, 0);
7552 SmallVector<APInt, 64> SrcEltBits;
7553 auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
7554 SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits));
7555 SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
7556 return CastBitData(UndefSrcElts, SrcEltBits);
7557 }
7558
7559 // Insert constant bits from a base and sub vector sources.
7560 if (Op.getOpcode() == ISD::INSERT_SUBVECTOR) {
7561 // If we bitcast to larger elements we might lose track of undefs - don't
7562 // allow any, to be safe.
7563 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
7564 bool AllowUndefs = EltSizeInBits >= SrcEltSizeInBits;
7565
7566 APInt UndefSrcElts, UndefSubElts;
7567 SmallVector<APInt, 32> EltSrcBits, EltSubBits;
7568 if (getTargetConstantBitsFromNode(Op.getOperand(1), SrcEltSizeInBits,
7569 UndefSubElts, EltSubBits,
7570 AllowWholeUndefs && AllowUndefs,
7571 AllowPartialUndefs && AllowUndefs) &&
7572 getTargetConstantBitsFromNode(Op.getOperand(0), SrcEltSizeInBits,
7573 UndefSrcElts, EltSrcBits,
7574 AllowWholeUndefs && AllowUndefs,
7575 AllowPartialUndefs && AllowUndefs)) {
7576 unsigned BaseIdx = Op.getConstantOperandVal(2);
7577 UndefSrcElts.insertBits(UndefSubElts, BaseIdx);
7578 for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
7579 EltSrcBits[BaseIdx + i] = EltSubBits[i];
7580 return CastBitData(UndefSrcElts, EltSrcBits);
7581 }
7582 }
7583
7584 // Extract constant bits from a subvector's source.
7585 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
7586 // TODO - support extract_subvector through bitcasts.
7587 if (EltSizeInBits != VT.getScalarSizeInBits())
7588 return false;
7589
7590 if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
7591 UndefElts, EltBits, AllowWholeUndefs,
7592 AllowPartialUndefs)) {
7593 EVT SrcVT = Op.getOperand(0).getValueType();
7594 unsigned NumSrcElts = SrcVT.getVectorNumElements();
7595 unsigned NumSubElts = VT.getVectorNumElements();
7596 unsigned BaseIdx = Op.getConstantOperandVal(1);
7597 UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
7598 if ((BaseIdx + NumSubElts) != NumSrcElts)
7599 EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
7600 if (BaseIdx != 0)
7601 EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx);
7602 return true;
7603 }
7604 }
7605
7606 // Extract constant bits from shuffle node sources.
7607 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op)) {
7608 // TODO - support shuffle through bitcasts.
7609 if (EltSizeInBits != VT.getScalarSizeInBits())
7610 return false;
7611
7612 ArrayRef<int> Mask = SVN->getMask();
7613 if ((!AllowWholeUndefs || !AllowPartialUndefs) &&
7614 llvm::any_of(Mask, [](int M) { return M < 0; }))
7615 return false;
7616
7617 APInt UndefElts0, UndefElts1;
7618 SmallVector<APInt, 32> EltBits0, EltBits1;
7619 if (isAnyInRange(Mask, 0, NumElts) &&
7620 !getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
7621 UndefElts0, EltBits0, AllowWholeUndefs,
7622 AllowPartialUndefs))
7623 return false;
7624 if (isAnyInRange(Mask, NumElts, 2 * NumElts) &&
7625 !getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
7626 UndefElts1, EltBits1, AllowWholeUndefs,
7627 AllowPartialUndefs))
7628 return false;
7629
7630 UndefElts = APInt::getZero(NumElts);
7631 for (int i = 0; i != (int)NumElts; ++i) {
7632 int M = Mask[i];
7633 if (M < 0) {
7634 UndefElts.setBit(i);
7635 EltBits.push_back(APInt::getZero(EltSizeInBits));
7636 } else if (M < (int)NumElts) {
7637 if (UndefElts0[M])
7638 UndefElts.setBit(i);
7639 EltBits.push_back(EltBits0[M]);
7640 } else {
7641 if (UndefElts1[M - NumElts])
7642 UndefElts.setBit(i);
7643 EltBits.push_back(EltBits1[M - NumElts]);
7644 }
7645 }
7646 return true;
7647 }
7648
7649 return false;
7650}
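The heart of the CastBitData lambda above is just re-slicing one long bit string at a different element width; a tiny standalone illustration (plain 64-bit integers instead of APInt, total width and element widths below 64 bits assumed; names are illustrative):

  #include <cstdint>
  #include <cstdio>
  #include <vector>

  // Pack the source elements into one bit string at their bit offsets, then
  // re-read it at the destination element width.
  std::vector<uint64_t> recastBits(const std::vector<uint64_t> &Src,
                                   unsigned SrcBits, unsigned DstBits) {
    uint64_t All = 0;
    for (size_t i = 0; i != Src.size(); ++i)
      All |= (Src[i] & ((1ULL << SrcBits) - 1)) << (i * SrcBits);
    unsigned NumDst = (unsigned)Src.size() * SrcBits / DstBits;
    std::vector<uint64_t> Dst(NumDst);
    for (unsigned i = 0; i != NumDst; ++i)
      Dst[i] = (All >> (i * DstBits)) & ((1ULL << DstBits) - 1);
    return Dst;
  }

  int main() {
    // Two 16-bit constants {0x1234, 0xABCD} re-read as four 8-bit constants:
    for (uint64_t E : recastBits({0x1234, 0xABCD}, 16, 8))
      std::printf("0x%02llX ", (unsigned long long)E);  // 0x34 0x12 0xCD 0xAB
    std::printf("\n");
    return 0;
  }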
7651
7652namespace llvm {
7653namespace X86 {
7654bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs) {
7655 APInt UndefElts;
7656 SmallVector<APInt, 16> EltBits;
7657 if (getTargetConstantBitsFromNode(Op, Op.getScalarValueSizeInBits(),
7658 UndefElts, EltBits, true,
7659 AllowPartialUndefs)) {
7660 int SplatIndex = -1;
7661 for (int i = 0, e = EltBits.size(); i != e; ++i) {
7662 if (UndefElts[i])
7663 continue;
7664 if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {
7665 SplatIndex = -1;
7666 break;
7667 }
7668 SplatIndex = i;
7669 }
7670 if (0 <= SplatIndex) {
7671 SplatVal = EltBits[SplatIndex];
7672 return true;
7673 }
7674 }
7675
7676 return false;
7677}
7678} // namespace X86
7679} // namespace llvm
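The splat test above skips undef lanes and only requires the remaining lanes to agree; a standalone sketch of that rule (std::optional stands in for undef; purely illustrative):

  #include <cstdio>
  #include <optional>
  #include <vector>

  std::optional<int> getConstantSplat(const std::vector<std::optional<int>> &Elts) {
    int SplatIndex = -1;
    for (int i = 0, e = (int)Elts.size(); i != e; ++i) {
      if (!Elts[i])
        continue;                        // undef lane: doesn't break the splat
      if (0 <= SplatIndex && *Elts[i] != *Elts[SplatIndex])
        return std::nullopt;             // two different defined lanes
      SplatIndex = i;
    }
    if (SplatIndex < 0)
      return std::nullopt;               // all lanes undef: no splat value
    return Elts[SplatIndex];
  }

  int main() {
    std::printf("%d\n", *getConstantSplat({7, std::nullopt, 7, 7}));      // 7
    std::printf("%d\n", (int)getConstantSplat({7, 8, 7, 7}).has_value()); // 0
    return 0;
  }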
7680
7681static bool getTargetShuffleMaskIndices(SDValue MaskNode,
7682 unsigned MaskEltSizeInBits,
7683 SmallVectorImpl<uint64_t> &RawMask,
7684 APInt &UndefElts) {
7685 // Extract the raw target constant bits.
7686 SmallVector<APInt, 64> EltBits;
7687 if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
7688 EltBits, /* AllowWholeUndefs */ true,
7689 /* AllowPartialUndefs */ false))
7690 return false;
7691
7692 // Insert the extracted elements into the mask.
7693 for (const APInt &Elt : EltBits)
7694 RawMask.push_back(Elt.getZExtValue());
7695
7696 return true;
7697}
7698
7699/// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
7700/// A multi-stage pack shuffle mask is created by specifying NumStages > 1.
7701/// Note: This ignores saturation, so inputs must be checked first.
7702static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
7703 bool Unary, unsigned NumStages = 1) {
7704 assert(Mask.empty() && "Expected an empty shuffle mask vector");
7705 unsigned NumElts = VT.getVectorNumElements();
7706 unsigned NumLanes = VT.getSizeInBits() / 128;
7707 unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
7708 unsigned Offset = Unary ? 0 : NumElts;
7709 unsigned Repetitions = 1u << (NumStages - 1);
7710 unsigned Increment = 1u << NumStages;
7711 assert((NumEltsPerLane >> NumStages) > 0 && "Illegal packing compaction");
7712
7713 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
7714 for (unsigned Stage = 0; Stage != Repetitions; ++Stage) {
7715 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
7716 Mask.push_back(Elt + (Lane * NumEltsPerLane));
7717 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
7718 Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
7719 }
7720 }
7721}
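A worked example of the mask built above (same arithmetic restated on plain ints, purely illustrative): a single-stage, binary 128-bit pack of two v8i16 inputs into one v16i8, with the mask indexing the inputs reinterpreted as bytes, yields <0, 2, 4, ..., 14, 16, 18, ..., 30>.

  #include <cstdio>
  #include <vector>

  int main() {
    unsigned NumElts = 16, NumLanes = 1, NumEltsPerLane = 16;
    unsigned NumStages = 1, Offset = NumElts;          // binary pack
    unsigned Repetitions = 1u << (NumStages - 1);
    unsigned Increment = 1u << NumStages;
    std::vector<int> Mask;
    for (unsigned Lane = 0; Lane != NumLanes; ++Lane)
      for (unsigned Stage = 0; Stage != Repetitions; ++Stage) {
        for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
          Mask.push_back(Elt + Lane * NumEltsPerLane);           // LHS low bytes
        for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
          Mask.push_back(Elt + Lane * NumEltsPerLane + Offset);  // RHS low bytes
      }
    for (int M : Mask)
      std::printf("%d ", M);  // 0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30
    std::printf("\n");
    return 0;
  }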
7722
7723// Split the demanded elts of a PACKSS/PACKUS node between its operands.
7724static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
7725 APInt &DemandedLHS, APInt &DemandedRHS) {
7726 int NumLanes = VT.getSizeInBits() / 128;
7727 int NumElts = DemandedElts.getBitWidth();
7728 int NumInnerElts = NumElts / 2;
7729 int NumEltsPerLane = NumElts / NumLanes;
7730 int NumInnerEltsPerLane = NumInnerElts / NumLanes;
7731
7732 DemandedLHS = APInt::getZero(NumInnerElts);
7733 DemandedRHS = APInt::getZero(NumInnerElts);
7734
7735 // Map DemandedElts to the packed operands.
7736 for (int Lane = 0; Lane != NumLanes; ++Lane) {
7737 for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
7738 int OuterIdx = (Lane * NumEltsPerLane) + Elt;
7739 int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
7740 if (DemandedElts[OuterIdx])
7741 DemandedLHS.setBit(InnerIdx);
7742 if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
7743 DemandedRHS.setBit(InnerIdx);
7744 }
7745 }
7746}
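A concrete mapping for the code above (single-lane 128-bit pack, e.g. a v16i8 result from two v8i16 operands; standalone sketch with bitmasks standing in for APInt): demanding output elements 3 and 11 demands element 3 of the LHS and element 3 of the RHS respectively.

  #include <cstdio>

  int main() {
    unsigned NumElts = 16, NumLanes = 1;
    unsigned NumInnerElts = NumElts / 2;
    unsigned NumEltsPerLane = NumElts / NumLanes;
    unsigned NumInnerEltsPerLane = NumInnerElts / NumLanes;
    unsigned DemandedElts = (1u << 3) | (1u << 11);  // output elts 3 and 11
    unsigned DemandedLHS = 0, DemandedRHS = 0;
    for (unsigned Lane = 0; Lane != NumLanes; ++Lane)
      for (unsigned Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
        unsigned OuterIdx = Lane * NumEltsPerLane + Elt;
        unsigned InnerIdx = Lane * NumInnerEltsPerLane + Elt;
        if (DemandedElts & (1u << OuterIdx))
          DemandedLHS |= 1u << InnerIdx;
        if (DemandedElts & (1u << (OuterIdx + NumInnerEltsPerLane)))
          DemandedRHS |= 1u << InnerIdx;
      }
    std::printf("LHS=0x%X RHS=0x%X\n", DemandedLHS, DemandedRHS);  // LHS=0x8 RHS=0x8
    return 0;
  }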
7747
7748// Split the demanded elts of a HADD/HSUB node between its operands.
7749static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts,
7750 APInt &DemandedLHS, APInt &DemandedRHS) {
7751 int NumLanes = VT.getSizeInBits() / 128;
7752 int NumElts = DemandedElts.getBitWidth();
7753 int NumEltsPerLane = NumElts / NumLanes;
7754 int HalfEltsPerLane = NumEltsPerLane / 2;
7755
7756 DemandedLHS = APInt::getZero(NumElts);
7757 DemandedRHS = APInt::getZero(NumElts);
7758
7759 // Map DemandedElts to the horizontal operands.
7760 for (int Idx = 0; Idx != NumElts; ++Idx) {
7761 if (!DemandedElts[Idx])
7762 continue;
7763 int LaneIdx = (Idx / NumEltsPerLane) * NumEltsPerLane;
7764 int LocalIdx = Idx % NumEltsPerLane;
7765 if (LocalIdx < HalfEltsPerLane) {
7766 DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 0);
7767 DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 1);
7768 } else {
7769 LocalIdx -= HalfEltsPerLane;
7770 DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 0);
7771 DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 1);
7772 }
7773 }
7774}
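A concrete mapping for the horizontal case (standalone sketch with bitmasks standing in for APInt): for a 256-bit v8i32 HADD, output element 5 is the sum of LHS elements 6 and 7, so only those two LHS bits become demanded.

  #include <cstdio>

  int main() {
    unsigned NumElts = 8, NumLanes = 2;
    unsigned NumEltsPerLane = NumElts / NumLanes;   // 4
    unsigned HalfEltsPerLane = NumEltsPerLane / 2;  // 2
    unsigned DemandedElts = 1u << 5;                // demand output element 5
    unsigned DemandedLHS = 0, DemandedRHS = 0;
    for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
      if (!(DemandedElts & (1u << Idx)))
        continue;
      unsigned LaneIdx = (Idx / NumEltsPerLane) * NumEltsPerLane;
      unsigned LocalIdx = Idx % NumEltsPerLane;
      if (LocalIdx < HalfEltsPerLane) {
        DemandedLHS |= 3u << (LaneIdx + 2 * LocalIdx);   // the adjacent pair
      } else {
        LocalIdx -= HalfEltsPerLane;
        DemandedRHS |= 3u << (LaneIdx + 2 * LocalIdx);
      }
    }
    std::printf("LHS=0x%X RHS=0x%X\n", DemandedLHS, DemandedRHS);  // LHS=0xC0 RHS=0x0
    return 0;
  }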
7775
7776/// Calculates the shuffle mask corresponding to the target-specific opcode.
7777/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
7778/// operands in \p Ops, and returns true.
7779/// Sets \p IsUnary to true if only one source is used. Note that this will set
7780/// IsUnary for shuffles which use a single input multiple times, and in those
7781/// cases it will adjust the mask to only have indices within that single input.
7782/// It is an error to call this with non-empty Mask/Ops vectors.
7783static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
7784 SmallVectorImpl<SDValue> &Ops,
7785 SmallVectorImpl<int> &Mask, bool &IsUnary) {
7786 unsigned NumElems = VT.getVectorNumElements();
7787 unsigned MaskEltSize = VT.getScalarSizeInBits();
7788 SmallVector<uint64_t, 32> RawMask;
7789 APInt RawUndefs;
7790 uint64_t ImmN;
7791
7792 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
7793 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
7794
7795 IsUnary = false;
7796 bool IsFakeUnary = false;
7797 switch (N->getOpcode()) {
7798 case X86ISD::BLENDI:
7799 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7800 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7801 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7802 DecodeBLENDMask(NumElems, ImmN, Mask);
7803 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7804 break;
7805 case X86ISD::SHUFP:
7806 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7807 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7808 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7809 DecodeSHUFPMask(NumElems, MaskEltSize, ImmN, Mask);
7810 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7811 break;
7812 case X86ISD::INSERTPS:
7813 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7814 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7815 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7816 DecodeINSERTPSMask(ImmN, Mask);
7817 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7818 break;
7819 case X86ISD::EXTRQI:
7820 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7821 if (isa<ConstantSDNode>(N->getOperand(1)) &&
7822 isa<ConstantSDNode>(N->getOperand(2))) {
7823 int BitLen = N->getConstantOperandVal(1);
7824 int BitIdx = N->getConstantOperandVal(2);
7825 DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
7826 IsUnary = true;
7827 }
7828 break;
7829 case X86ISD::INSERTQI:
7830 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7831 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7832 if (isa<ConstantSDNode>(N->getOperand(2)) &&
7833 isa<ConstantSDNode>(N->getOperand(3))) {
7834 int BitLen = N->getConstantOperandVal(2);
7835 int BitIdx = N->getConstantOperandVal(3);
7836 DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
7837 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7838 }
7839 break;
7840 case X86ISD::UNPCKH:
7841 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7842 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7843 DecodeUNPCKHMask(NumElems, MaskEltSize, Mask);
7844 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7845 break;
7846 case X86ISD::UNPCKL:
7847 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7848 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7849 DecodeUNPCKLMask(NumElems, MaskEltSize, Mask);
7850 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7851 break;
7852 case X86ISD::MOVHLPS:
7853 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7854 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7855 DecodeMOVHLPSMask(NumElems, Mask);
7856 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7857 break;
7858 case X86ISD::MOVLHPS:
7859 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7860 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7861 DecodeMOVLHPSMask(NumElems, Mask);
7862 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7863 break;
7864 case X86ISD::VALIGN:
7865 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
7866        "Only 32-bit and 64-bit elements are supported!");
7867 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7868 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7869 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7870 DecodeVALIGNMask(NumElems, ImmN, Mask);
7871 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7872 Ops.push_back(N->getOperand(1));
7873 Ops.push_back(N->getOperand(0));
7874 break;
7875 case X86ISD::PALIGNR:
7876 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7877 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7878 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7879 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7880 DecodePALIGNRMask(NumElems, ImmN, Mask);
7881 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7882 Ops.push_back(N->getOperand(1));
7883 Ops.push_back(N->getOperand(0));
7884 break;
7885 case X86ISD::VSHLDQ:
7886 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7887 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7888 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7889 DecodePSLLDQMask(NumElems, ImmN, Mask);
7890 IsUnary = true;
7891 break;
7892 case X86ISD::VSRLDQ:
7893 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7894 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7895 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7896 DecodePSRLDQMask(NumElems, ImmN, Mask);
7897 IsUnary = true;
7898 break;
7899 case X86ISD::PSHUFD:
7900 case X86ISD::VPERMILPI:
7901 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7902 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7903 DecodePSHUFMask(NumElems, MaskEltSize, ImmN, Mask);
7904 IsUnary = true;
7905 break;
7906 case X86ISD::PSHUFHW:
7907 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7908 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7909 DecodePSHUFHWMask(NumElems, ImmN, Mask);
7910 IsUnary = true;
7911 break;
7912 case X86ISD::PSHUFLW:
7913 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7914 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7915 DecodePSHUFLWMask(NumElems, ImmN, Mask);
7916 IsUnary = true;
7917 break;
7918 case X86ISD::VZEXT_MOVL:
7919 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7920 DecodeZeroMoveLowMask(NumElems, Mask);
7921 IsUnary = true;
7922 break;
7923 case X86ISD::VBROADCAST:
7924 // We only decode broadcasts of same-sized vectors; peeking through to
7925 // extracted subvectors is likely to cause hasOneUse issues with
7926 // SimplifyDemandedBits etc.
7927 if (N->getOperand(0).getValueType() == VT) {
7928 DecodeVectorBroadcast(NumElems, Mask);
7929 IsUnary = true;
7930 break;
7931 }
7932 return false;
7933 case X86ISD::VPERMILPV: {
7934 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7935 IsUnary = true;
7936 SDValue MaskNode = N->getOperand(1);
7937 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
7938 RawUndefs)) {
7939 DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask);
7940 break;
7941 }
7942 return false;
7943 }
7944 case X86ISD::PSHUFB: {
7945 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7946 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7947 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7948 IsUnary = true;
7949 SDValue MaskNode = N->getOperand(1);
7950 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
7951 DecodePSHUFBMask(RawMask, RawUndefs, Mask);
7952 break;
7953 }
7954 return false;
7955 }
7956 case X86ISD::VPERMI:
7957 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7958 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7959 DecodeVPERMMask(NumElems, ImmN, Mask);
7960 IsUnary = true;
7961 break;
7962 case X86ISD::MOVSS:
7963 case X86ISD::MOVSD:
7964 case X86ISD::MOVSH:
7965 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7966 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7967 DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
7968 break;
7969 case X86ISD::VPERM2X128:
7970 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7971 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7972 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7973 DecodeVPERM2X128Mask(NumElems, ImmN, Mask);
7974 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7975 break;
7976 case X86ISD::SHUF128:
7977 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7978 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7979 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7980 decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, ImmN, Mask);
7981 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7982 break;
7983 case X86ISD::MOVSLDUP:
7984 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7985 DecodeMOVSLDUPMask(NumElems, Mask);
7986 IsUnary = true;
7987 break;
7988 case X86ISD::MOVSHDUP:
7989 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7990 DecodeMOVSHDUPMask(NumElems, Mask);
7991 IsUnary = true;
7992 break;
7993 case X86ISD::MOVDDUP:
7994 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7995 DecodeMOVDDUPMask(NumElems, Mask);
7996 IsUnary = true;
7997 break;
7998 case X86ISD::VPERMIL2: {
7999 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
8000 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
8001 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
8002 SDValue MaskNode = N->getOperand(2);
8003 SDValue CtrlNode = N->getOperand(3);
8004 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
8005 unsigned CtrlImm = CtrlOp->getZExtValue();
8006 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
8007 RawUndefs)) {
8008 DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs,
8009 Mask);
8010 break;
8011 }
8012 }
8013 return false;
8014 }
8015 case X86ISD::VPPERM: {
8016 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
8017 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
8018 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
8019 SDValue MaskNode = N->getOperand(2);
8020 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
8021 DecodeVPPERMMask(RawMask, RawUndefs, Mask);
8022 break;
8023 }
8024 return false;
8025 }
8026 case X86ISD::VPERMV: {
8027 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
8028 IsUnary = true;
8029 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
8030 Ops.push_back(N->getOperand(1));
8031 SDValue MaskNode = N->getOperand(0);
8032 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
8033 RawUndefs)) {
8034 DecodeVPERMVMask(RawMask, RawUndefs, Mask);
8035 break;
8036 }
8037 return false;
8038 }
8039 case X86ISD::VPERMV3: {
8040 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
8041 assert(N->getOperand(2).getValueType() == VT && "Unexpected value type");
8042 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
8043 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
8044 Ops.push_back(N->getOperand(0));
8045 Ops.push_back(N->getOperand(2));
8046 SDValue MaskNode = N->getOperand(1);
8047 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
8048 RawUndefs)) {
8049 DecodeVPERMV3Mask(RawMask, RawUndefs, Mask);
8050 break;
8051 }
8052 return false;
8053 }
8054 default: llvm_unreachable("unknown target shuffle node");
8055 }
8056
8057 // Empty mask indicates the decode failed.
8058 if (Mask.empty())
8059 return false;
8060
8061 // Check if we're getting a shuffle mask with zero'd elements.
8062 if (!AllowSentinelZero && isAnyZero(Mask))
8063 return false;
8064
8065 // If we have a fake unary shuffle, the shuffle mask is spread across two
8066 // inputs that are actually the same node. Re-map the mask to always point
8067 // into the first input.
8068 if (IsFakeUnary)
8069 for (int &M : Mask)
8070 if (M >= (int)Mask.size())
8071 M -= Mask.size();
8072
8073 // If we didn't already add operands in the opcode-specific code, default to
8074 // adding 1 or 2 operands starting at 0.
8075 if (Ops.empty()) {
8076 Ops.push_back(N->getOperand(0));
8077 if (!IsUnary || IsFakeUnary)
8078 Ops.push_back(N->getOperand(1));
8079 }
8080
8081 return true;
8082}
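
A minimal standalone sketch of the fake-unary remapping performed at the end of getTargetShuffleMask above, assuming plain std::vector in place of SmallVectorImpl (this sketch is an illustration, not code from this file):

    // Sketch: when both shuffle operands are the same node, indices that point
    // into the "second" input (>= Mask.size()) are folded back into the first.
    #include <cstdio>
    #include <vector>

    static void remapFakeUnaryMask(std::vector<int> &Mask) {
      int Size = static_cast<int>(Mask.size());
      for (int &M : Mask)
        if (M >= Size)
          M -= Size; // element i of operand 1 == element i of operand 0
    }

    int main() {
      // v4 shuffle <0,5,2,7> with identical inputs becomes <0,1,2,3>.
      std::vector<int> Mask = {0, 5, 2, 7};
      remapFakeUnaryMask(Mask);
      for (int M : Mask)
        std::printf("%d ", M);
      std::printf("\n"); // prints: 0 1 2 3
      return 0;
    }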
8083
8084// Wrapper for getTargetShuffleMask that ignores the IsUnary result.
8085static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
8086 SmallVectorImpl<SDValue> &Ops,
8087 SmallVectorImpl<int> &Mask) {
8088 bool IsUnary;
8089 return getTargetShuffleMask(N, VT, AllowSentinelZero, Ops, Mask, IsUnary);
8090}
8091
8092/// Compute whether each element of a shuffle is zeroable.
8093///
8094/// A "zeroable" vector shuffle element is one which can be lowered to zero.
8095/// Either it is an undef element in the shuffle mask, the element of the input
8096/// referenced is undef, or the element of the input referenced is known to be
8097/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
8098/// as many lanes with this technique as possible to simplify the remaining
8099/// shuffle.
8100static void computeZeroableShuffleElements(ArrayRef<int> Mask,
8101 SDValue V1, SDValue V2,
8102 APInt &KnownUndef, APInt &KnownZero) {
8103 int Size = Mask.size();
8104 KnownUndef = KnownZero = APInt::getZero(Size);
8105
8106 V1 = peekThroughBitcasts(V1);
8107 V2 = peekThroughBitcasts(V2);
8108
8109 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
8110 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
8111
8112 int VectorSizeInBits = V1.getValueSizeInBits();
8113 int ScalarSizeInBits = VectorSizeInBits / Size;
8114 assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
8115
8116 for (int i = 0; i < Size; ++i) {
8117 int M = Mask[i];
8118 // Handle the easy cases.
8119 if (M < 0) {
8120 KnownUndef.setBit(i);
8121 continue;
8122 }
8123 if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
8124 KnownZero.setBit(i);
8125 continue;
8126 }
8127
8128 // Determine shuffle input and normalize the mask.
8129 SDValue V = M < Size ? V1 : V2;
8130 M %= Size;
8131
8132 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
8133 if (V.getOpcode() != ISD::BUILD_VECTOR)
8134 continue;
8135
8136 // If the BUILD_VECTOR has fewer elements, then the bitcasted portion of
8137 // the (larger) source element must be UNDEF/ZERO.
8138 if ((Size % V.getNumOperands()) == 0) {
8139 int Scale = Size / V->getNumOperands();
8140 SDValue Op = V.getOperand(M / Scale);
8141 if (Op.isUndef())
8142 KnownUndef.setBit(i);
8143 if (X86::isZeroNode(Op))
8144 KnownZero.setBit(i);
8145 else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
8146 APInt Val = Cst->getAPIntValue();
8147 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
8148 if (Val == 0)
8149 KnownZero.setBit(i);
8150 } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
8151 APInt Val = Cst->getValueAPF().bitcastToAPInt();
8152 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
8153 if (Val == 0)
8154 KnownZero.setBit(i);
8155 }
8156 continue;
8157 }
8158
8159 // If the BUILD_VECTOR has more elements, then all the (smaller) source
8160 // elements must be UNDEF or ZERO.
8161 if ((V.getNumOperands() % Size) == 0) {
8162 int Scale = V->getNumOperands() / Size;
8163 bool AllUndef = true;
8164 bool AllZero = true;
8165 for (int j = 0; j < Scale; ++j) {
8166 SDValue Op = V.getOperand((M * Scale) + j);
8167 AllUndef &= Op.isUndef();
8168 AllZero &= X86::isZeroNode(Op);
8169 }
8170 if (AllUndef)
8171 KnownUndef.setBit(i);
8172 if (AllZero)
8173 KnownZero.setBit(i);
8174 continue;
8175 }
8176 }
8177}
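
A simplified standalone sketch of the zeroable-element idea in computeZeroableShuffleElements above, with the SelectionDAG inputs replaced by plain per-lane integer values (an assumption of this sketch; the real code inspects BUILD_VECTOR operands and handles bitcast scaling):

    #include <cstdio>
    #include <vector>

    struct Zeroable { std::vector<bool> KnownUndef, KnownZero; };

    // Mask entries < 0 are undef; V1/V2 give the value of each input lane.
    static Zeroable computeZeroable(const std::vector<int> &Mask,
                                    const std::vector<int> &V1,
                                    const std::vector<int> &V2) {
      int Size = static_cast<int>(Mask.size());
      Zeroable Z{std::vector<bool>(Size), std::vector<bool>(Size)};
      for (int i = 0; i < Size; ++i) {
        int M = Mask[i];
        if (M < 0) { Z.KnownUndef[i] = true; continue; } // undef mask entry
        int Val = (M < Size) ? V1[M] : V2[M - Size];     // referenced lane
        if (Val == 0)
          Z.KnownZero[i] = true;                         // known-zero lane
      }
      return Z;
    }

    int main() {
      // Shuffle <0,-1,5,2> of V1={1,0,3,4}, V2={0,0,7,8}:
      // lane 1 is undef, lane 2 reads V2[1]==0, lane 3 reads V1[2]==3.
      Zeroable Z = computeZeroable({0, -1, 5, 2}, {1, 0, 3, 4}, {0, 0, 7, 8});
      for (size_t i = 0; i < 4; ++i)
        std::printf("lane %zu: undef=%d zero=%d\n", i, (int)Z.KnownUndef[i],
                    (int)Z.KnownZero[i]);
      return 0;
    }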
8178
8179/// Decode a target shuffle mask and inputs and see if any values are
8180/// known to be undef or zero from their inputs.
8181/// Returns true if the target shuffle mask was decoded.
8182/// FIXME: Merge this with computeZeroableShuffleElements?
8183static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
8184 SmallVectorImpl<SDValue> &Ops,
8185 APInt &KnownUndef, APInt &KnownZero) {
8186 bool IsUnary;
8187 if (!isTargetShuffle(N.getOpcode()))
8188 return false;
8189
8190 MVT VT = N.getSimpleValueType();
8191 if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
8192 return false;
8193
8194 int Size = Mask.size();
8195 SDValue V1 = Ops[0];
8196 SDValue V2 = IsUnary ? V1 : Ops[1];
8197 KnownUndef = KnownZero = APInt::getZero(Size);
8198
8199 V1 = peekThroughBitcasts(V1);
8200 V2 = peekThroughBitcasts(V2);
8201
8202 assert((VT.getSizeInBits() % Size) == 0 &&
8203        "Illegal split of shuffle value type");
8204 unsigned EltSizeInBits = VT.getSizeInBits() / Size;
8205
8206 // Extract known constant input data.
8207 APInt UndefSrcElts[2];
8208 SmallVector<APInt, 32> SrcEltBits[2];
8209 bool IsSrcConstant[2] = {
8210 getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
8211 SrcEltBits[0], true, false),
8212 getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
8213 SrcEltBits[1], true, false)};
8214
8215 for (int i = 0; i < Size; ++i) {
8216 int M = Mask[i];
8217
8218 // Already decoded as SM_SentinelZero / SM_SentinelUndef.
8219 if (M < 0) {
8220 assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!");
8221 if (SM_SentinelUndef == M)
8222 KnownUndef.setBit(i);
8223 if (SM_SentinelZero == M)
8224 KnownZero.setBit(i);
8225 continue;
8226 }
8227
8228 // Determine shuffle input and normalize the mask.
8229 unsigned SrcIdx = M / Size;
8230 SDValue V = M < Size ? V1 : V2;
8231 M %= Size;
8232
8233 // We are referencing an UNDEF input.
8234 if (V.isUndef()) {
8235 KnownUndef.setBit(i);
8236 continue;
8237 }
8238
8239 // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
8240 // TODO: We currently only set UNDEF for integer types - floats use the same
8241 // registers as vectors and many of the scalar folded loads rely on the
8242 // SCALAR_TO_VECTOR pattern.
8243 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
8244 (Size % V.getValueType().getVectorNumElements()) == 0) {
8245 int Scale = Size / V.getValueType().getVectorNumElements();
8246 int Idx = M / Scale;
8247 if (Idx != 0 && !VT.isFloatingPoint())
8248 KnownUndef.setBit(i);
8249 else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
8250 KnownZero.setBit(i);
8251 continue;
8252 }
8253
8254 // INSERT_SUBVECTOR - to widen vectors we often insert them into UNDEF
8255 // base vectors.
8256 if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
8257 SDValue Vec = V.getOperand(0);
8258 int NumVecElts = Vec.getValueType().getVectorNumElements();
8259 if (Vec.isUndef() && Size == NumVecElts) {
8260 int Idx = V.getConstantOperandVal(2);
8261 int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements();
8262 if (M < Idx || (Idx + NumSubElts) <= M)
8263 KnownUndef.setBit(i);
8264 }
8265 continue;
8266 }
8267
8268 // Attempt to extract from the source's constant bits.
8269 if (IsSrcConstant[SrcIdx]) {
8270 if (UndefSrcElts[SrcIdx][M])
8271 KnownUndef.setBit(i);
8272 else if (SrcEltBits[SrcIdx][M] == 0)
8273 KnownZero.setBit(i);
8274 }
8275 }
8276
8277 assert(VT.getVectorNumElements() == (unsigned)Size &&
8278        "Different mask size from vector size!");
8279 return true;
8280}
8281
8282// Replace target shuffle mask elements with known undef/zero sentinels.
8283static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
8284 const APInt &KnownUndef,
8285 const APInt &KnownZero,
8286 bool ResolveKnownZeros= true) {
8287 unsigned NumElts = Mask.size();
8288 assert(KnownUndef.getBitWidth() == NumElts &&
8289        KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch");
8290
8291 for (unsigned i = 0; i != NumElts; ++i) {
8292 if (KnownUndef[i])
8293 Mask[i] = SM_SentinelUndef;
8294 else if (ResolveKnownZeros && KnownZero[i])
8295 Mask[i] = SM_SentinelZero;
8296 }
8297}
8298
8299// Extract target shuffle mask sentinel elements to known undef/zero bitmasks.
8300static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask,
8301 APInt &KnownUndef,
8302 APInt &KnownZero) {
8303 unsigned NumElts = Mask.size();
8304 KnownUndef = KnownZero = APInt::getZero(NumElts);
8305
8306 for (unsigned i = 0; i != NumElts; ++i) {
8307 int M = Mask[i];
8308 if (SM_SentinelUndef == M)
8309 KnownUndef.setBit(i);
8310 if (SM_SentinelZero == M)
8311 KnownZero.setBit(i);
8312 }
8313}
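
A standalone sketch showing that the two helpers above carry the same information in two forms, using -1/-2 as stand-ins for SM_SentinelUndef/SM_SentinelZero and std::bitset in place of APInt (assumptions of this sketch):

    #include <bitset>
    #include <cstdio>
    #include <vector>

    constexpr int SentinelUndef = -1, SentinelZero = -2;

    int main() {
      std::vector<int> Mask = {3, SentinelUndef, SentinelZero, 0};

      // Mask sentinels -> bitmasks (resolveZeroablesFromTargetShuffle direction).
      std::bitset<4> KnownUndef, KnownZero;
      for (unsigned i = 0; i != Mask.size(); ++i) {
        if (Mask[i] == SentinelUndef) KnownUndef.set(i);
        if (Mask[i] == SentinelZero)  KnownZero.set(i);
      }

      // Bitmasks -> mask sentinels (resolveTargetShuffleFromZeroables direction).
      std::vector<int> Rebuilt = {3, 1, 2, 0};
      for (unsigned i = 0; i != Rebuilt.size(); ++i) {
        if (KnownUndef[i]) Rebuilt[i] = SentinelUndef;
        else if (KnownZero[i]) Rebuilt[i] = SentinelZero;
      }

      std::printf("undef=%s zero=%s rebuilt={%d,%d,%d,%d}\n",
                  KnownUndef.to_string().c_str(), KnownZero.to_string().c_str(),
                  Rebuilt[0], Rebuilt[1], Rebuilt[2], Rebuilt[3]);
      return 0;
    }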
8314
8315// Attempt to create a shuffle mask from a VSELECT/BLENDV condition mask.
8316static bool createShuffleMaskFromVSELECT(SmallVectorImpl<int> &Mask,
8317 SDValue Cond, bool IsBLENDV = false) {
8318 EVT CondVT = Cond.getValueType();
8319 unsigned EltSizeInBits = CondVT.getScalarSizeInBits();
8320 unsigned NumElts = CondVT.getVectorNumElements();
8321
8322 APInt UndefElts;
8323 SmallVector<APInt, 32> EltBits;
8324 if (!getTargetConstantBitsFromNode(Cond, EltSizeInBits, UndefElts, EltBits,
8325 true, false))
8326 return false;
8327
8328 Mask.resize(NumElts, SM_SentinelUndef);
8329
8330 for (int i = 0; i != (int)NumElts; ++i) {
8331 Mask[i] = i;
8332 // Arbitrarily choose from the 2nd operand if the select condition element
8333 // is undef.
8334 // TODO: Can we do better by matching patterns such as even/odd?
8335 if (UndefElts[i] || (!IsBLENDV && EltBits[i].isZero()) ||
8336 (IsBLENDV && EltBits[i].isNonNegative()))
8337 Mask[i] += NumElts;
8338 }
8339
8340 return true;
8341}
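
A standalone sketch of the VSELECT-condition-to-blend-mask conversion above, with the constant condition modelled as optional integers (nullopt standing in for an undef lane; this mirrors the non-BLENDV path and is an illustration only):

    #include <cstdio>
    #include <optional>
    #include <vector>

    // Condition lanes: nonzero = take operand 0, zero = take operand 1.
    static std::vector<int>
    blendMaskFromCondition(const std::vector<std::optional<int>> &Cond) {
      unsigned NumElts = Cond.size();
      std::vector<int> Mask(NumElts);
      for (unsigned i = 0; i != NumElts; ++i) {
        Mask[i] = (int)i;
        // Undef condition lanes arbitrarily pick the second operand, mirroring
        // the code above.
        if (!Cond[i] || *Cond[i] == 0)
          Mask[i] += NumElts;
      }
      return Mask;
    }

    int main() {
      // cond = <1, 0, undef, 1> over 4 elements -> blend mask <0, 5, 6, 3>.
      std::vector<std::optional<int>> Cond = {1, 0, std::nullopt, 1};
      for (int M : blendMaskFromCondition(Cond))
        std::printf("%d ", M);
      std::printf("\n");
      return 0;
    }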
8342
8343// Forward declaration (for getFauxShuffleMask recursive check).
8344static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
8345 SmallVectorImpl<SDValue> &Inputs,
8346 SmallVectorImpl<int> &Mask,
8347 const SelectionDAG &DAG, unsigned Depth,
8348 bool ResolveKnownElts);
8349
8350// Attempt to decode ops that could be represented as a shuffle mask.
8351 // The decoded shuffle mask may contain a different number of elements than
8352 // the destination value type.
8353// TODO: Merge into getTargetShuffleInputs()
8354static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
8355 SmallVectorImpl<int> &Mask,
8356 SmallVectorImpl<SDValue> &Ops,
8357 const SelectionDAG &DAG, unsigned Depth,
8358 bool ResolveKnownElts) {
8359 Mask.clear();
8360 Ops.clear();
8361
8362 MVT VT = N.getSimpleValueType();
8363 unsigned NumElts = VT.getVectorNumElements();
8364 unsigned NumSizeInBits = VT.getSizeInBits();
8365 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
8366 if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
8367 return false;
8368 assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size");
8369 unsigned NumSizeInBytes = NumSizeInBits / 8;
8370 unsigned NumBytesPerElt = NumBitsPerElt / 8;
8371
8372 unsigned Opcode = N.getOpcode();
8373 switch (Opcode) {
8374 case ISD::VECTOR_SHUFFLE: {
8375 // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
8376 ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
8377 if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
8378 Mask.append(ShuffleMask.begin(), ShuffleMask.end());
8379 Ops.push_back(N.getOperand(0));
8380 Ops.push_back(N.getOperand(1));
8381 return true;
8382 }
8383 return false;
8384 }
8385 case ISD::AND:
8386 case X86ISD::ANDNP: {
8387 // Attempt to decode as a per-byte mask.
8388 APInt UndefElts;
8389 SmallVector<APInt, 32> EltBits;
8390 SDValue N0 = N.getOperand(0);
8391 SDValue N1 = N.getOperand(1);
8392 bool IsAndN = (X86ISD::ANDNP == Opcode);
8393 uint64_t ZeroMask = IsAndN ? 255 : 0;
8394 if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
8395 return false;
8396 // We can't assume an undef src element gives an undef dst - the other src
8397 // might be zero.
8398 if (!UndefElts.isZero())
8399 return false;
8400 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
8401 const APInt &ByteBits = EltBits[i];
8402 if (ByteBits != 0 && ByteBits != 255)
8403 return false;
8404 Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
8405 }
8406 Ops.push_back(IsAndN ? N1 : N0);
8407 return true;
8408 }
8409 case ISD::OR: {
8410 // Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
8411 // is a valid shuffle index.
8412 SDValue N0 = peekThroughBitcasts(N.getOperand(0));
8413 SDValue N1 = peekThroughBitcasts(N.getOperand(1));
8414 if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
8415 return false;
8416
8417 SmallVector<int, 64> SrcMask0, SrcMask1;
8418 SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
8419 APInt Demand0 = APInt::getAllOnes(N0.getValueType().getVectorNumElements());
8420 APInt Demand1 = APInt::getAllOnes(N1.getValueType().getVectorNumElements());
8421 if (!getTargetShuffleInputs(N0, Demand0, SrcInputs0, SrcMask0, DAG,
8422 Depth + 1, true) ||
8423 !getTargetShuffleInputs(N1, Demand1, SrcInputs1, SrcMask1, DAG,
8424 Depth + 1, true))
8425 return false;
8426
8427 size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
8428 SmallVector<int, 64> Mask0, Mask1;
8429 narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
8430 narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
8431 for (int i = 0; i != (int)MaskSize; ++i) {
8432 // NOTE: Don't handle SM_SentinelUndef, as we can end up in infinite
8433 // loops converting between OR and BLEND shuffles due to
8434 // canWidenShuffleElements merging away undef elements, meaning we
8435 // fail to recognise the OR as the undef element isn't known zero.
8436 if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
8437 Mask.push_back(SM_SentinelZero);
8438 else if (Mask1[i] == SM_SentinelZero)
8439 Mask.push_back(i);
8440 else if (Mask0[i] == SM_SentinelZero)
8441 Mask.push_back(i + MaskSize);
8442 else
8443 return false;
8444 }
8445 Ops.push_back(N0);
8446 Ops.push_back(N1);
8447 return true;
8448 }
8449 case ISD::INSERT_SUBVECTOR: {
8450 SDValue Src = N.getOperand(0);
8451 SDValue Sub = N.getOperand(1);
8452 EVT SubVT = Sub.getValueType();
8453 unsigned NumSubElts = SubVT.getVectorNumElements();
8454 if (!N->isOnlyUserOf(Sub.getNode()))
8455 return false;
8456 uint64_t InsertIdx = N.getConstantOperandVal(2);
8457 // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
8458 if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
8459 Sub.getOperand(0).getValueType() == VT) {
8460 uint64_t ExtractIdx = Sub.getConstantOperandVal(1);
8461 for (int i = 0; i != (int)NumElts; ++i)
8462 Mask.push_back(i);
8463 for (int i = 0; i != (int)NumSubElts; ++i)
8464 Mask[InsertIdx + i] = NumElts + ExtractIdx + i;
8465 Ops.push_back(Src);
8466 Ops.push_back(Sub.getOperand(0));
8467 return true;
8468 }
8469 // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
8470 SmallVector<int, 64> SubMask;
8471 SmallVector<SDValue, 2> SubInputs;
8472 SDValue SubSrc = peekThroughOneUseBitcasts(Sub);
8473 EVT SubSrcVT = SubSrc.getValueType();
8474 if (!SubSrcVT.isVector())
8475 return false;
8476
8477 APInt SubDemand = APInt::getAllOnes(SubSrcVT.getVectorNumElements());
8478 if (!getTargetShuffleInputs(SubSrc, SubDemand, SubInputs, SubMask, DAG,
8479 Depth + 1, ResolveKnownElts))
8480 return false;
8481
8482 // Subvector shuffle inputs must not be larger than the subvector.
8483 if (llvm::any_of(SubInputs, [SubVT](SDValue SubInput) {
8484 return SubVT.getFixedSizeInBits() <
8485 SubInput.getValueSizeInBits().getFixedValue();
8486 }))
8487 return false;
8488
8489 if (SubMask.size() != NumSubElts) {
8490 assert(((SubMask.size() % NumSubElts) == 0 ||
8491         (NumSubElts % SubMask.size()) == 0) && "Illegal submask scale");
8492 if ((NumSubElts % SubMask.size()) == 0) {
8493 int Scale = NumSubElts / SubMask.size();
8494 SmallVector<int,64> ScaledSubMask;
8495 narrowShuffleMaskElts(Scale, SubMask, ScaledSubMask);
8496 SubMask = ScaledSubMask;
8497 } else {
8498 int Scale = SubMask.size() / NumSubElts;
8499 NumSubElts = SubMask.size();
8500 NumElts *= Scale;
8501 InsertIdx *= Scale;
8502 }
8503 }
8504 Ops.push_back(Src);
8505 Ops.append(SubInputs.begin(), SubInputs.end());
8506 if (ISD::isBuildVectorAllZeros(Src.getNode()))
8507 Mask.append(NumElts, SM_SentinelZero);
8508 else
8509 for (int i = 0; i != (int)NumElts; ++i)
8510 Mask.push_back(i);
8511 for (int i = 0; i != (int)NumSubElts; ++i) {
8512 int M = SubMask[i];
8513 if (0 <= M) {
8514 int InputIdx = M / NumSubElts;
8515 M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
8516 }
8517 Mask[i + InsertIdx] = M;
8518 }
8519 return true;
8520 }
8521 case X86ISD::PINSRB:
8522 case X86ISD::PINSRW:
8523 case ISD::SCALAR_TO_VECTOR:
8524 case ISD::INSERT_VECTOR_ELT: {
8525 // Match against an insert_vector_elt/scalar_to_vector of an extract from a
8526 // vector, for matching src/dst vector types.
8527 SDValue Scl = N.getOperand(Opcode == ISD::SCALAR_TO_VECTOR ? 0 : 1);
8528
8529 unsigned DstIdx = 0;
8530 if (Opcode != ISD::SCALAR_TO_VECTOR) {
8531 // Check we have an in-range constant insertion index.
8532 if (!isa<ConstantSDNode>(N.getOperand(2)) ||
8533 N.getConstantOperandAPInt(2).uge(NumElts))
8534 return false;
8535 DstIdx = N.getConstantOperandVal(2);
8536
8537 // Attempt to recognise an INSERT*(VEC, 0, DstIdx) shuffle pattern.
8538 if (X86::isZeroNode(Scl)) {
8539 Ops.push_back(N.getOperand(0));
8540 for (unsigned i = 0; i != NumElts; ++i)
8541 Mask.push_back(i == DstIdx ? SM_SentinelZero : (int)i);
8542 return true;
8543 }
8544 }
8545
8546 // Peek through trunc/aext/zext.
8547 // TODO: aext shouldn't require SM_SentinelZero padding.
8548 // TODO: handle shift of scalars.
8549 unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();
8550 while (Scl.getOpcode() == ISD::TRUNCATE ||
8551 Scl.getOpcode() == ISD::ANY_EXTEND ||
8552 Scl.getOpcode() == ISD::ZERO_EXTEND) {
8553 Scl = Scl.getOperand(0);
8554 MinBitsPerElt =
8555 std::min<unsigned>(MinBitsPerElt, Scl.getScalarValueSizeInBits());
8556 }
8557 if ((MinBitsPerElt % 8) != 0)
8558 return false;
8559
8560 // Attempt to find the source vector the scalar was extracted from.
8561 SDValue SrcExtract;
8562 if ((Scl.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
8563 Scl.getOpcode() == X86ISD::PEXTRW ||
8564 Scl.getOpcode() == X86ISD::PEXTRB) &&
8565 Scl.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
8566 SrcExtract = Scl;
8567 }
8568 if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
8569 return false;
8570
8571 SDValue SrcVec = SrcExtract.getOperand(0);
8572 EVT SrcVT = SrcVec.getValueType();
8573 if (!SrcVT.getScalarType().isByteSized())
8574 return false;
8575 unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
8576 unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8);
8577 unsigned DstByte = DstIdx * NumBytesPerElt;
8578 MinBitsPerElt =
8579 std::min<unsigned>(MinBitsPerElt, SrcVT.getScalarSizeInBits());
8580
8581 // Create 'identity' byte level shuffle mask and then add inserted bytes.
8582 if (Opcode == ISD::SCALAR_TO_VECTOR) {
8583 Ops.push_back(SrcVec);
8584 Mask.append(NumSizeInBytes, SM_SentinelUndef);
8585 } else {
8586 Ops.push_back(SrcVec);
8587 Ops.push_back(N.getOperand(0));
8588 for (int i = 0; i != (int)NumSizeInBytes; ++i)
8589 Mask.push_back(NumSizeInBytes + i);
8590 }
8591
8592 unsigned MinBytesPerElts = MinBitsPerElt / 8;
8593 MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
8594 for (unsigned i = 0; i != MinBytesPerElts; ++i)
8595 Mask[DstByte + i] = SrcByte + i;
8596 for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
8597 Mask[DstByte + i] = SM_SentinelZero;
8598 return true;
8599 }
8600 case X86ISD::PACKSS:
8601 case X86ISD::PACKUS: {
8602 SDValue N0 = N.getOperand(0);
8603 SDValue N1 = N.getOperand(1);
8604 assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
8605        N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
8606        "Unexpected input value type");
8607
8608 APInt EltsLHS, EltsRHS;
8609 getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);
8610
8611 // If we know input saturation won't happen (or we don't care for particular
8612 // lanes), we can treat this as a truncation shuffle.
8613 bool Offset0 = false, Offset1 = false;
8614 if (Opcode == X86ISD::PACKSS) {
8615 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
8616 DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
8617 (!(N1.isUndef() || EltsRHS.isZero()) &&
8618 DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
8619 return false;
8620 // We can't easily fold ASHR into a shuffle, but if it was feeding a
8621 // PACKSS then it was likely being used for sign-extension for a
8622 // truncation, so just peek through and adjust the mask accordingly.
8623 if (N0.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N0.getNode()) &&
8624 N0.getConstantOperandAPInt(1) == NumBitsPerElt) {
8625 Offset0 = true;
8626 N0 = N0.getOperand(0);
8627 }
8628 if (N1.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N1.getNode()) &&
8629 N1.getConstantOperandAPInt(1) == NumBitsPerElt) {
8630 Offset1 = true;
8631 N1 = N1.getOperand(0);
8632 }
8633 } else {
8634 APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
8635 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
8636 !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) ||
8637 (!(N1.isUndef() || EltsRHS.isZero()) &&
8638 !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))
8639 return false;
8640 }
8641
8642 bool IsUnary = (N0 == N1);
8643
8644 Ops.push_back(N0);
8645 if (!IsUnary)
8646 Ops.push_back(N1);
8647
8648 createPackShuffleMask(VT, Mask, IsUnary);
8649
8650 if (Offset0 || Offset1) {
8651 for (int &M : Mask)
8652 if ((Offset0 && isInRange(M, 0, NumElts)) ||
8653 (Offset1 && isInRange(M, NumElts, 2 * NumElts)))
8654 ++M;
8655 }
8656 return true;
8657 }
8658 case ISD::VSELECT:
8659 case X86ISD::BLENDV: {
8660 SDValue Cond = N.getOperand(0);
8661 if (createShuffleMaskFromVSELECT(Mask, Cond, Opcode == X86ISD::BLENDV)) {
8662 Ops.push_back(N.getOperand(1));
8663 Ops.push_back(N.getOperand(2));
8664 return true;
8665 }
8666 return false;
8667 }
8668 case X86ISD::VTRUNC: {
8669 SDValue Src = N.getOperand(0);
8670 EVT SrcVT = Src.getValueType();
8671 // Truncated source must be a simple vector.
8672 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
8673 (SrcVT.getScalarSizeInBits() % 8) != 0)
8674 return false;
8675 unsigned NumSrcElts = SrcVT.getVectorNumElements();
8676 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
8677 unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt;
8678 assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation");
8679 for (unsigned i = 0; i != NumSrcElts; ++i)
8680 Mask.push_back(i * Scale);
8681 Mask.append(NumElts - NumSrcElts, SM_SentinelZero);
8682 Ops.push_back(Src);
8683 return true;
8684 }
8685 case X86ISD::VSHLI:
8686 case X86ISD::VSRLI: {
8687 uint64_t ShiftVal = N.getConstantOperandVal(1);
8688 // Out of range bit shifts are guaranteed to be zero.
8689 if (NumBitsPerElt <= ShiftVal) {
8690 Mask.append(NumElts, SM_SentinelZero);
8691 return true;
8692 }
8693
8694 // We can only decode 'whole byte' bit shifts as shuffles.
8695 if ((ShiftVal % 8) != 0)
8696 break;
8697
8698 uint64_t ByteShift = ShiftVal / 8;
8699 Ops.push_back(N.getOperand(0));
8700
8701 // Clear mask to all zeros and insert the shifted byte indices.
8702 Mask.append(NumSizeInBytes, SM_SentinelZero);
8703
8704 if (X86ISD::VSHLI == Opcode) {
8705 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
8706 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
8707 Mask[i + j] = i + j - ByteShift;
8708 } else {
8709 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
8710 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
8711 Mask[i + j - ByteShift] = i + j;
8712 }
8713 return true;
8714 }
8715 case X86ISD::VROTLI:
8716 case X86ISD::VROTRI: {
8717 // We can only decode 'whole byte' bit rotates as shuffles.
8718 uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt);
8719 if ((RotateVal % 8) != 0)
8720 return false;
8721 Ops.push_back(N.getOperand(0));
8722 int Offset = RotateVal / 8;
8723 Offset = (X86ISD::VROTLI == Opcode ? NumBytesPerElt - Offset : Offset);
8724 for (int i = 0; i != (int)NumElts; ++i) {
8725 int BaseIdx = i * NumBytesPerElt;
8726 for (int j = 0; j != (int)NumBytesPerElt; ++j) {
8727 Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt));
8728 }
8729 }
8730 return true;
8731 }
8732 case X86ISD::VBROADCAST: {
8733 SDValue Src = N.getOperand(0);
8734 if (!Src.getSimpleValueType().isVector()) {
8735 if (Src.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8736 !isNullConstant(Src.getOperand(1)) ||
8737 Src.getOperand(0).getValueType().getScalarType() !=
8738 VT.getScalarType())
8739 return false;
8740 Src = Src.getOperand(0);
8741 }
8742 Ops.push_back(Src);
8743 Mask.append(NumElts, 0);
8744 return true;
8745 }
8746 case ISD::ZERO_EXTEND:
8747 case ISD::ANY_EXTEND:
8748 case ISD::ZERO_EXTEND_VECTOR_INREG:
8749 case ISD::ANY_EXTEND_VECTOR_INREG: {
8750 SDValue Src = N.getOperand(0);
8751 EVT SrcVT = Src.getValueType();
8752
8753 // Extended source must be a simple vector.
8754 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
8755 (SrcVT.getScalarSizeInBits() % 8) != 0)
8756 return false;
8757
8758 bool IsAnyExtend =
8759 (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
8760 DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts,
8761 IsAnyExtend, Mask);
8762 Ops.push_back(Src);
8763 return true;
8764 }
8765 }
8766
8767 return false;
8768}
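
A standalone sketch of just the whole-byte shift decoding from the X86ISD::VSHLI/VSRLI case above, using -2 as a stand-in for SM_SentinelZero (an illustration under those assumptions, not the file's own code):

    // A per-element left shift by a whole number of bytes is the same as a
    // byte shuffle where the low ByteShift bytes of each element become zero.
    #include <cstdio>
    #include <vector>

    constexpr int SentinelZero = -2;

    static std::vector<int> decodeByteShl(unsigned NumBytesPerElt,
                                          unsigned NumElts, unsigned ByteShift) {
      std::vector<int> Mask(NumBytesPerElt * NumElts, SentinelZero);
      for (unsigned i = 0; i != NumBytesPerElt * NumElts; i += NumBytesPerElt)
        for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
          Mask[i + j] = (int)(i + j - ByteShift); // shifted-in byte index
      return Mask;
    }

    int main() {
      // v2i32 shifted left by 8 bits (1 byte): each 4-byte lane keeps bytes
      // 0..2 moved up by one, and byte 0 of each lane is known zero.
      for (int M : decodeByteShl(/*NumBytesPerElt=*/4, /*NumElts=*/2,
                                 /*ByteShift=*/1))
        std::printf("%d ", M);
      std::printf("\n"); // prints: -2 0 1 2 -2 4 5 6
      return 0;
    }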
8769
8770/// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
8771static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
8772 SmallVectorImpl<int> &Mask) {
8773 int MaskWidth = Mask.size();
8774 SmallVector<SDValue, 16> UsedInputs;
8775 for (int i = 0, e = Inputs.size(); i < e; ++i) {
8776 int lo = UsedInputs.size() * MaskWidth;
8777 int hi = lo + MaskWidth;
8778
8779 // Strip UNDEF input usage.
8780 if (Inputs[i].isUndef())
8781 for (int &M : Mask)
8782 if ((lo <= M) && (M < hi))
8783 M = SM_SentinelUndef;
8784
8785 // Check for unused inputs.
8786 if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
8787 for (int &M : Mask)
8788 if (lo <= M)
8789 M -= MaskWidth;
8790 continue;
8791 }
8792
8793 // Check for repeated inputs.
8794 bool IsRepeat = false;
8795 for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) {
8796 if (UsedInputs[j] != Inputs[i])
8797 continue;
8798 for (int &M : Mask)
8799 if (lo <= M)
8800 M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
8801 IsRepeat = true;
8802 break;
8803 }
8804 if (IsRepeat)
8805 continue;
8806
8807 UsedInputs.push_back(Inputs[i]);
8808 }
8809 Inputs = UsedInputs;
8810}
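
A standalone sketch of the unused/repeated-input resolution above, with inputs modelled as strings rather than SDValues and the undef-input stripping omitted for brevity (assumptions of this sketch):

    #include <algorithm>
    #include <cstdio>
    #include <string>
    #include <vector>

    static void resolveInputs(std::vector<std::string> &Inputs,
                              std::vector<int> &Mask) {
      int MaskWidth = (int)Mask.size();
      std::vector<std::string> Used;
      for (const std::string &In : Inputs) {
        int lo = (int)Used.size() * MaskWidth, hi = lo + MaskWidth;
        // Drop inputs no mask element refers to, shifting later indices down.
        if (std::none_of(Mask.begin(), Mask.end(),
                         [lo, hi](int M) { return lo <= M && M < hi; })) {
          for (int &M : Mask)
            if (M >= lo)
              M -= MaskWidth;
          continue;
        }
        // Merge a repeated input into its first occurrence.
        auto It = std::find(Used.begin(), Used.end(), In);
        if (It != Used.end()) {
          int j = (int)(It - Used.begin());
          for (int &M : Mask)
            if (M >= lo)
              M = (M < hi) ? (M - lo) + j * MaskWidth : M - MaskWidth;
          continue;
        }
        Used.push_back(In);
      }
      Inputs = Used;
    }

    int main() {
      // Inputs A, B, A with a v2 mask <0,4>: B is unused and dropped, the
      // repeated A is merged, leaving a single input with mask <0,0>.
      std::vector<std::string> Inputs = {"A", "B", "A"};
      std::vector<int> Mask = {0, 4};
      resolveInputs(Inputs, Mask);
      std::printf("inputs=%zu mask={%d,%d}\n", Inputs.size(), Mask[0], Mask[1]);
      return 0; // prints: inputs=1 mask={0,0}
    }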
8811
8812/// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's inputs
8813/// and then sets the SM_SentinelUndef and SM_SentinelZero values.
8814/// Returns true if the target shuffle mask was decoded.
8815static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
8816 SmallVectorImpl<SDValue> &Inputs,
8817 SmallVectorImpl<int> &Mask,
8818 APInt &KnownUndef, APInt &KnownZero,
8819 const SelectionDAG &DAG, unsigned Depth,
8820 bool ResolveKnownElts) {
8821 if (Depth >= SelectionDAG::MaxRecursionDepth)
8822 return false; // Limit search depth.
8823
8824 EVT VT = Op.getValueType();
8825 if (!VT.isSimple() || !VT.isVector())
8826 return false;
8827
8828 if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) {
8829 if (ResolveKnownElts)
8830 resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero);
8831 return true;
8832 }
8833 if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth,
8834 ResolveKnownElts)) {
8835 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
8836 return true;
8837 }
8838 return false;
8839}
8840
8841static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
8842 SmallVectorImpl<SDValue> &Inputs,
8843 SmallVectorImpl<int> &Mask,
8844 const SelectionDAG &DAG, unsigned Depth,
8845 bool ResolveKnownElts) {
8846 APInt KnownUndef, KnownZero;
8847 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef,
8848 KnownZero, DAG, Depth, ResolveKnownElts);
8849}
8850
8851static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
8852 SmallVectorImpl<int> &Mask,
8853 const SelectionDAG &DAG, unsigned Depth = 0,
8854 bool ResolveKnownElts = true) {
8855 EVT VT = Op.getValueType();
8856 if (!VT.isSimple() || !VT.isVector())
8857 return false;
8858
8859 unsigned NumElts = Op.getValueType().getVectorNumElements();
8860 APInt DemandedElts = APInt::getAllOnes(NumElts);
8861 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, DAG, Depth,
8862 ResolveKnownElts);
8863}
8864
8865// Attempt to create a scalar/subvector broadcast from the base MemSDNode.
8866static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT,
8867 EVT MemVT, MemSDNode *Mem, unsigned Offset,
8868 SelectionDAG &DAG) {
8869 assert((Opcode == X86ISD::VBROADCAST_LOAD ||
8870         Opcode == X86ISD::SUBV_BROADCAST_LOAD) &&
8871        "Unknown broadcast load type");
8872
8873 // Ensure this is a simple (non-atomic, non-volatile), temporal read memop.
8874 if (!Mem || !Mem->readMem() || !Mem->isSimple() || Mem->isNonTemporal())
8875 return SDValue();
8876
8877 SDValue Ptr =
8878 DAG.getMemBasePlusOffset(Mem->getBasePtr(), TypeSize::Fixed(Offset), DL);
8879 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
8880 SDValue Ops[] = {Mem->getChain(), Ptr};
8881 SDValue BcstLd = DAG.getMemIntrinsicNode(
8882 Opcode, DL, Tys, Ops, MemVT,
8883 DAG.getMachineFunction().getMachineMemOperand(
8884 Mem->getMemOperand(), Offset, MemVT.getStoreSize()));
8885 DAG.makeEquivalentMemoryOrdering(SDValue(Mem, 1), BcstLd.getValue(1));
8886 return BcstLd;
8887}
8888
8889/// Returns the scalar element that will make up the i'th
8890/// element of the result of the vector shuffle.
8891static SDValue getShuffleScalarElt(SDValue Op, unsigned Index,
8892 SelectionDAG &DAG, unsigned Depth) {
8893 if (Depth >= SelectionDAG::MaxRecursionDepth)
8894 return SDValue(); // Limit search depth.
8895
8896 EVT VT = Op.getValueType();
8897 unsigned Opcode = Op.getOpcode();
8898 unsigned NumElems = VT.getVectorNumElements();
8899
8900 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
8901 if (auto *SV = dyn_cast<ShuffleVectorSDNode>(Op)) {
8902 int Elt = SV->getMaskElt(Index);
8903
8904 if (Elt < 0)
8905 return DAG.getUNDEF(VT.getVectorElementType());
8906
8907 SDValue Src = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1);
8908 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
8909 }
8910
8911 // Recurse into target specific vector shuffles to find scalars.
8912 if (isTargetShuffle(Opcode)) {
8913 MVT ShufVT = VT.getSimpleVT();
8914 MVT ShufSVT = ShufVT.getVectorElementType();
8915 int NumElems = (int)ShufVT.getVectorNumElements();
8916 SmallVector<int, 16> ShuffleMask;
8917 SmallVector<SDValue, 16> ShuffleOps;
8918 if (!getTargetShuffleMask(Op.getNode(), ShufVT, true, ShuffleOps,
8919 ShuffleMask))
8920 return SDValue();
8921
8922 int Elt = ShuffleMask[Index];
8923 if (Elt == SM_SentinelZero)
8924 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(Op), ShufSVT)
8925 : DAG.getConstantFP(+0.0, SDLoc(Op), ShufSVT);
8926 if (Elt == SM_SentinelUndef)
8927 return DAG.getUNDEF(ShufSVT);
8928
8929 assert(0 <= Elt && Elt < (2 * NumElems) && "Shuffle index out of range");
8930 SDValue Src = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
8931 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
8932 }
8933
8934 // Recurse into insert_subvector base/sub vector to find scalars.
8935 if (Opcode == ISD::INSERT_SUBVECTOR) {
8936 SDValue Vec = Op.getOperand(0);
8937 SDValue Sub = Op.getOperand(1);
8938 uint64_t SubIdx = Op.getConstantOperandVal(2);
8939 unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
8940
8941 if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
8942 return getShuffleScalarElt(Sub, Index - SubIdx, DAG, Depth + 1);
8943 return getShuffleScalarElt(Vec, Index, DAG, Depth + 1);
8944 }
8945
8946 // Recurse into concat_vectors sub vector to find scalars.
8947 if (Opcode == ISD::CONCAT_VECTORS) {
8948 EVT SubVT = Op.getOperand(0).getValueType();
8949 unsigned NumSubElts = SubVT.getVectorNumElements();
8950 uint64_t SubIdx = Index / NumSubElts;
8951 uint64_t SubElt = Index % NumSubElts;
8952 return getShuffleScalarElt(Op.getOperand(SubIdx), SubElt, DAG, Depth + 1);
8953 }
8954
8955 // Recurse into extract_subvector src vector to find scalars.
8956 if (Opcode == ISD::EXTRACT_SUBVECTOR) {
8957 SDValue Src = Op.getOperand(0);
8958 uint64_t SrcIdx = Op.getConstantOperandVal(1);
8959 return getShuffleScalarElt(Src, Index + SrcIdx, DAG, Depth + 1);
8960 }
8961
8962 // We only peek through bitcasts of the same vector width.
8963 if (Opcode == ISD::BITCAST) {
8964 SDValue Src = Op.getOperand(0);
8965 EVT SrcVT = Src.getValueType();
8966 if (SrcVT.isVector() && SrcVT.getVectorNumElements() == NumElems)
8967 return getShuffleScalarElt(Src, Index, DAG, Depth + 1);
8968 return SDValue();
8969 }
8970
8971 // Actual nodes that may contain scalar elements
8972
8973 // For insert_vector_elt - either return the index matching scalar or recurse
8974 // into the base vector.
8975 if (Opcode == ISD::INSERT_VECTOR_ELT &&
8976 isa<ConstantSDNode>(Op.getOperand(2))) {
8977 if (Op.getConstantOperandAPInt(2) == Index)
8978 return Op.getOperand(1);
8979 return getShuffleScalarElt(Op.getOperand(0), Index, DAG, Depth + 1);
8980 }
8981
8982 if (Opcode == ISD::SCALAR_TO_VECTOR)
8983 return (Index == 0) ? Op.getOperand(0)
8984 : DAG.getUNDEF(VT.getVectorElementType());
8985
8986 if (Opcode == ISD::BUILD_VECTOR)
8987 return Op.getOperand(Index);
8988
8989 return SDValue();
8990}
8991
8992// Use PINSRB/PINSRW/PINSRD to create a build vector.
8993static SDValue LowerBuildVectorAsInsert(SDValue Op, const APInt &NonZeroMask,
8994 unsigned NumNonZero, unsigned NumZero,
8995 SelectionDAG &DAG,
8996 const X86Subtarget &Subtarget) {
8997 MVT VT = Op.getSimpleValueType();
8998 unsigned NumElts = VT.getVectorNumElements();
8999 assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
9000         ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
9001        "Illegal vector insertion");
9002
9003 SDLoc dl(Op);
9004 SDValue V;
9005 bool First = true;
9006
9007 for (unsigned i = 0; i < NumElts; ++i) {
9008 bool IsNonZero = NonZeroMask[i];
9009 if (!IsNonZero)
9010 continue;
9011
9012 // If the build vector contains zeros, or our first insertion is not at
9013 // index 0, insert into a zero vector to break any register dependency;
9014 // otherwise use SCALAR_TO_VECTOR.
9015 if (First) {
9016 First = false;
9017 if (NumZero || 0 != i)
9018 V = getZeroVector(VT, Subtarget, DAG, dl);
9019 else {
9020 assert(0 == i && "Expected insertion into zero-index");
9021 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
9022 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
9023 V = DAG.getBitcast(VT, V);
9024 continue;
9025 }
9026 }
9027 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V, Op.getOperand(i),
9028 DAG.getIntPtrConstant(i, dl));
9029 }
9030
9031 return V;
9032}
9033
9034/// Custom lower build_vector of v16i8.
9035static SDValue LowerBuildVectorv16i8(SDValue Op, const APInt &NonZeroMask,
9036 unsigned NumNonZero, unsigned NumZero,
9037 SelectionDAG &DAG,
9038 const X86Subtarget &Subtarget) {
9039 if (NumNonZero > 8 && !Subtarget.hasSSE41())
9040 return SDValue();
9041
9042 // SSE4.1 - use PINSRB to insert each byte directly.
9043 if (Subtarget.hasSSE41())
9044 return LowerBuildVectorAsInsert(Op, NonZeroMask, NumNonZero, NumZero, DAG,
9045 Subtarget);
9046
9047 SDLoc dl(Op);
9048 SDValue V;
9049
9050 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
9051 for (unsigned i = 0; i < 16; i += 2) {
9052 bool ThisIsNonZero = NonZeroMask[i];
9053 bool NextIsNonZero = NonZeroMask[i + 1];
9054 if (!ThisIsNonZero && !NextIsNonZero)
9055 continue;
9056
9057 // FIXME: Investigate combining the first 4 bytes as an i32 instead.
9058 SDValue Elt;
9059 if (ThisIsNonZero) {
9060 if (NumZero || NextIsNonZero)
9061 Elt = DAG.getZExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
9062 else
9063 Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
9064 }
9065
9066 if (NextIsNonZero) {
9067 SDValue NextElt = Op.getOperand(i + 1);
9068 if (i == 0 && NumZero)
9069 NextElt = DAG.getZExtOrTrunc(NextElt, dl, MVT::i32);
9070 else
9071 NextElt = DAG.getAnyExtOrTrunc(NextElt, dl, MVT::i32);
9072 NextElt = DAG.getNode(ISD::SHL, dl, MVT::i32, NextElt,
9073 DAG.getConstant(8, dl, MVT::i8));
9074 if (ThisIsNonZero)
9075 Elt = DAG.getNode(ISD::OR, dl, MVT::i32, NextElt, Elt);
9076 else
9077 Elt = NextElt;
9078 }
9079
9080 // If our first insertion is not the first index or zeros are needed, then
9081 // insert into zero vector. Otherwise, use SCALAR_TO_VECTOR (leaves high
9082 // elements undefined).
9083 if (!V) {
9084 if (i != 0 || NumZero)
9085 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
9086 else {
9087 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Elt);
9088 V = DAG.getBitcast(MVT::v8i16, V);
9089 continue;
9090 }
9091 }
9092 Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Elt);
9093 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, Elt,
9094 DAG.getIntPtrConstant(i / 2, dl));
9095 }
9096
9097 return DAG.getBitcast(MVT::v16i8, V);
9098}
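
A tiny sketch of the byte-pair packing used by the pre-SSE4.1 path above, where two adjacent i8 build_vector elements are combined into one 16-bit PINSRW lane (low byte | high byte << 8); this is only an illustration of the arithmetic:

    #include <cstdint>
    #include <cstdio>

    static uint16_t packBytePair(uint8_t Lo, uint8_t Hi) {
      // The even-index byte lands in the low half, the odd-index byte is
      // shifted into the high half, matching the SHL-by-8 and OR above.
      return (uint16_t)(Lo | ((uint32_t)Hi << 8));
    }

    int main() {
      // Bytes 0x34 (even index) and 0x12 (odd index) form the lane 0x1234.
      std::printf("0x%04x\n", packBytePair(0x34, 0x12)); // prints: 0x1234
      return 0;
    }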
9099
9100/// Custom lower build_vector of v8i16.
9101static SDValue LowerBuildVectorv8i16(SDValue Op, const APInt &NonZeroMask,
9102 unsigned NumNonZero, unsigned NumZero,
9103 SelectionDAG &DAG,
9104 const X86Subtarget &Subtarget) {
9105 if (NumNonZero > 4 && !Subtarget.hasSSE41())
9106 return SDValue();
9107
9108 // Use PINSRW to insert each byte directly.
9109 return LowerBuildVectorAsInsert(Op, NonZeroMask, NumNonZero, NumZero, DAG,
9110 Subtarget);
9111}
9112
9113/// Custom lower build_vector of v4i32 or v4f32.
9114static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
9115 const X86Subtarget &Subtarget) {
9116 // If this is a splat of a pair of elements, use MOVDDUP (unless the target
9117 // has XOP; in that case defer lowering to potentially use VPERMIL2PS).
9118 // Because we're creating a less complicated build vector here, we may enable
9119 // further folding of the MOVDDUP via shuffle transforms.
9120 if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&
9121 Op.getOperand(0) == Op.getOperand(2) &&
9122 Op.getOperand(1) == Op.getOperand(3) &&
9123 Op.getOperand(0) != Op.getOperand(1)) {
9124 SDLoc DL(Op);
9125 MVT VT = Op.getSimpleValueType();
9126 MVT EltVT = VT.getVectorElementType();
9127 // Create a new build vector with the first 2 elements followed by undef
9128 // padding, bitcast to v2f64, duplicate, and bitcast back.
9129 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
9130 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
9131 SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops));
9132 SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV);
9133 return DAG.getBitcast(VT, Dup);
9134 }
9135
9136 // Find all zeroable elements.
9137 std::bitset<4> Zeroable, Undefs;
9138 for (int i = 0; i < 4; ++i) {
9139 SDValue Elt = Op.getOperand(i);
9140 Undefs[i] = Elt.isUndef();
9141 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
9142 }
9143 assert(Zeroable.size() - Zeroable.count() > 1 &&
9144        "We expect at least two non-zero elements!");
9145
9146 // We only know how to deal with build_vector nodes where elements are either
9147 // zeroable or extract_vector_elt with constant index.
9148 SDValue FirstNonZero;
9149 unsigned FirstNonZeroIdx;
9150 for (unsigned i = 0; i < 4; ++i) {
9151 if (Zeroable[i])
9152 continue;
9153 SDValue Elt = Op.getOperand(i);
9154 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9155 !isa<ConstantSDNode>(Elt.getOperand(1)))
9156 return SDValue();
9157 // Make sure that this node is extracting from a 128-bit vector.
9158 MVT VT = Elt.getOperand(0).getSimpleValueType();
9159 if (!VT.is128BitVector())
9160 return SDValue();
9161 if (!FirstNonZero.getNode()) {
9162 FirstNonZero = Elt;
9163 FirstNonZeroIdx = i;
9164 }
9165 }
9166
9167 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
9168 SDValue V1 = FirstNonZero.getOperand(0);
9169 MVT VT = V1.getSimpleValueType();
9170
9171 // See if this build_vector can be lowered as a blend with zero.
9172 SDValue Elt;
9173 unsigned EltMaskIdx, EltIdx;
9174 int Mask[4];
9175 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
9176 if (Zeroable[EltIdx]) {
9177 // The zero vector will be on the right hand side.
9178 Mask[EltIdx] = EltIdx+4;
9179 continue;
9180 }
9181
9182 Elt = Op->getOperand(EltIdx);
9183 // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
9184 EltMaskIdx = Elt.getConstantOperandVal(1);
9185 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
9186 break;
9187 Mask[EltIdx] = EltIdx;
9188 }
9189
9190 if (EltIdx == 4) {
9191 // Let the shuffle legalizer deal with blend operations.
9192 SDValue VZeroOrUndef = (Zeroable == Undefs)
9193 ? DAG.getUNDEF(VT)
9194 : getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
9195 if (V1.getSimpleValueType() != VT)
9196 V1 = DAG.getBitcast(VT, V1);
9197 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask);
9198 }
9199
9200 // See if we can lower this build_vector to a INSERTPS.
9201 if (!Subtarget.hasSSE41())
9202 return SDValue();
9203
9204 SDValue V2 = Elt.getOperand(0);
9205 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
9206 V1 = SDValue();
9207
9208 bool CanFold = true;
9209 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
9210 if (Zeroable[i])
9211 continue;
9212
9213 SDValue Current = Op->getOperand(i);
9214 SDValue SrcVector = Current->getOperand(0);
9215 if (!V1.getNode())
9216 V1 = SrcVector;
9217 CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i);
9218 }
9219
9220 if (!CanFold)
9221 return SDValue();
9222
9223 assert(V1.getNode() && "Expected at least two non-zero elements!");
9224 if (V1.getSimpleValueType() != MVT::v4f32)
9225 V1 = DAG.getBitcast(MVT::v4f32, V1);
9226 if (V2.getSimpleValueType() != MVT::v4f32)
9227 V2 = DAG.getBitcast(MVT::v4f32, V2);
9228
9229 // Ok, we can emit an INSERTPS instruction.
9230 unsigned ZMask = Zeroable.to_ulong();
9231
9232 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
9233 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
9234 SDLoc DL(Op);
9235 SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
9236 DAG.getIntPtrConstant(InsertPSMask, DL, true));
9237 return DAG.getBitcast(VT, Result);
9238}
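For readers unpacking the immediate built at line 9232, the following standalone sketch spells out the bit layout (source lane in bits [7:6], destination lane in bits [5:4], zero mask in bits [3:0]); the helper name makeInsertPSImm and the lane values are invented for illustration and are not part of X86ISelLowering.cpp.

#include <cassert>

// Illustrative sketch of the INSERTPS immediate layout used above; the helper
// name and the sample lane values are made up for this example.
static unsigned makeInsertPSImm(unsigned SrcLane, unsigned DstLane, unsigned ZMask) {
  assert(SrcLane < 4 && DstLane < 4 && ZMask <= 0xF && "lanes are 2 bits, mask is 4 bits");
  return (SrcLane << 6) | (DstLane << 4) | ZMask; // mirrors EltMaskIdx << 6 | EltIdx << 4 | ZMask
}

int main() {
  // Copy source lane 2 into destination lane 1 and zero lane 3: 0b10011000 == 0x98.
  assert(makeInsertPSImm(2, 1, 0x8) == 0x98);
  return 0;
}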
9239
9240/// Return a vector logical shift node.
9241static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
9242 SelectionDAG &DAG, const TargetLowering &TLI,
9243 const SDLoc &dl) {
9244 assert(VT.is128BitVector() && "Unknown type for VShift");
9245 MVT ShVT = MVT::v16i8;
9246 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
9247 SrcOp = DAG.getBitcast(ShVT, SrcOp);
9248 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
9249 SDValue ShiftVal = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8);
9250 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
9251}
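Since VSHLDQ/VSRLDQ shift whole bytes of a v16i8, the helper above divides the bit count by 8; a minimal standalone sketch of that conversion (the function name vectorShiftBytes and the sample values are invented) is:

#include <cassert>

// Illustrative only: bit shift amount -> byte shift amount for the whole-register
// byte shifts emitted above.
static unsigned vectorShiftBytes(unsigned NumBits) {
  assert(NumBits % 8 == 0 && "Only support byte sized shifts");
  return NumBits / 8;
}

int main() {
  assert(vectorShiftBytes(64) == 8); // shifting by 64 bits moves 8 bytes
  assert(vectorShiftBytes(32) == 4);
  return 0;
}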
9252
9253static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
9254 SelectionDAG &DAG) {
9255
9256 // Check if the scalar load can be widened into a vector load. And if
9257 // the address is "base + cst" see if the cst can be "absorbed" into
9258 // the shuffle mask.
9259 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
9260 SDValue Ptr = LD->getBasePtr();
9261 if (!ISD::isNormalLoad(LD) || !LD->isSimple())
9262 return SDValue();
9263 EVT PVT = LD->getValueType(0);
9264 if (PVT != MVT::i32 && PVT != MVT::f32)
9265 return SDValue();
9266
9267 int FI = -1;
9268 int64_t Offset = 0;
9269 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
9270 FI = FINode->getIndex();
9271 Offset = 0;
9272 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
9273 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
9274 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
9275 Offset = Ptr.getConstantOperandVal(1);
9276 Ptr = Ptr.getOperand(0);
9277 } else {
9278 return SDValue();
9279 }
9280
9281 // FIXME: 256-bit vector instructions don't require a strict alignment,
9282 // improve this code to support it better.
9283 Align RequiredAlign(VT.getSizeInBits() / 8);
9284 SDValue Chain = LD->getChain();
9285 // Make sure the stack object alignment is at least 16 or 32.
9286 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
9287 MaybeAlign InferredAlign = DAG.InferPtrAlign(Ptr);
9288 if (!InferredAlign || *InferredAlign < RequiredAlign) {
9289 if (MFI.isFixedObjectIndex(FI)) {
9290 // Can't change the alignment. FIXME: It's possible to compute
9291 // the exact stack offset and reference FI + adjust offset instead.
9292 // If someone *really* cares about this. That's the way to implement it.
9293 return SDValue();
9294 } else {
9295 MFI.setObjectAlignment(FI, RequiredAlign);
9296 }
9297 }
9298
9299 // (Offset % 16 or 32) must be a multiple of 4. The address is then
9300 // Ptr + (Offset & ~15).
9301 if (Offset < 0)
9302 return SDValue();
9303 if ((Offset % RequiredAlign.value()) & 3)
9304 return SDValue();
9305 int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1);
9306 if (StartOffset) {
9307 SDLoc DL(Ptr);
9308 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
9309 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
9310 }
9311
9312 int EltNo = (Offset - StartOffset) >> 2;
9313 unsigned NumElems = VT.getVectorNumElements();
9314
9315 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
9316 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
9317 LD->getPointerInfo().getWithOffset(StartOffset));
9318
9319 SmallVector<int, 8> Mask(NumElems, EltNo);
9320
9321 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
9322 }
9323
9324 return SDValue();
9325}
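The offset handling above rounds the constant offset down to the required alignment and turns the remainder into a splat element index; the following self-contained sketch reproduces just that arithmetic, assuming a 16-byte alignment and 4-byte elements (the names SplitOffset and splitOffset are invented for this example):

#include <cassert>
#include <cstdint>

// Illustrative only: split a byte offset into an aligned start offset plus a
// 4-byte element index, mirroring "Offset & ~15" and "(Offset - StartOffset) >> 2".
struct SplitOffset { int64_t StartOffset; int EltNo; };

static SplitOffset splitOffset(int64_t Offset, int64_t RequiredAlign) {
  assert(Offset >= 0 && ((Offset % RequiredAlign) & 3) == 0);
  int64_t StartOffset = Offset & ~(RequiredAlign - 1);
  return {StartOffset, int((Offset - StartOffset) >> 2)};
}

int main() {
  SplitOffset S = splitOffset(20, 16);          // scalar load from "base + 20", 16-byte aligned vector
  assert(S.StartOffset == 16 && S.EltNo == 1);  // widened load at base+16, splat element 1
  return 0;
}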
9326
9327 // Recurse to find a LoadSDNode source and the accumulated ByteOffset.
9328static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
9329 if (ISD::isNON_EXTLoad(Elt.getNode())) {
9330 auto *BaseLd = cast<LoadSDNode>(Elt);
9331 if (!BaseLd->isSimple())
9332 return false;
9333 Ld = BaseLd;
9334 ByteOffset = 0;
9335 return true;
9336 }
9337
9338 switch (Elt.getOpcode()) {
9339 case ISD::BITCAST:
9340 case ISD::TRUNCATE:
9341 case ISD::SCALAR_TO_VECTOR:
9342 return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);
9343 case ISD::SRL:
9344 if (auto *AmtC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
9345 uint64_t Amt = AmtC->getZExtValue();
9346 if ((Amt % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {
9347 ByteOffset += Amt / 8;
9348 return true;
9349 }
9350 }
9351 break;
9352 case ISD::EXTRACT_VECTOR_ELT:
9353 if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
9354 SDValue Src = Elt.getOperand(0);
9355 unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
9356 unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();
9357 if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
9358 findEltLoadSrc(Src, Ld, ByteOffset)) {
9359 uint64_t Idx = IdxC->getZExtValue();
9360 ByteOffset += Idx * (SrcSizeInBits / 8);
9361 return true;
9362 }
9363 }
9364 break;
9365 }
9366
9367 return false;
9368}
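findEltLoadSrc accumulates a byte offset while looking through SRL-by-constant and EXTRACT_VECTOR_ELT nodes; the arithmetic alone, on made-up values rather than real SDNodes, looks like this:

#include <cassert>
#include <cstdint>

int main() {
  // Illustrative only, with invented values. Suppose an element is produced as
  //   (trunc (srl (extract_vector_elt v2i64 %load, 1), 16))
  // The extract of 64-bit lane 1 contributes 1 * (64 / 8) = 8 bytes and the shift
  // right by 16 bits contributes 16 / 8 = 2 bytes, so the element starts 10 bytes
  // into the underlying load.
  int64_t ByteOffset = 0;
  const uint64_t SrcEltBits = 64, ExtractIdx = 1, ShiftAmt = 16;
  ByteOffset += ExtractIdx * (SrcEltBits / 8); // EXTRACT_VECTOR_ELT contribution
  ByteOffset += ShiftAmt / 8;                  // SRL-by-constant contribution
  assert(ByteOffset == 10);
  return 0;
}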
9369
9370/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
9371/// elements can be replaced by a single large load which has the same value as
9372/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
9373///
9374/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
9375static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
9376 const SDLoc &DL, SelectionDAG &DAG,
9377 const X86Subtarget &Subtarget,
9378 bool IsAfterLegalize) {
9379 if ((VT.getScalarSizeInBits() % 8) != 0)
9380 return SDValue();
9381
9382 unsigned NumElems = Elts.size();
9383
9384 int LastLoadedElt = -1;
9385 APInt LoadMask = APInt::getZero(NumElems);
9386 APInt ZeroMask = APInt::getZero(NumElems);
9387 APInt UndefMask = APInt::getZero(NumElems);
9388
9389 SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
9390 SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
9391
9392 // For each element in the initializer, see if we've found a load, zero or an
9393 // undef.
9394 for (unsigned i = 0; i < NumElems; ++i) {
9395 SDValue Elt = peekThroughBitcasts(Elts[i]);
9396 if (!Elt.getNode())
9397 return SDValue();
9398 if (Elt.isUndef()) {
9399 UndefMask.setBit(i);
9400 continue;
9401 }
9402 if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode())) {
9403 ZeroMask.setBit(i);
9404 continue;
9405 }
9406
9407 // Each loaded element must be the correct fractional portion of the
9408 // requested vector load.
9409 unsigned EltSizeInBits = Elt.getValueSizeInBits();
9410 if ((NumElems * EltSizeInBits) != VT.getSizeInBits())
9411 return SDValue();
9412
9413 if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
9414 return SDValue();
9415 unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
9416 if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
9417 return SDValue();
9418
9419 LoadMask.setBit(i);
9420 LastLoadedElt = i;
9421 }
9422 assert((ZeroMask.popcount() + UndefMask.popcount() + LoadMask.popcount()) ==
9423 NumElems &&
9424 "Incomplete element masks");
9425
9426 // Handle Special Cases - all undef or undef/zero.
9427 if (UndefMask.popcount() == NumElems)
9428 return DAG.getUNDEF(VT);
9429 if ((ZeroMask.popcount() + UndefMask.popcount()) == NumElems)
9430 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
9431 : DAG.getConstantFP(0.0, DL, VT);
9432
9433 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9434 int FirstLoadedElt = LoadMask.countr_zero();
9435 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
9436 EVT EltBaseVT = EltBase.getValueType();
9437 assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
9438 "Register/Memory size mismatch");
9439 LoadSDNode *LDBase = Loads[FirstLoadedElt];
9440 assert(LDBase && "Did not find base load for merging consecutive loads");
9441 unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
9442 unsigned BaseSizeInBytes = BaseSizeInBits / 8;
9443 int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt);
9444 int LoadSizeInBits = NumLoadedElts * BaseSizeInBits;
9445 assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
9446
9447 // TODO: Support offsetting the base load.
9448 if (ByteOffsets[FirstLoadedElt] != 0)
9449 return SDValue();
9450
9451 // Check to see if the element's load is consecutive to the base load
9452 // or offset from a previous (already checked) load.
9453 auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
9454 LoadSDNode *Ld = Loads[EltIdx];
9455 int64_t ByteOffset = ByteOffsets[EltIdx];
9456 if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
9457 int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
9458 return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
9459 Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
9460 }
9461 return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes,
9462 EltIdx - FirstLoadedElt);
9463 };
9464
9465 // Consecutive loads can contain UNDEFs but not ZERO elements.
9466 // Consecutive loads with UNDEF and ZERO elements require an
9467 // additional shuffle stage to clear the ZERO elements.
9468 bool IsConsecutiveLoad = true;
9469 bool IsConsecutiveLoadWithZeros = true;
9470 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
9471 if (LoadMask[i]) {
9472 if (!CheckConsecutiveLoad(LDBase, i)) {
9473 IsConsecutiveLoad = false;
9474 IsConsecutiveLoadWithZeros = false;
9475 break;
9476 }
9477 } else if (ZeroMask[i]) {
9478 IsConsecutiveLoad = false;
9479 }
9480 }
9481
9482 auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
9483 auto MMOFlags = LDBase->getMemOperand()->getFlags();
9484 assert(LDBase->isSimple() &&
9485 "Cannot merge volatile or atomic loads.");
9486 SDValue NewLd =
9487 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
9488 LDBase->getPointerInfo(), LDBase->getOriginalAlign(),
9489 MMOFlags);
9490 for (auto *LD : Loads)
9491 if (LD)
9492 DAG.makeEquivalentMemoryOrdering(LD, NewLd);
9493 return NewLd;
9494 };
9495
9496 // Check if the base load is entirely dereferenceable.
9497 bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
9498 VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());
9499
9500 // LOAD - all consecutive load/undefs (must start/end with a load or be
9501 // entirely dereferenceable). If we have found an entire vector of loads and
9502 // undefs, then return a large load of the entire vector width starting at the
9503 // base pointer. If the vector contains zeros, then attempt to shuffle those
9504 // elements.
9505 if (FirstLoadedElt == 0 &&
9506 (NumLoadedElts == (int)NumElems || IsDereferenceable) &&
9507 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
9508 if (IsAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
9509 return SDValue();
9510
9511 // Don't create 256-bit non-temporal aligned loads without AVX2 as these
9512 // will lower to regular temporal loads and use the cache.
9513 if (LDBase->isNonTemporal() && LDBase->getAlign() >= Align(32) &&
9514 VT.is256BitVector() && !Subtarget.hasInt256())
9515 return SDValue();
9516
9517 if (NumElems == 1)
9518 return DAG.getBitcast(VT, Elts[FirstLoadedElt]);
9519
9520 if (!ZeroMask)
9521 return CreateLoad(VT, LDBase);
9522
9523 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
9524 // vector and a zero vector to clear out the zero elements.
9525 if (!IsAfterLegalize && VT.isVector()) {
9526 unsigned NumMaskElts = VT.getVectorNumElements();
9527 if ((NumMaskElts % NumElems) == 0) {
9528 unsigned Scale = NumMaskElts / NumElems;
9529 SmallVector<int, 4> ClearMask(NumMaskElts, -1);
9530 for (unsigned i = 0; i < NumElems; ++i) {
9531 if (UndefMask[i])
9532 continue;
9533 int Offset = ZeroMask[i] ? NumMaskElts : 0;
9534 for (unsigned j = 0; j != Scale; ++j)
9535 ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
9536 }
9537 SDValue V = CreateLoad(VT, LDBase);
9538 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
9539 : DAG.getConstantFP(0.0, DL, VT);
9540 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
9541 }
9542 }
9543 }
9544
9545 // If the upper half of a ymm/zmm load is undef then just load the lower half.
9546 if (VT.is256BitVector() || VT.is512BitVector()) {
9547 unsigned HalfNumElems = NumElems / 2;
9548 if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnes()) {
9549 EVT HalfVT =
9550 EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
9551 SDValue HalfLD =
9552 EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
9553 DAG, Subtarget, IsAfterLegalize);
9554 if (HalfLD)
9555 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
9556 HalfLD, DAG.getIntPtrConstant(0, DL));
9557 }
9558 }
9559
9560 // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
9561 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
9562 ((LoadSizeInBits == 16 && Subtarget.hasFP16()) || LoadSizeInBits == 32 ||
9563 LoadSizeInBits == 64) &&
9564 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
9565 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
9566 : MVT::getIntegerVT(LoadSizeInBits);
9567 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
9568 // Allow v4f32 on SSE1 only targets.
9569 // FIXME: Add more isel patterns so we can just use VT directly.
9570 if (!Subtarget.hasSSE2() && VT == MVT::v4f32)
9571 VecVT = MVT::v4f32;
9572 if (TLI.isTypeLegal(VecVT)) {
9573 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
9574 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
9575 SDValue ResNode = DAG.getMemIntrinsicNode(
9576 X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(),
9577 LDBase->getOriginalAlign(), MachineMemOperand::MOLoad);
9578 for (auto *LD : Loads)
9579 if (LD)
9580 DAG.makeEquivalentMemoryOrdering(LD, ResNode);
9581 return DAG.getBitcast(VT, ResNode);
9582 }
9583 }
9584
9585 // BROADCAST - match the smallest possible repetition pattern, load that
9586 // scalar/subvector element and then broadcast to the entire vector.
9587 if (ZeroMask.isZero() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
9588 (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) {
9589 for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
9590 unsigned RepeatSize = SubElems * BaseSizeInBits;
9591 unsigned ScalarSize = std::min(RepeatSize, 64u);
9592 if (!Subtarget.hasAVX2() && ScalarSize < 32)
9593 continue;
9594
9595 // Don't attempt a 1:N subvector broadcast - it should be caught by
9596 // combineConcatVectorOps, else will cause infinite loops.
9597 if (RepeatSize > ScalarSize && SubElems == 1)
9598 continue;
9599
9600 bool Match = true;
9601 SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
9602 for (unsigned i = 0; i != NumElems && Match; ++i) {
9603 if (!LoadMask[i])
9604 continue;
9605 SDValue Elt = peekThroughBitcasts(Elts[i]);
9606 if (RepeatedLoads[i % SubElems].isUndef())
9607 RepeatedLoads[i % SubElems] = Elt;
9608 else
9609 Match &= (RepeatedLoads[i % SubElems] == Elt);
9610 }
9611
9612 // We must have loads at both ends of the repetition.
9613 Match &= !RepeatedLoads.front().isUndef();
9614 Match &= !RepeatedLoads.back().isUndef();
9615 if (!Match)
9616 continue;
9617
9618 EVT RepeatVT =
9619 VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64))
9620 ? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)
9621 : EVT::getFloatingPointVT(ScalarSize);
9622 if (RepeatSize > ScalarSize)
9623 RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,
9624 RepeatSize / ScalarSize);
9625 EVT BroadcastVT =
9626 EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),
9627 VT.getSizeInBits() / ScalarSize);
9628 if (TLI.isTypeLegal(BroadcastVT)) {
9629 if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
9630 RepeatVT, RepeatedLoads, DL, DAG, Subtarget, IsAfterLegalize)) {
9631 SDValue Broadcast = RepeatLoad;
9632 if (RepeatSize > ScalarSize) {
9633 while (Broadcast.getValueSizeInBits() < VT.getSizeInBits())
9634 Broadcast = concatSubVectors(Broadcast, Broadcast, DAG, DL);
9635 } else {
9636 if (!Subtarget.hasAVX2() &&
9637 !X86::mayFoldLoadIntoBroadcastFromMem(
9638 RepeatLoad, RepeatVT.getScalarType().getSimpleVT(),
9639 Subtarget,
9640 /*AssumeSingleUse=*/true))
9641 return SDValue();
9642 Broadcast =
9643 DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, RepeatLoad);
9644 }
9645 return DAG.getBitcast(VT, Broadcast);
9646 }
9647 }
9648 }
9649 }
9650
9651 return SDValue();
9652}
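Inside the consecutive-load lowering above, zero elements are cleared with a shuffle whose mask scales each source element to Scale mask slots and points zero lanes at a zero vector; a standalone sketch of that mask construction with assumed element counts is:

#include <cassert>
#include <vector>

int main() {
  // Illustrative only, with assumed counts: 4 source elements mapped onto an 8-slot
  // shuffle mask (Scale = 2). Elements 0 and 1 were loaded, element 2 is zero,
  // element 3 is undef; zero lanes are redirected to the second shuffle operand.
  const unsigned NumElems = 4, NumMaskElts = 8, Scale = NumMaskElts / NumElems;
  const bool Zero[NumElems]  = {false, false, true,  false};
  const bool Undef[NumElems] = {false, false, false, true};
  std::vector<int> ClearMask(NumMaskElts, -1);            // -1 == undef mask element
  for (unsigned i = 0; i < NumElems; ++i) {
    if (Undef[i])
      continue;
    const int Offset = Zero[i] ? NumMaskElts : 0;         // second operand holds the zeros
    for (unsigned j = 0; j != Scale; ++j)
      ClearMask[i * Scale + j] = i * Scale + j + Offset;
  }
  const std::vector<int> Expected = {0, 1, 2, 3, 12, 13, -1, -1};
  assert(ClearMask == Expected);
  return 0;
}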
9653
9654 // Combine vector ops (shuffles etc.) that are equal to build_vector load1,
9655 // load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses
9656 // are consecutive, non-overlapping, and in the right order.
9657static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL,
9658 SelectionDAG &DAG,
9659 const X86Subtarget &Subtarget,
9660 bool IsAfterLegalize) {
9661 SmallVector<SDValue, 64> Elts;
9662 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
9663 if (SDValue Elt = getShuffleScalarElt(Op, i, DAG, 0)) {
9664 Elts.push_back(Elt);
9665 continue;
9666 }
9667 return SDValue();
9668 }
9669 assert(Elts.size() == VT.getVectorNumElements());
9670 return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget,
9671 IsAfterLegalize);
9672}
9673
9674static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
9675 unsigned SplatBitSize, LLVMContext &C) {
9676 unsigned ScalarSize = VT.getScalarSizeInBits();
9677 unsigned NumElm = SplatBitSize / ScalarSize;
9678
9679 SmallVector<Constant *, 32> ConstantVec;
9680 for (unsigned i = 0; i < NumElm; i++) {
9681 APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i);
9682 Constant *Const;
9683 if (VT.isFloatingPoint()) {
9684 if (ScalarSize == 16) {
9685 Const = ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
9686 } else if (ScalarSize == 32) {
9687 Const = ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
9688 } else {
9689 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
9690 Const = ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
9691 }
9692 } else
9693 Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
9694 ConstantVec.push_back(Const);
9695 }
9696 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
9697}
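getConstantVector slices the wide splat constant into ScalarSize-bit chunks, lowest bits first; a plain-integer sketch of the same slicing, with made-up sizes and values, is:

#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  // Illustrative only: slice a 64-bit repeated constant into two 32-bit scalars,
  // mirroring SplatValue.extractBits(ScalarSize, ScalarSize * i).
  const uint64_t SplatValue = 0x0000000100000002ULL;
  const unsigned ScalarSize = 32, NumElm = 64 / ScalarSize;
  std::vector<uint32_t> Elements;
  for (unsigned i = 0; i < NumElm; ++i)
    Elements.push_back(uint32_t(SplatValue >> (ScalarSize * i))); // lowest bits go to element 0
  assert(Elements[0] == 2 && Elements[1] == 1);
  return 0;
}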
9698
9699static bool isFoldableUseOfShuffle(SDNode *N) {
9700 for (auto *U : N->uses()) {
9701 unsigned Opc = U->getOpcode();
9702 // VPERMV/VPERMV3 shuffles can never fold their index operands.
9703 if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
9704 return false;
9705 if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
9706 return false;
9707 if (isTargetShuffle(Opc))
9708 return true;
9709 if (Opc == ISD::BITCAST) // Ignore bitcasts
9710 return isFoldableUseOfShuffle(U);
9711 if (N->hasOneUse()) {
9712 // TODO: There may be some general way to know if an SDNode can
9713 // be folded. We currently only know whether an MI is foldable.
9714 if (Opc == X86ISD::VPDPBUSD && U->getOperand(2).getNode() != N)
9715 return false;
9716 return true;
9717 }
9718 }
9719 return false;
9720}
9721
9722/// Attempt to use the vbroadcast instruction to generate a splat value
9723/// from a splat BUILD_VECTOR which uses:
9724/// a. A single scalar load, or a constant.
9725/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
9726///
9727/// The VBROADCAST node is returned when a pattern is found,
9728/// or SDValue() otherwise.
9729static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
9730 const X86Subtarget &Subtarget,
9731 SelectionDAG &DAG) {
9732 // VBROADCAST requires AVX.
9733 // TODO: Splats could be generated for non-AVX CPUs using SSE
9734 // instructions, but there's less potential gain for only 128-bit vectors.
9735 if (!Subtarget.hasAVX())
9736 return SDValue();
9737
9738 MVT VT = BVOp->getSimpleValueType(0);
9739 unsigned NumElts = VT.getVectorNumElements();
9740 SDLoc dl(BVOp);
9741
9742 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
9743 "Unsupported vector type for broadcast.");
9744
9745 // See if the build vector is a repeating sequence of scalars (inc. splat).
9746 SDValue Ld;
9747 BitVector UndefElements;
9748 SmallVector<SDValue, 16> Sequence;
9749 if (BVOp->getRepeatedSequence(Sequence, &UndefElements)) {
9750 assert((NumElts % Sequence.size()) == 0 && "Sequence doesn't fit.");
9751 if (Sequence.size() == 1)
9752 Ld = Sequence[0];
9753 }
9754
9755 // Attempt to use VBROADCASTM
9756 // From this pattern:
9757 // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
9758 // b. t1 = (build_vector t0 t0)
9759 //
9760 // Create (VBROADCASTM v2i1 X)
9761 if (!Sequence.empty() && Subtarget.hasCDI()) {
9762 // If not a splat, are the upper sequence values zeroable?
9763 unsigned SeqLen = Sequence.size();
9764 bool UpperZeroOrUndef =
9765 SeqLen == 1 ||
9766 llvm::all_of(ArrayRef(Sequence).drop_front(), [](SDValue V) {
9767 return !V || V.isUndef() || isNullConstant(V);
9768 });
9769 SDValue Op0 = Sequence[0];
9770 if (UpperZeroOrUndef && ((Op0.getOpcode() == ISD::BITCAST) ||
9771 (Op0.getOpcode() == ISD::ZERO_EXTEND &&
9772 Op0.getOperand(0).getOpcode() == ISD::BITCAST))) {
9773 SDValue BOperand = Op0.getOpcode() == ISD::BITCAST
9774 ? Op0.getOperand(0)
9775 : Op0.getOperand(0).getOperand(0);
9776 MVT MaskVT = BOperand.getSimpleValueType();
9777 MVT EltType = MVT::getIntegerVT(VT.getScalarSizeInBits() * SeqLen);
9778 if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
9779 (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
9780 MVT BcstVT = MVT::getVectorVT(EltType, NumElts / SeqLen);
9781 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
9782 unsigned Scale = 512 / VT.getSizeInBits();
9783 BcstVT = MVT::getVectorVT(EltType, Scale * (NumElts / SeqLen));
9784 }
9785 SDValue Bcst = DAG.getNode(X86ISD::VBROADCASTM, dl, BcstVT, BOperand);
9786 if (BcstVT.getSizeInBits() != VT.getSizeInBits())
9787 Bcst = extractSubVector(Bcst, 0, DAG, dl, VT.getSizeInBits());
9788 return DAG.getBitcast(VT, Bcst);
9789 }
9790 }
9791 }
9792
9793 unsigned NumUndefElts = UndefElements.count();
9794 if (!Ld || (NumElts - NumUndefElts) <= 1) {
9795 APInt SplatValue, Undef;
9796 unsigned SplatBitSize;
9797 bool HasUndef;
9798 // Check if this is a repeated constant pattern suitable for broadcasting.
9799 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
9800 SplatBitSize > VT.getScalarSizeInBits() &&
9801 SplatBitSize < VT.getSizeInBits()) {
9802 // Avoid replacing with broadcast when it's a use of a shuffle
9803 // instruction to preserve the present custom lowering of shuffles.
9804 if (isFoldableUseOfShuffle(BVOp))
9805 return SDValue();
9806 // replace BUILD_VECTOR with broadcast of the repeated constants.
9807 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9808 LLVMContext *Ctx = DAG.getContext();
9809 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
9810 if (Subtarget.hasAVX()) {
9811 if (SplatBitSize == 32 || SplatBitSize == 64 ||
9812 (SplatBitSize < 32 && Subtarget.hasAVX2())) {
9813 // Splatted value can fit in one INTEGER constant in constant pool.
9814 // Load the constant and broadcast it.
9815 MVT CVT = MVT::getIntegerVT(SplatBitSize);
9816 Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
9817 Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
9818 SDValue CP = DAG.getConstantPool(C, PVT);
9819 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
9820
9821 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
9822 SDVTList Tys =
9823 DAG.getVTList(MVT::getVectorVT(CVT, Repeat), MVT::Other);
9824 SDValue Ops[] = {DAG.getEntryNode(), CP};
9825 MachinePointerInfo MPI =
9826 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
9827 SDValue Brdcst = DAG.getMemIntrinsicNode(
9828 X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT, MPI, Alignment,
9829 MachineMemOperand::MOLoad);
9830 return DAG.getBitcast(VT, Brdcst);
9831 }
9832 if (SplatBitSize > 64) {
9833 // Load the vector of constants and broadcast it.
9834 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
9835 *Ctx);
9836 SDValue VCP = DAG.getConstantPool(VecC, PVT);
9837 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
9838 MVT VVT = MVT::getVectorVT(VT.getScalarType(), NumElm);
9839 Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign();
9840 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9841 SDValue Ops[] = {DAG.getEntryNode(), VCP};
9842 MachinePointerInfo MPI =
9843 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
9844 return DAG.getMemIntrinsicNode(
9845 X86ISD::SUBV_BROADCAST_LOAD, dl, Tys, Ops, VVT, MPI, Alignment,
9846 MachineMemOperand::MOLoad);
9847 }
9848 }
9849 }
9850
9851 // If we are moving a scalar into a vector (Ld must be set and all elements
9852 // but 1 are undef) and that operation is not obviously supported by
9853 // vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.
9854 // That's better than general shuffling and may eliminate a load to GPR and
9855 // move from scalar to vector register.
9856 if (!Ld || NumElts - NumUndefElts != 1)
9857 return SDValue();
9858 unsigned ScalarSize = Ld.getValueSizeInBits();
9859 if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
9860 return SDValue();
9861 }
9862
9863 bool ConstSplatVal =
9864 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
9865 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
9866
9867 // TODO: Handle broadcasts of non-constant sequences.
9868
9869 // Make sure that all of the users of a non-constant load are from the
9870 // BUILD_VECTOR node.
9871 // FIXME: Is the use count needed for non-constant, non-load case?
9872 if (!ConstSplatVal && !IsLoad && !BVOp->isOnlyUserOf(Ld.getNode()))
9873 return SDValue();
9874
9875 unsigned ScalarSize = Ld.getValueSizeInBits();
9876 bool IsGE256 = (VT.getSizeInBits() >= 256);
9877
9878 // When optimizing for size, generate up to 5 extra bytes for a broadcast
9879 // instruction to save 8 or more bytes of constant pool data.
9880 // TODO: If multiple splats are generated to load the same constant,
9881 // it may be detrimental to overall size. There needs to be a way to detect
9882 // that condition to know if this is truly a size win.
9883 bool OptForSize = DAG.shouldOptForSize();
9884
9885 // Handle broadcasting a single constant scalar from the constant pool
9886 // into a vector.
9887 // On Sandybridge (no AVX2), it is still better to load a constant vector
9888 // from the constant pool and not to broadcast it from a scalar.
9889 // But override that restriction when optimizing for size.
9890 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
9891 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
9892 EVT CVT = Ld.getValueType();
9893 assert(!CVT.isVector() && "Must not broadcast a vector type");
9894
9895 // Splat f16, f32, i32, v4f64, v4i64 in all cases with AVX2.
9896 // For size optimization, also splat v2f64 and v2i64, and for size opt
9897 // with AVX2, also splat i8 and i16.
9898 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
9899 if (ScalarSize == 32 ||
9900 (ScalarSize == 64 && (IsGE256 || Subtarget.hasVLX())) ||
9901 CVT == MVT::f16 ||
9902 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
9903 const Constant *C = nullptr;
9904 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
9905 C = CI->getConstantIntValue();
9906 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
9907 C = CF->getConstantFPValue();
9908
9909 assert(C && "Invalid constant type");
9910
9911 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9912 SDValue CP =
9913 DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
9914 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
9915
9916 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9917 SDValue Ops[] = {DAG.getEntryNode(), CP};
9918 MachinePointerInfo MPI =
9919 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
9920 return DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
9921 MPI, Alignment, MachineMemOperand::MOLoad);
9922 }
9923 }
9924
9925 // Handle AVX2 in-register broadcasts.
9926 if (!IsLoad && Subtarget.hasInt256() &&
9927 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
9928 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
9929
9930 // The scalar source must be a normal load.
9931 if (!IsLoad)
9932 return SDValue();
9933
9934 // Make sure the non-chain result is only used by this build vector.
9935 if (!Ld->hasNUsesOfValue(NumElts - NumUndefElts, 0))
9936 return SDValue();
9937
9938 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
9939 (Subtarget.hasVLX() && ScalarSize == 64)) {
9940 auto *LN = cast<LoadSDNode>(Ld);
9941 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9942 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
9943 SDValue BCast =
9944 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
9945 LN->getMemoryVT(), LN->getMemOperand());
9946 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
9947 return BCast;
9948 }
9949
9950 // The integer check is needed for the 64-bit-into-128-bit case so it doesn't
9951 // match double, since there is no vbroadcastsd xmm.
9952 if (Subtarget.hasInt256() && Ld.getValueType().isInteger() &&
9953 (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) {
9954 auto *LN = cast<LoadSDNode>(Ld);
9955 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9956 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
9957 SDValue BCast =
9958 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
9959 LN->getMemoryVT(), LN->getMemOperand());
9960 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
9961 return BCast;
9962 }
9963
9964 if (ScalarSize == 16 && Subtarget.hasFP16() && IsGE256)
9965 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
9966
9967 // Unsupported broadcast.
9968 return SDValue();
9969}
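When the repeated-constant path above broadcasts from the constant pool, the destination vector is treated as Repeat copies of the SplatBitSize-wide constant; a tiny sketch of that size arithmetic, with assumed sizes, is:

#include <cassert>

int main() {
  // Illustrative only, with assumed sizes: a 256-bit build_vector whose bits repeat
  // every 64 bits is loaded as one i64 constant and broadcast
  // VT.getSizeInBits() / SplatBitSize = 4 times (i.e. as a v4i64), then bitcast back.
  const unsigned VTSizeInBits = 256, SplatBitSize = 64;
  const unsigned Repeat = VTSizeInBits / SplatBitSize;
  assert(Repeat == 4);
  return 0;
}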
9970
9971/// For an EXTRACT_VECTOR_ELT with a constant index return the real
9972/// underlying vector and index.
9973///
9974/// Modifies \p ExtractedFromVec to the real vector and returns the real
9975/// index.
9976static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
9977 SDValue ExtIdx) {
9978 int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
9979 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
9980 return Idx;
9981
9982 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
9983 // lowered this:
9984 // (extract_vector_elt (v8f32 %1), Constant<6>)
9985 // to:
9986 // (extract_vector_elt (vector_shuffle<2,u,u,u>
9987 // (extract_subvector (v8f32 %0), Constant<4>),
9988 // undef)
9989 // Constant<0>)
9990 // In this case the vector is the extract_subvector expression and the index
9991 // is 2, as specified by the shuffle.
9992 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
9993 SDValue ShuffleVec = SVOp->getOperand(0);
9994 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
9995 assert(ShuffleVecVT.getVectorElementType() ==
9996 ExtractedFromVec.getSimpleValueType().getVectorElementType());
9997
9998 int ShuffleIdx = SVOp->getMaskElt(Idx);
9999 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
10000 ExtractedFromVec = ShuffleVec;
10001 return ShuffleIdx;
10002 }
10003 return Idx;
10004}
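The helper above looks through one vector_shuffle level to find the real source lane; a standalone sketch of the index rewrite on a made-up mask is:

#include <cassert>
#include <vector>

int main() {
  // Illustrative only, with an invented mask: extracting lane 0 of
  //   vector_shuffle<2,u,u,u>(Sub, undef)
  // really reads lane 2 of Sub, so the index is rewritten from 0 to 2.
  const std::vector<int> ShuffleMask = {2, -1, -1, -1};
  const int NumSrcElts = 4;
  int Idx = 0;                                    // constant index used by the extract
  const int ShuffleIdx = ShuffleMask[Idx];
  if (ShuffleIdx >= 0 && ShuffleIdx < NumSrcElts) // in-range: look through the shuffle
    Idx = ShuffleIdx;
  assert(Idx == 2);
  return 0;
}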
10005
10006static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
10007 MVT VT = Op.getSimpleValueType();
10008
10009 // Skip if insert_vec_elt is not supported.
10010 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
10011 if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
10012 return SDValue();
10013
10014 SDLoc DL(Op);
10015 unsigned NumElems = Op.getNumOperands();
10016
10017 SDValue VecIn1;
10018 SDValue VecIn2;
10019 SmallVector<unsigned, 4> InsertIndices;
10020 SmallVector<int, 8> Mask(NumElems, -1);
10021
10022 for (unsigned i = 0; i != NumElems; ++i) {
10023 unsigned Opc = Op.getOperand(i).getOpcode();
10024
10025 if (Opc == ISD::UNDEF)
10026 continue;
10027
10028 if (Opc != ISD::EXTRACT_VECTOR_ELT) {
10029 // Quit if more than 1 elements need inserting.
10030 if (InsertIndices.size() > 1)
10031 return SDValue();
10032
10033 InsertIndices.push_back(i);
10034 continue;
10035 }
10036
10037 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
10038 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
10039
10040 // Quit if non-constant index.
10041 if (!isa<ConstantSDNode>(ExtIdx))
10042 return SDValue();
10043 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
10044
10045 // Quit if extracted from vector of different type.
10046 if (ExtractedFromVec.getValueType() != VT)
10047 return SDValue();
10048
10049 if (!VecIn1.getNode())
10050 VecIn1 = ExtractedFromVec;
10051 else if (VecIn1 != ExtractedFromVec) {
10052 if (!VecIn2.getNode())
10053 VecIn2 = ExtractedFromVec;
10054 else if (VecIn2 != ExtractedFromVec)
10055 // Quit if more than 2 vectors to shuffle
10056 return SDValue();
10057 }
10058
10059 if (ExtractedFromVec == VecIn1)
10060 Mask[i] = Idx;
10061 else if (ExtractedFromVec == VecIn2)
10062 Mask[i] = Idx + NumElems;
10063 }
10064
10065 if (!VecIn1.getNode())
10066 return SDValue();
10067
10068 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
10069 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
10070
10071 for (unsigned Idx : InsertIndices)
10072 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
10073 DAG.getIntPtrConstant(Idx, DL));
10074
10075 return NV;
10076}
10077
10078// Lower BUILD_VECTOR operation for v8bf16, v16bf16 and v32bf16 types.
10079static SDValue LowerBUILD_VECTORvXbf16(SDValue Op, SelectionDAG &DAG,
10080 const X86Subtarget &Subtarget) {
10081 MVT VT = Op.getSimpleValueType();
10082 MVT IVT = VT.changeVectorElementTypeToInteger();
10083 SmallVector<SDValue, 16> NewOps;
10084 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I)
10085 NewOps.push_back(DAG.getBitcast(MVT::i16, Op.getOperand(I)));
10086 SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(), IVT, NewOps);
10087 return DAG.getBitcast(VT, Res);
10088}
10089
10090// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
10091static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
10092 const X86Subtarget &Subtarget) {
10093
10094 MVT VT = Op.getSimpleValueType();
10095 assert((VT.getVectorElementType() == MVT::i1) &&
10096 "Unexpected type in LowerBUILD_VECTORvXi1!");
10097
10098 SDLoc dl(Op);
10099 if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
10100 ISD::isBuildVectorAllOnes(Op.getNode()))
10101 return Op;
10102
10103 uint64_t Immediate = 0;
10104 SmallVector<unsigned, 16> NonConstIdx;
10105 bool IsSplat = true;
10106 bool HasConstElts = false;
10107 int SplatIdx = -1;
10108 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
10109 SDValue In = Op.getOperand(idx);
10110 if (In.isUndef())
10111 continue;
10112 if (auto *InC = dyn_cast<ConstantSDNode>(In)) {
10113 Immediate |= (InC->getZExtValue() & 0x1) << idx;
10114 HasConstElts = true;
10115 } else {
10116 NonConstIdx.push_back(idx);
10117 }
10118 if (SplatIdx < 0)
10119 SplatIdx = idx;
10120 else if (In != Op.getOperand(SplatIdx))
10121 IsSplat = false;
10122 }
10123
10124 // For a splat, use "(select i1 splat_elt, all-ones, all-zeroes)".
10125 if (IsSplat) {
10126 // The build_vector allows the scalar element to be larger than the vector
10127 // element type. We need to mask it to use as a condition unless we know
10128 // the upper bits are zero.
10129 // FIXME: Use computeKnownBits instead of checking specific opcode?
10130 SDValue Cond = Op.getOperand(SplatIdx);
10131 assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!");
10132 if (Cond.getOpcode() != ISD::SETCC)
10133 Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond,
10134 DAG.getConstant(1, dl, MVT::i8));
10135
10136 // Perform the select in the scalar domain so we can use cmov.
10137 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
10138 SDValue Select = DAG.getSelect(dl, MVT::i32, Cond,
10139 DAG.getAllOnesConstant(dl, MVT::i32),
10140 DAG.getConstant(0, dl, MVT::i32));
10141 Select = DAG.getBitcast(MVT::v32i1, Select);
10142 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Select, Select);
10143 } else {
10144 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
10145 SDValue Select = DAG.getSelect(dl, ImmVT, Cond,
10146 DAG.getAllOnesConstant(dl, ImmVT),
10147 DAG.getConstant(0, dl, ImmVT));
10148 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
10149 Select = DAG.getBitcast(VecVT, Select);
10150 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Select,
10151 DAG.getIntPtrConstant(0, dl));
10152 }
10153 }
10154
10155 // insert elements one by one
10156 SDValue DstVec;
10157 if (HasConstElts) {
10158 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
10159 SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32);
10160 SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32);
10161 ImmL = DAG.getBitcast(MVT::v32i1, ImmL);
10162 ImmH = DAG.getBitcast(MVT::v32i1, ImmH);
10163 DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH);
10164 } else {
10165 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
10166 SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT);
10167 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
10168 DstVec = DAG.getBitcast(VecVT, Imm);
10169 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec,
10170 DAG.getIntPtrConstant(0, dl));
10171 }
10172 } else
10173 DstVec = DAG.getUNDEF(VT);
10174
10175 for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
10176 unsigned InsertIdx = NonConstIdx[i];
10177 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
10178 Op.getOperand(InsertIdx),
10179 DAG.getIntPtrConstant(InsertIdx, dl));
10180 }
10181 return DstVec;
10182}
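In the constant path above, each constant i1 element contributes one bit of Immediate at its element index; a self-contained sketch of that packing for an assumed 8-element input is:

#include <cassert>
#include <cstdint>

int main() {
  // Illustrative only: the constant bits of build_vector <1,0,1,1,0,0,0,1> packed
  // into an immediate, one bit per element index (values are made up).
  const uint64_t Elts[8] = {1, 0, 1, 1, 0, 0, 0, 1};
  uint64_t Immediate = 0;
  for (uint64_t idx = 0; idx < 8; ++idx)
    Immediate |= (Elts[idx] & 0x1) << idx;   // mirrors (InC->getZExtValue() & 0x1) << idx
  assert(Immediate == 0x8D);                 // 0b10001101
  return 0;
}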
10183
10184 LLVM_ATTRIBUTE_UNUSED static bool isHorizOp(unsigned Opcode) {
10185 switch (Opcode) {
10186 case X86ISD::PACKSS:
10187 case X86ISD::PACKUS:
10188 case X86ISD::FHADD:
10189 case X86ISD::FHSUB:
10190 case X86ISD::HADD:
10191 case X86ISD::HSUB:
10192 return true;
10193 }
10194 return false;
10195}
10196
10197/// This is a helper function of LowerToHorizontalOp().
10198 /// This function checks that the input build_vector \p N implements a
10199/// 128-bit partial horizontal operation on a 256-bit vector, but that operation
10200/// may not match the layout of an x86 256-bit horizontal instruction.
10201/// In other words, if this returns true, then some extraction/insertion will
10202/// be required to produce a valid horizontal instruction.
10203///
10204/// Parameter \p Opcode defines the kind of horizontal operation to match.
10205/// For example, if \p Opcode is equal to ISD::ADD, then this function
10206/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
10207/// is equal to ISD::SUB, then this function checks if this is a horizontal
10208/// arithmetic sub.
10209///
10210/// This function only analyzes elements of \p N whose indices are
10211/// in range [BaseIdx, LastIdx).
10212///
10213/// TODO: This function was originally used to match both real and fake partial
10214/// horizontal operations, but the index-matching logic is incorrect for that.
10215/// See the corrected implementation in isHopBuildVector(). Can we reduce this
10216/// code because it is only used for partial h-op matching now?
10217static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
10218 SelectionDAG &DAG,
10219 unsigned BaseIdx, unsigned LastIdx,
10220 SDValue &V0, SDValue &V1) {
10221 EVT VT = N->getValueType(0);
10222 assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops");
10223 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
10224 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
10225 "Invalid Vector in input!");
10226
10227 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
10228 bool CanFold = true;
10229 unsigned ExpectedVExtractIdx = BaseIdx;
10230 unsigned NumElts = LastIdx - BaseIdx;
10231 V0 = DAG.getUNDEF(VT);
10232 V1 = DAG.getUNDEF(VT);
10233
10234 // Check if N implements a horizontal binop.
10235 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
10236 SDValue Op = N->getOperand(i + BaseIdx);
10237
10238 // Skip UNDEFs.
10239 if (Op->isUndef()) {
10240 // Update the expected vector extract index.
10241 if (i * 2 == NumElts)
10242 ExpectedVExtractIdx = BaseIdx;
10243 ExpectedVExtractIdx += 2;
10244 continue;
10245 }
10246
10247 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
10248
10249 if (!CanFold)
10250 break;
10251
10252 SDValue Op0 = Op.getOperand(0);
10253 SDValue Op1 = Op.getOperand(1);
10254
10255 // Try to match the following pattern:
10256 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
10257 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
10258 Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
10259 Op0.getOperand(0) == Op1.getOperand(0) &&
10260 isa<ConstantSDNode>(Op0.getOperand(1)) &&
10261 isa<ConstantSDNode>(Op1.getOperand(1)));
10262 if (!CanFold)
10263 break;
10264
10265 unsigned I0 = Op0.getConstantOperandVal(1);
10266 unsigned I1 = Op1.getConstantOperandVal(1);
10267
10268 if (i * 2 < NumElts) {
10269 if (V0.isUndef()) {
10270 V0 = Op0.getOperand(0);
10271 if (V0.getValueType() != VT)
10272 return false;
10273 }
10274 } else {
10275 if (V1.isUndef()) {
10276 V1 = Op0.getOperand(0);
10277 if (V1.getValueType() != VT)
10278 return false;
10279 }
10280 if (i * 2 == NumElts)
10281 ExpectedVExtractIdx = BaseIdx;
10282 }
10283
10284 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
10285 if (I0 == ExpectedVExtractIdx)
10286 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
10287 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
10288 // Try to match the following dag sequence:
10289 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
10290 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
10291 } else
10292 CanFold = false;
10293
10294 ExpectedVExtractIdx += 2;
10295 }
10296
10297 return CanFold;
10298}
10299
10300/// Emit a sequence of two 128-bit horizontal add/sub followed by
10301/// a concat_vector.
10302///
10303/// This is a helper function of LowerToHorizontalOp().
10304/// This function expects two 256-bit vectors called V0 and V1.
10305/// At first, each vector is split into two separate 128-bit vectors.
10306/// Then, the resulting 128-bit vectors are used to implement two
10307/// horizontal binary operations.
10308///
10309/// The kind of horizontal binary operation is defined by \p X86Opcode.
10310///
10311 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed as input to
10312 /// the two new horizontal binops.
10313/// When Mode is set, the first horizontal binop dag node would take as input
10314/// the lower 128-bit of V0 and the upper 128-bit of V0. The second
10315/// horizontal binop dag node would take as input the lower 128-bit of V1
10316/// and the upper 128-bit of V1.
10317/// Example:
10318/// HADD V0_LO, V0_HI
10319/// HADD V1_LO, V1_HI
10320///
10321/// Otherwise, the first horizontal binop dag node takes as input the lower
10322/// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
10323/// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
10324/// Example:
10325/// HADD V0_LO, V1_LO
10326/// HADD V0_HI, V1_HI
10327///
10328/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
10329/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
10330/// the upper 128-bits of the result.
10331static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
10332 const SDLoc &DL, SelectionDAG &DAG,
10333 unsigned X86Opcode, bool Mode,
10334 bool isUndefLO, bool isUndefHI) {
10335 MVT VT = V0.getSimpleValueType();
10336 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
10337 "Invalid nodes in input!");
10338
10339 unsigned NumElts = VT.getVectorNumElements();
10340 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
10341 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
10342 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
10343 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
10344 MVT NewVT = V0_LO.getSimpleValueType();
10345
10346 SDValue LO = DAG.getUNDEF(NewVT);
10347 SDValue HI = DAG.getUNDEF(NewVT);
10348
10349 if (Mode) {
10350 // Don't emit a horizontal binop if the result is expected to be UNDEF.
10351 if (!isUndefLO && !V0->isUndef())
10352 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
10353 if (!isUndefHI && !V1->isUndef())
10354 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
10355 } else {
10356 // Don't emit a horizontal binop if the result is expected to be UNDEF.
10357 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
10358 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
10359
10360 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
10361 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
10362 }
10363
10364 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
10365}
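For reference while reading the splitting logic above, a 128-bit integer horizontal add pairs adjacent lanes of each input; a short standalone sketch of that lane rule (plain ints, illustrative values only) is:

#include <cassert>

int main() {
  // Illustrative only: HADD(A, B) for 4 x i32 yields
  //   { A[0]+A[1], A[2]+A[3], B[0]+B[1], B[2]+B[3] }.
  const int A[4] = {1, 2, 3, 4};
  const int B[4] = {10, 20, 30, 40};
  const int R[4] = {A[0] + A[1], A[2] + A[3], B[0] + B[1], B[2] + B[3]};
  assert(R[0] == 3 && R[1] == 7 && R[2] == 30 && R[3] == 70);
  return 0;
}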
10366
10367 /// Returns true iff \p BV builds a vector whose result is equivalent to
10368 /// the result of an ADDSUB/SUBADD operation.
10369/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
10370/// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
10371/// \p Opnd0 and \p Opnd1.
10372static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
10373 const X86Subtarget &Subtarget, SelectionDAG &DAG,
10374 SDValue &Opnd0, SDValue &Opnd1,
10375 unsigned &NumExtracts,
10376 bool &IsSubAdd) {
10377
10378 MVT VT = BV->getSimpleValueType(0);
10379 if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
10380 return false;
10381
10382 unsigned NumElts = VT.getVectorNumElements();
10383 SDValue InVec0 = DAG.getUNDEF(VT);
10384 SDValue InVec1 = DAG.getUNDEF(VT);
10385
10386 NumExtracts = 0;
10387
10388 // Odd-numbered elements in the input build vector are obtained from
10389 // adding/subtracting two integer/float elements.
10390 // Even-numbered elements in the input build vector are obtained from
10391 // subtracting/adding two integer/float elements.
10392 unsigned Opc[2] = {0, 0};
10393 for (unsigned i = 0, e = NumElts; i != e; ++i) {
10394 SDValue Op = BV->getOperand(i);
10395
10396 // Skip 'undef' values.
10397 unsigned Opcode = Op.getOpcode();
10398 if (Opcode == ISD::UNDEF)
10399 continue;
10400
10401 // Early exit if we found an unexpected opcode.
10402 if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
10403 return false;
10404
10405 SDValue Op0 = Op.getOperand(0);
10406 SDValue Op1 = Op.getOperand(1);
10407
10408 // Try to match the following pattern:
10409 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
10410 // Early exit if we cannot match that sequence.
10411 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
10412 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
10413 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
10414 Op0.getOperand(1) != Op1.getOperand(1))
10415 return false;
10416
10417 unsigned I0 = Op0.getConstantOperandVal(1);
10418 if (I0 != i)
10419 return false;
10420
10421 // We found a valid add/sub node; make sure it's the same opcode as previous
10422 // elements for this parity.
10423 if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
10424 return false;
10425 Opc[i % 2] = Opcode;
10426
10427 // Update InVec0 and InVec1.
10428 if (InVec0.isUndef()) {
10429 InVec0 = Op0.getOperand(0);
10430 if (InVec0.getSimpleValueType() != VT)
10431 return false;
10432 }
10433 if (InVec1.isUndef()) {
10434 InVec1 = Op1.getOperand(0);
10435 if (InVec1.getSimpleValueType() != VT)
10436 return false;
10437 }
10438
10439 // Make sure that the operands of each add/sub node always
10440 // come from the same pair of vectors.
10441 if (InVec0 != Op0.getOperand(0)) {
10442 if (Opcode == ISD::FSUB)
10443 return false;
10444
10445 // FADD is commutable. Try to commute the operands
10446 // and then test again.
10447 std::swap(Op0, Op1);
10448 if (InVec0 != Op0.getOperand(0))
10449 return false;
10450 }
10451
10452 if (InVec1 != Op1.getOperand(0))
10453 return false;
10454
10455 // Increment the number of extractions done.
10456 ++NumExtracts;
10457 }
10458
10459 // Ensure we have found an opcode for both parities and that they are
10460 // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
10461 // inputs are undef.
10462 if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
10463 InVec0.isUndef() || InVec1.isUndef())
10464 return false;
10465
10466 IsSubAdd = Opc[0] == ISD::FADD;
10467
10468 Opnd0 = InVec0;
10469 Opnd1 = InVec1;
10470 return true;
10471}
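// Illustrative standalone sketch (not part of this file), assuming only plain
// C++17: a minimal model of the parity rule enforced above. Element i of the
// build_vector must use Opc[i % 2], both parities must be seen, the two
// opcodes must differ, and Opc[0] == FADD selects the SUBADD form (mirroring
// 'IsSubAdd = Opc[0] == ISD::FADD'). The enum and helper names are invented
// for the sketch.
enum SketchOpc { SK_FADD, SK_FSUB, SK_NONE };
// Returns 0 = no match, 1 = ADDSUB, 2 = SUBADD.
constexpr int classifyAddSubSketch(const SketchOpc *Ops, int NumElts) {
  SketchOpc Opc[2] = {SK_NONE, SK_NONE};
  for (int i = 0; i != NumElts; ++i) {
    if (Ops[i] != SK_FADD && Ops[i] != SK_FSUB)
      return 0;                                   // unexpected opcode
    if (Opc[i % 2] != SK_NONE && Opc[i % 2] != Ops[i])
      return 0;                                   // parity opcode mismatch
    Opc[i % 2] = Ops[i];
  }
  if (Opc[0] == SK_NONE || Opc[1] == SK_NONE || Opc[0] == Opc[1])
    return 0;
  return Opc[0] == SK_FADD ? 2 : 1;
}
constexpr SketchOpc AddSubPattern[4] = {SK_FSUB, SK_FADD, SK_FSUB, SK_FADD};
constexpr SketchOpc SubAddPattern[4] = {SK_FADD, SK_FSUB, SK_FADD, SK_FSUB};
static_assert(classifyAddSubSketch(AddSubPattern, 4) == 1, "even lanes FSUB -> ADDSUB");
static_assert(classifyAddSubSketch(SubAddPattern, 4) == 2, "even lanes FADD -> SUBADD");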
10472
10473/// Returns true if it is possible to fold a MUL and an idiom that has already
10474/// been recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
10475/// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
10476/// operands of FMADDSUB/FMSUBADD are written to the parameters \p Opnd0, \p Opnd1 and \p Opnd2.
10477///
10478/// Prior to calling this function it should be known that there is some
10479/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
10480/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
10481/// before replacement of such SDNode with ADDSUB operation. Thus the number
10482/// of \p Opnd0 uses is expected to be equal to 2.
10483/// For example, this function may be called for the following IR:
10484/// %AB = fmul fast <2 x double> %A, %B
10485/// %Sub = fsub fast <2 x double> %AB, %C
10486/// %Add = fadd fast <2 x double> %AB, %C
10487/// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
10488/// <2 x i32> <i32 0, i32 3>
10489/// There is a def for %Addsub here, which potentially can be replaced by
10490/// X86ISD::ADDSUB operation:
10491/// %Addsub = X86ISD::ADDSUB %AB, %C
10492/// and such ADDSUB can further be replaced with FMADDSUB:
10493/// %Addsub = FMADDSUB %A, %B, %C.
10494///
10495/// The main reason why this method is called before the replacement of the
10496/// recognized ADDSUB idiom with ADDSUB operation is that such replacement
10497/// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
10498/// FMADDSUB is.
10499static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
10500 SelectionDAG &DAG,
10501 SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2,
10502 unsigned ExpectedUses) {
10503 if (Opnd0.getOpcode() != ISD::FMUL ||
10504 !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
10505 return false;
10506
10507 // FIXME: These checks must match the similar ones in
10508 // DAGCombiner::visitFADDForFMACombine. It would be good to have one
10509 // function that would answer if it is Ok to fuse MUL + ADD to FMADD
10510 // or MUL + ADDSUB to FMADDSUB.
10511 const TargetOptions &Options = DAG.getTarget().Options;
10512 bool AllowFusion =
10513 (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
10514 if (!AllowFusion)
10515 return false;
10516
10517 Opnd2 = Opnd1;
10518 Opnd1 = Opnd0.getOperand(1);
10519 Opnd0 = Opnd0.getOperand(0);
10520
10521 return true;
10522}
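// Illustrative standalone sketch (not part of this file), assuming only plain
// C++: the scalar semantics of the operand rewiring above. After ADDSUB(A*B, C)
// is re-expressed as FMADDSUB(A, B, C), even lanes compute A*B - C and odd
// lanes compute A*B + C, the same lane parity used for X86ISD::ADDSUB by the
// blend mask built in lowerToAddSubOrFMAddSub below. The helper name is
// invented for the sketch.
static void fmaddsubReferenceSketch(const double *A, const double *B,
                                    const double *C, double *Out, int NumElts) {
  for (int i = 0; i != NumElts; ++i)
    Out[i] = (i % 2 == 0) ? A[i] * B[i] - C[i]   // even lane: multiply-subtract
                          : A[i] * B[i] + C[i];  // odd lane: multiply-add
}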
10523
10524/// Try to fold a build_vector that performs an 'addsub', 'fmaddsub' or
10525/// 'fmsubadd' operation into an X86ISD::ADDSUB, X86ISD::FMADDSUB or
10526/// X86ISD::FMSUBADD node accordingly.
10527static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
10528 const X86Subtarget &Subtarget,
10529 SelectionDAG &DAG) {
10530 SDValue Opnd0, Opnd1;
10531 unsigned NumExtracts;
10532 bool IsSubAdd;
10533 if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts,
10534 IsSubAdd))
10535 return SDValue();
10536
10537 MVT VT = BV->getSimpleValueType(0);
10538 SDLoc DL(BV);
10539
10540 // Try to generate X86ISD::FMADDSUB node here.
10541 SDValue Opnd2;
10542 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts)) {
10543 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
10544 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
10545 }
10546
10547 // We only support ADDSUB.
10548 if (IsSubAdd)
10549 return SDValue();
10550
10551 // There are no known X86 targets with 512-bit ADDSUB instructions!
10552 // Convert to blend(fsub,fadd).
10553 if (VT.is512BitVector()) {
10554 SmallVector<int> Mask;
10555 for (int I = 0, E = VT.getVectorNumElements(); I != E; I += 2) {
10556 Mask.push_back(I);
10557 Mask.push_back(I + E + 1);
10558 }
10559 SDValue Sub = DAG.getNode(ISD::FSUB, DL, VT, Opnd0, Opnd1);
10560 SDValue Add = DAG.getNode(ISD::FADD, DL, VT, Opnd0, Opnd1);
10561 return DAG.getVectorShuffle(VT, DL, Sub, Add, Mask);
10562 }
10563
10564 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
10565}
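// Illustrative standalone sketch (not part of this file), assuming only plain
// C++: the shuffle mask built above for the 512-bit blend(fsub,fadd) fallback.
// Even result lanes take lane I of the FSUB value and odd result lanes take
// lane I+1 of the FADD value (shuffle operand 1, whose lanes are numbered
// NumElts..2*NumElts-1). The helper name is invented for the sketch; Mask must
// hold NumElts entries.
static void addsubBlendMaskSketch(int NumElts, int *Mask) {
  for (int I = 0; I != NumElts; I += 2) {
    Mask[I] = I;                   // even lane <- Sub
    Mask[I + 1] = I + NumElts + 1; // odd lane  <- Add
  }
  // e.g. v16f32: {0,17,2,19,4,21,6,23,8,25,10,27,12,29,14,31}
}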
10566
10567static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,
10568 unsigned &HOpcode, SDValue &V0, SDValue &V1) {
10569 // Initialize outputs to known values.
10570 MVT VT = BV->getSimpleValueType(0);
10571 HOpcode = ISD::DELETED_NODE;
10572 V0 = DAG.getUNDEF(VT);
10573 V1 = DAG.getUNDEF(VT);
10574
10575 // x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
10576 // half of the result is calculated independently from the 128-bit halves of
10577 // the inputs, so that makes the index-checking logic below more complicated.
10578 unsigned NumElts = VT.getVectorNumElements();
10579 unsigned GenericOpcode = ISD::DELETED_NODE;
10580 unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1;
10581 unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
10582 unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
10583 for (unsigned i = 0; i != Num128BitChunks; ++i) {
10584 for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
10585 // Ignore undef elements.
10586 SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);
10587 if (Op.isUndef())
10588 continue;
10589
10590 // If there's an opcode mismatch, we're done.
10591 if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode)
10592 return false;
10593
10594 // Initialize horizontal opcode.
10595 if (HOpcode == ISD::DELETED_NODE) {
10596 GenericOpcode = Op.getOpcode();
10597 switch (GenericOpcode) {
10598 case ISD::ADD: HOpcode = X86ISD::HADD; break;
10599 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
10600 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
10601 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
10602 default: return false;
10603 }
10604 }
10605
10606 SDValue Op0 = Op.getOperand(0);
10607 SDValue Op1 = Op.getOperand(1);
10608 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
10609 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
10610 Op0.getOperand(0) != Op1.getOperand(0) ||
10611 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
10612 !isa<ConstantSDNode>(Op1.getOperand(1)) || !Op.hasOneUse())
10613 return false;
10614
10615 // The source vector is chosen based on which 64-bit half of the
10616 // destination vector is being calculated.
10617 if (j < NumEltsIn64Bits) {
10618 if (V0.isUndef())
10619 V0 = Op0.getOperand(0);
10620 } else {
10621 if (V1.isUndef())
10622 V1 = Op0.getOperand(0);
10623 }
10624
10625 SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
10626 if (SourceVec != Op0.getOperand(0))
10627 return false;
10628
10629 // op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
10630 unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
10631 unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
10632 unsigned ExpectedIndex = i * NumEltsIn128Bits +
10633 (j % NumEltsIn64Bits) * 2;
10634 if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
10635 continue;
10636
10637 // If this is not a commutative op, this does not match.
10638 if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD)
10639 return false;
10640
10641 // Addition is commutative, so try swapping the extract indexes.
10642 // op (extract_vector_elt A, I+1), (extract_vector_elt A, I)
10643 if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
10644 continue;
10645
10646 // Extract indexes do not match horizontal requirement.
10647 return false;
10648 }
10649 }
10650 // We matched. Opcode and operands are returned by reference as arguments.
10651 return true;
10652}
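// Illustrative standalone sketch (not part of this file), assuming only plain
// C++17: the index arithmetic checked above. For output element (i, j), i.e.
// 128-bit chunk i and element j within that chunk, the two extracts must read
// adjacent source elements starting at
// i * NumEltsIn128Bits + (j % NumEltsIn64Bits) * 2. The helper name is
// invented for the sketch.
constexpr unsigned hopExpectedIndexSketch(unsigned i, unsigned j,
                                          unsigned NumEltsIn128Bits) {
  return i * NumEltsIn128Bits + (j % (NumEltsIn128Bits / 2)) * 2;
}
// v8f32 HADD produces {A0+A1, A2+A3, B0+B1, B2+B3, A4+A5, A6+A7, B4+B5, B6+B7}:
static_assert(hopExpectedIndexSketch(0, 0, 4) == 0, "lane 0 reads A0/A1");
static_assert(hopExpectedIndexSketch(0, 2, 4) == 0, "lane 2 reads B0/B1");
static_assert(hopExpectedIndexSketch(1, 1, 4) == 6, "lane 5 reads A6/A7");
static_assert(hopExpectedIndexSketch(1, 3, 4) == 6, "lane 7 reads B6/B7");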
10653
10654static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
10655 SelectionDAG &DAG, unsigned HOpcode,
10656 SDValue V0, SDValue V1) {
10657 // If either input vector is not the same size as the build vector,
10658 // extract/insert the low bits to the correct size.
10659 // This is free (examples: zmm --> xmm, xmm --> ymm).
10660 MVT VT = BV->getSimpleValueType(0);
10661 unsigned Width = VT.getSizeInBits();
10662 if (V0.getValueSizeInBits() > Width)
10663 V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), Width);
10664 else if (V0.getValueSizeInBits() < Width)
10665 V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, SDLoc(BV), Width);
10666
10667 if (V1.getValueSizeInBits() > Width)
10668 V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), Width);
10669 else if (V1.getValueSizeInBits() < Width)
10670 V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, SDLoc(BV), Width);
10671
10672 unsigned NumElts = VT.getVectorNumElements();
10673 APInt DemandedElts = APInt::getAllOnes(NumElts);
10674 for (unsigned i = 0; i != NumElts; ++i)
10675 if (BV->getOperand(i).isUndef())
10676 DemandedElts.clearBit(i);
10677
10678 // If we don't need the upper xmm, then perform as a xmm hop.
10679 unsigned HalfNumElts = NumElts / 2;
10680 if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) {
10681 MVT HalfVT = VT.getHalfNumVectorElementsVT();
10682 V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), 128);
10683 V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), 128);
10684 SDValue Half = DAG.getNode(HOpcode, SDLoc(BV), HalfVT, V0, V1);
10685 return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, SDLoc(BV), 256);
10686 }
10687
10688 return DAG.getNode(HOpcode, SDLoc(BV), VT, V0, V1);
10689}
10690
10691/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
10692static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
10693 const X86Subtarget &Subtarget,
10694 SelectionDAG &DAG) {
10695 // We need at least 2 non-undef elements to make this worthwhile by default.
10696 unsigned NumNonUndefs =
10697 count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
10698 if (NumNonUndefs < 2)
10699 return SDValue();
10700
10701 // There are 4 sets of horizontal math operations distinguished by type:
10702 // int/FP at 128-bit/256-bit. Each type was introduced with a different
10703 // subtarget feature. Try to match those "native" patterns first.
10704 MVT VT = BV->getSimpleValueType(0);
10705 if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
10706 ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
10707 ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
10708 ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
10709 unsigned HOpcode;
10710 SDValue V0, V1;
10711 if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
10712 return getHopForBuildVector(BV, DAG, HOpcode, V0, V1);
10713 }
10714
10715 // Try harder to match 256-bit ops by using extract/concat.
10716 if (!Subtarget.hasAVX() || !VT.is256BitVector())
10717 return SDValue();
10718
10719 // Count the number of UNDEF operands in the input build_vector.
10720 unsigned NumElts = VT.getVectorNumElements();
10721 unsigned Half = NumElts / 2;
10722 unsigned NumUndefsLO = 0;
10723 unsigned NumUndefsHI = 0;
10724 for (unsigned i = 0, e = Half; i != e; ++i)
10725 if (BV->getOperand(i)->isUndef())
10726 NumUndefsLO++;
10727
10728 for (unsigned i = Half, e = NumElts; i != e; ++i)
10729 if (BV->getOperand(i)->isUndef())
10730 NumUndefsHI++;
10731
10732 SDLoc DL(BV);
10733 SDValue InVec0, InVec1;
10734 if (VT == MVT::v8i32 || VT == MVT::v16i16) {
10735 SDValue InVec2, InVec3;
10736 unsigned X86Opcode;
10737 bool CanFold = true;
10738
10739 if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
10740 isHorizontalBinOpPart(BV, ISD::ADD, DAG, Half, NumElts, InVec2,
10741 InVec3) &&
10742 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
10743 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
10744 X86Opcode = X86ISD::HADD;
10745 else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, Half, InVec0,
10746 InVec1) &&
10747 isHorizontalBinOpPart(BV, ISD::SUB, DAG, Half, NumElts, InVec2,
10748 InVec3) &&
10749 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
10750 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
10751 X86Opcode = X86ISD::HSUB;
10752 else
10753 CanFold = false;
10754
10755 if (CanFold) {
10756 // Do not try to expand this build_vector into a pair of horizontal
10757 // add/sub if we can emit a pair of scalar add/sub.
10758 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
10759 return SDValue();
10760
10761 // Convert this build_vector into a pair of horizontal binops followed by
10762 // a concat vector. We must adjust the outputs from the partial horizontal
10763 // matching calls above to account for undefined vector halves.
10764 SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
10765 SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
10766 assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
10767 bool isUndefLO = NumUndefsLO == Half;
10768 bool isUndefHI = NumUndefsHI == Half;
10769 return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO,
10770 isUndefHI);
10771 }
10772 }
10773
10774 if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
10775 VT == MVT::v16i16) {
10776 unsigned X86Opcode;
10777 if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
10778 X86Opcode = X86ISD::HADD;
10779 else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, NumElts, InVec0,
10780 InVec1))
10781 X86Opcode = X86ISD::HSUB;
10782 else if (isHorizontalBinOpPart(BV, ISD::FADD, DAG, 0, NumElts, InVec0,
10783 InVec1))
10784 X86Opcode = X86ISD::FHADD;
10785 else if (isHorizontalBinOpPart(BV, ISD::FSUB, DAG, 0, NumElts, InVec0,
10786 InVec1))
10787 X86Opcode = X86ISD::FHSUB;
10788 else
10789 return SDValue();
10790
10791 // Don't try to expand this build_vector into a pair of horizontal add/sub
10792 // if we can simply emit a pair of scalar add/sub.
10793 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
10794 return SDValue();
10795
10796 // Convert this build_vector into two horizontal add/sub followed by
10797 // a concat vector.
10798 bool isUndefLO = NumUndefsLO == Half;
10799 bool isUndefHI = NumUndefsHI == Half;
10800 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
10801 isUndefLO, isUndefHI);
10802 }
10803
10804 return SDValue();
10805}
10806
10807static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
10808 SelectionDAG &DAG);
10809
10810/// If a BUILD_VECTOR's source elements all apply the same bit operation and
10811/// one of their operands is constant, lower to a pair of BUILD_VECTORs and
10812/// just apply the bit operation to the vectors.
10813/// NOTE: It's not in our interest to start making a general purpose vectorizer
10814/// from this, but enough scalar bit operations are created by the later
10815/// legalization + scalarization stages to need basic support.
10816static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
10817 const X86Subtarget &Subtarget,
10818 SelectionDAG &DAG) {
10819 SDLoc DL(Op);
10820 MVT VT = Op->getSimpleValueType(0);
10821 unsigned NumElems = VT.getVectorNumElements();
10822 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
10823
10824 // Check that all elements have the same opcode.
10825 // TODO: Should we allow UNDEFS and if so how many?
10826 unsigned Opcode = Op->getOperand(0).getOpcode();
10827 for (unsigned i = 1; i < NumElems; ++i)
10828 if (Opcode != Op->getOperand(i).getOpcode())
10829 return SDValue();
10830
10831 // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
10832 bool IsShift = false;
10833 switch (Opcode) {
10834 default:
10835 return SDValue();
10836 case ISD::SHL:
10837 case ISD::SRL:
10838 case ISD::SRA:
10839 IsShift = true;
10840 break;
10841 case ISD::AND:
10842 case ISD::XOR:
10843 case ISD::OR:
10844 // Don't do this if the buildvector is a splat - we'd replace one
10845 // constant with an entire vector.
10846 if (Op->getSplatValue())
10847 return SDValue();
10848 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
10849 return SDValue();
10850 break;
10851 }
10852
10853 SmallVector<SDValue, 4> LHSElts, RHSElts;
10854 for (SDValue Elt : Op->ops()) {
10855 SDValue LHS = Elt.getOperand(0);
10856 SDValue RHS = Elt.getOperand(1);
10857
10858 // We expect the canonicalized RHS operand to be the constant.
10859 if (!isa<ConstantSDNode>(RHS))
10860 return SDValue();
10861
10862 // Extend shift amounts.
10863 if (RHS.getValueSizeInBits() != VT.getScalarSizeInBits()) {
10864 if (!IsShift)
10865 return SDValue();
10866 RHS = DAG.getZExtOrTrunc(RHS, DL, VT.getScalarType());
10867 }
10868
10869 LHSElts.push_back(LHS);
10870 RHSElts.push_back(RHS);
10871 }
10872
10873 // Limit to shifts by uniform immediates.
10874 // TODO: Only accept vXi8/vXi64 special cases?
10875 // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
10876 if (IsShift && any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
10877 return SDValue();
10878
10879 SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
10880 SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
10881 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
10882
10883 if (!IsShift)
10884 return Res;
10885
10886 // Immediately lower the shift to ensure the constant build vector doesn't
10887 // get converted to a constant pool before the shift is lowered.
10888 return LowerShift(Res, Subtarget, DAG);
10889}
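// Illustrative standalone sketch (not part of this file), assuming only plain
// C++: the rewrite performed above on a concrete case. A build_vector of
// per-element shifts by the same immediate, e.g.
//   (build_vector (shl a, 3), (shl b, 3), (shl c, 3), (shl d, 3)),
// becomes one vector shift of (build_vector a, b, c, d) by a splat of 3; the
// scalar reference below shows the two forms agree lane by lane. The helper
// name is invented for the sketch.
static void buildVectorShlReferenceSketch(const unsigned *Lhs, unsigned Amt,
                                          unsigned *Out, int NumElts) {
  for (int i = 0; i != NumElts; ++i)
    Out[i] = Lhs[i] << Amt; // same result per lane or as one vector op
}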
10890
10891/// Create a vector constant without a load. SSE/AVX provide the bare minimum
10892/// functionality to do this, so it's all zeros, all ones, or some derivation
10893/// that is cheap to calculate.
10894static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
10895 const X86Subtarget &Subtarget) {
10896 SDLoc DL(Op);
10897 MVT VT = Op.getSimpleValueType();
10898
10899 // Vectors containing all zeros can be matched by pxor and xorps.
10900 if (ISD::isBuildVectorAllZeros(Op.getNode()))
10901 return Op;
10902
10903 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
10904 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
10905 // vpcmpeqd on 256-bit vectors.
10906 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
10907 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
10908 return Op;
10909
10910 return getOnesVector(VT, DAG, DL);
10911 }
10912
10913 return SDValue();
10914}
10915
10916/// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
10917/// from a vector of source values and a vector of extraction indices.
10918/// The vectors might be manipulated to match the type of the permute op.
10919static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
10920 SDLoc &DL, SelectionDAG &DAG,
10921 const X86Subtarget &Subtarget) {
10922 MVT ShuffleVT = VT;
10923 EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
10924 unsigned NumElts = VT.getVectorNumElements();
10925 unsigned SizeInBits = VT.getSizeInBits();
10926
10927 // Adjust IndicesVec to match VT size.
10928 assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
10929        "Illegal variable permute mask size");
10930 if (IndicesVec.getValueType().getVectorNumElements() > NumElts) {
10931 // Narrow/widen the indices vector to the correct size.
10932 if (IndicesVec.getValueSizeInBits() > SizeInBits)
10933 IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
10934 NumElts * VT.getScalarSizeInBits());
10935 else if (IndicesVec.getValueSizeInBits() < SizeInBits)
10936 IndicesVec = widenSubVector(IndicesVec, false, Subtarget, DAG,
10937 SDLoc(IndicesVec), SizeInBits);
10938 // Zero-extend the index elements within the vector.
10939 if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
10940 IndicesVec = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(IndicesVec),
10941 IndicesVT, IndicesVec);
10942 }
10943 IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
10944
10945 // Handle a SrcVec whose size doesn't match VT.
10946 if (SrcVec.getValueSizeInBits() != SizeInBits) {
10947 if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
10948 // Handle larger SrcVec by treating it as a larger permute.
10949 unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
10950 VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
10951 IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
10952 IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
10953 Subtarget, DAG, SDLoc(IndicesVec));
10954 SDValue NewSrcVec =
10955 createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
10956 if (NewSrcVec)
10957 return extractSubVector(NewSrcVec, 0, DAG, DL, SizeInBits);
10958 return SDValue();
10959 } else if (SrcVec.getValueSizeInBits() < SizeInBits) {
10960 // Widen smaller SrcVec to match VT.
10961 SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
10962 } else
10963 return SDValue();
10964 }
10965
10966 auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
10967 assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
10968 EVT SrcVT = Idx.getValueType();
10969 unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
10970 uint64_t IndexScale = 0;
10971 uint64_t IndexOffset = 0;
10972
10973 // If we're scaling a smaller permute op, then we need to repeat the
10974 // indices, scaling and offsetting them as well.
10975 // e.g. v4i32 -> v16i8 (Scale = 4)
10976 // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
10977 // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
10978 for (uint64_t i = 0; i != Scale; ++i) {
10979 IndexScale |= Scale << (i * NumDstBits);
10980 IndexOffset |= i << (i * NumDstBits);
10981 }
10982
10983 Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
10984 DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
10985 Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
10986 DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
10987 return Idx;
10988 };
10989
10990 unsigned Opcode = 0;
10991 switch (VT.SimpleTy) {
10992 default:
10993 break;
10994 case MVT::v16i8:
10995 if (Subtarget.hasSSSE3())
10996 Opcode = X86ISD::PSHUFB;
10997 break;
10998 case MVT::v8i16:
10999 if (Subtarget.hasVLX() && Subtarget.hasBWI())
11000 Opcode = X86ISD::VPERMV;
11001 else if (Subtarget.hasSSSE3()) {
11002 Opcode = X86ISD::PSHUFB;
11003 ShuffleVT = MVT::v16i8;
11004 }
11005 break;
11006 case MVT::v4f32:
11007 case MVT::v4i32:
11008 if (Subtarget.hasAVX()) {
11009 Opcode = X86ISD::VPERMILPV;
11010 ShuffleVT = MVT::v4f32;
11011 } else if (Subtarget.hasSSSE3()) {
11012 Opcode = X86ISD::PSHUFB;
11013 ShuffleVT = MVT::v16i8;
11014 }
11015 break;
11016 case MVT::v2f64:
11017 case MVT::v2i64:
11018 if (Subtarget.hasAVX()) {
11019 // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
11020 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
11021 Opcode = X86ISD::VPERMILPV;
11022 ShuffleVT = MVT::v2f64;
11023 } else if (Subtarget.hasSSE41()) {
11024 // SSE41 can compare v2i64 - select between indices 0 and 1.
11025 return DAG.getSelectCC(
11026 DL, IndicesVec,
11027 getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
11028 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
11029 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
11030 ISD::CondCode::SETEQ);
11031 }
11032 break;
11033 case MVT::v32i8:
11034 if (Subtarget.hasVLX() && Subtarget.hasVBMI())
11035 Opcode = X86ISD::VPERMV;
11036 else if (Subtarget.hasXOP()) {
11037 SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
11038 SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
11039 SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
11040 SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
11041 return DAG.getNode(
11042 ISD::CONCAT_VECTORS, DL, VT,
11043 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
11044 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
11045 } else if (Subtarget.hasAVX()) {
11046 SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
11047 SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
11048 SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
11049 SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
11050 auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
11051 ArrayRef<SDValue> Ops) {
11052 // Permute Lo and Hi and then select based on index range.
11053 // This works as PSHUFB uses bits[3:0] to permute elements and we don't
11054 // care about bit[7] as it's just an index vector.
11055 SDValue Idx = Ops[2];
11056 EVT VT = Idx.getValueType();
11057 return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
11058 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
11059 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
11060 ISD::CondCode::SETGT);
11061 };
11062 SDValue Ops[] = {LoLo, HiHi, IndicesVec};
11063 return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
11064 PSHUFBBuilder);
11065 }
11066 break;
11067 case MVT::v16i16:
11068 if (Subtarget.hasVLX() && Subtarget.hasBWI())
11069 Opcode = X86ISD::VPERMV;
11070 else if (Subtarget.hasAVX()) {
11071 // Scale to v32i8 and perform as v32i8.
11072 IndicesVec = ScaleIndices(IndicesVec, 2);
11073 return DAG.getBitcast(
11074 VT, createVariablePermute(
11075 MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
11076 DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
11077 }
11078 break;
11079 case MVT::v8f32:
11080 case MVT::v8i32:
11081 if (Subtarget.hasAVX2())
11082 Opcode = X86ISD::VPERMV;
11083 else if (Subtarget.hasAVX()) {
11084 SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
11085 SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
11086 {0, 1, 2, 3, 0, 1, 2, 3});
11087 SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
11088 {4, 5, 6, 7, 4, 5, 6, 7});
11089 if (Subtarget.hasXOP())
11090 return DAG.getBitcast(
11091 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi,
11092 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
11093 // Permute Lo and Hi and then select based on index range.
11094 // This works as VPERMILPS only uses index bits[0:1] to permute elements.
11095 SDValue Res = DAG.getSelectCC(
11096 DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
11097 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
11098 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
11099 ISD::CondCode::SETGT);
11100 return DAG.getBitcast(VT, Res);
11101 }
11102 break;
11103 case MVT::v4i64:
11104 case MVT::v4f64:
11105 if (Subtarget.hasAVX512()) {
11106 if (!Subtarget.hasVLX()) {
11107 MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
11108 SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
11109 SDLoc(SrcVec));
11110 IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
11111 DAG, SDLoc(IndicesVec));
11112 SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
11113 DAG, Subtarget);
11114 return extract256BitVector(Res, 0, DAG, DL);
11115 }
11116 Opcode = X86ISD::VPERMV;
11117 } else if (Subtarget.hasAVX()) {
11118 SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
11119 SDValue LoLo =
11120 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
11121 SDValue HiHi =
11122 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
11123 // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
11124 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
11125 if (Subtarget.hasXOP())
11126 return DAG.getBitcast(
11127 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi,
11128 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
11129 // Permute Lo and Hi and then select based on index range.
11130 // This works as VPERMILPD only uses index bit[1] to permute elements.
11131 SDValue Res = DAG.getSelectCC(
11132 DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
11133 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
11134 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
11135 ISD::CondCode::SETGT);
11136 return DAG.getBitcast(VT, Res);
11137 }
11138 break;
11139 case MVT::v64i8:
11140 if (Subtarget.hasVBMI())
11141 Opcode = X86ISD::VPERMV;
11142 break;
11143 case MVT::v32i16:
11144 if (Subtarget.hasBWI())
11145 Opcode = X86ISD::VPERMV;
11146 break;
11147 case MVT::v16f32:
11148 case MVT::v16i32:
11149 case MVT::v8f64:
11150 case MVT::v8i64:
11151 if (Subtarget.hasAVX512())
11152 Opcode = X86ISD::VPERMV;
11153 break;
11154 }
11155 if (!Opcode)
11156 return SDValue();
11157
11158 assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
11159        (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
11160        "Illegal variable permute shuffle type");
11161
11162 uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
11163 if (Scale > 1)
11164 IndicesVec = ScaleIndices(IndicesVec, Scale);
11165
11166 EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
11167 IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
11168
11169 SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
11170 SDValue Res = Opcode == X86ISD::VPERMV
11171 ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
11172 : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
11173 return DAG.getBitcast(VT, Res);
11174}
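// Illustrative standalone sketch (not part of this file), assuming only plain
// C++17: the splat constants built by the ScaleIndices lambda above. For the
// v4i32 -> v16i8 case (Scale = 4, NumDstBits = 8) they reproduce the values
// quoted in its comment. The helper names are invented for the sketch.
constexpr unsigned long long indexScaleSketch(unsigned long long Scale,
                                              unsigned NumDstBits) {
  unsigned long long R = 0;
  for (unsigned long long i = 0; i != Scale; ++i)
    R |= Scale << (i * NumDstBits); // repeat Scale in every narrow element
  return R;
}
constexpr unsigned long long indexOffsetSketch(unsigned long long Scale,
                                               unsigned NumDstBits) {
  unsigned long long R = 0;
  for (unsigned long long i = 0; i != Scale; ++i)
    R |= i << (i * NumDstBits);     // 0,1,2,... in successive narrow elements
  return R;
}
static_assert(indexScaleSketch(4, 8) == 0x04040404ULL,
              "IndexScale = 4 << 24 | 4 << 16 | 4 << 8 | 4");
static_assert(indexOffsetSketch(4, 8) == 0x03020100ULL,
              "IndexOffset = 3 << 24 | 2 << 16 | 1 << 8 | 0");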
11175
11176// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
11177// reasoned to be a permutation of a vector by indices in a non-constant vector.
11178// (build_vector (extract_elt V, (extract_elt I, 0)),
11179// (extract_elt V, (extract_elt I, 1)),
11180// ...
11181// ->
11182// (vpermv I, V)
11183//
11184// TODO: Handle undefs
11185// TODO: Utilize pshufb and zero mask blending to support more efficient
11186// construction of vectors with constant-0 elements.
11187static SDValue
11188LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
11189 const X86Subtarget &Subtarget) {
11190 SDValue SrcVec, IndicesVec;
11191 // Check for a match of the permute source vector and permute index elements.
11192 // This is done by checking that the i-th build_vector operand is of the form:
11193 // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
11194 for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
11195 SDValue Op = V.getOperand(Idx);
11196 if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
11197 return SDValue();
11198
11199 // If this is the first extract encountered in V, set the source vector,
11200 // otherwise verify the extract is from the previously defined source
11201 // vector.
11202 if (!SrcVec)
11203 SrcVec = Op.getOperand(0);
11204 else if (SrcVec != Op.getOperand(0))
11205 return SDValue();
11206 SDValue ExtractedIndex = Op->getOperand(1);
11207 // Peek through extends.
11208 if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
11209 ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
11210 ExtractedIndex = ExtractedIndex.getOperand(0);
11211 if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
11212 return SDValue();
11213
11214 // If this is the first extract from the index vector candidate, set the
11215 // indices vector, otherwise verify the extract is from the previously
11216 // defined indices vector.
11217 if (!IndicesVec)
11218 IndicesVec = ExtractedIndex.getOperand(0);
11219 else if (IndicesVec != ExtractedIndex.getOperand(0))
11220 return SDValue();
11221
11222 auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
11223 if (!PermIdx || PermIdx->getAPIntValue() != Idx)
11224 return SDValue();
11225 }
11226
11227 SDLoc DL(V);
11228 MVT VT = V.getSimpleValueType();
11229 return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
11230}
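// Illustrative standalone sketch (not part of this file), assuming only plain
// C++: the scalar meaning of the pattern matched above. Once every element is
// (extract_elt SrcVec, (extract_elt IndicesVec, i)), the whole build_vector is
// a gather of SrcVec through IndicesVec, i.e. a single variable permute. The
// helper name is invented for the sketch.
static void variablePermuteReferenceSketch(const int *Src, const int *Indices,
                                           int *Out, int NumElts) {
  for (int i = 0; i != NumElts; ++i)
    Out[i] = Src[Indices[i]]; // result element i comes from Src[Indices[i]]
}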
11231
11232SDValue
11233X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
11234 SDLoc dl(Op);
11235
11236 MVT VT = Op.getSimpleValueType();
11237 MVT EltVT = VT.getVectorElementType();
11238 MVT OpEltVT = Op.getOperand(0).getSimpleValueType();
11239 unsigned NumElems = Op.getNumOperands();
11240
11241 // Generate vectors for predicate vectors.
11242 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
11243 return LowerBUILD_VECTORvXi1(Op, DAG, Subtarget);
11244
11245 if (VT.getVectorElementType() == MVT::bf16 && Subtarget.hasBF16())
11246 return LowerBUILD_VECTORvXbf16(Op, DAG, Subtarget);
11247
11248 if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
11249 return VectorConstant;
11250
11251 unsigned EVTBits = EltVT.getSizeInBits();
11252 APInt UndefMask = APInt::getZero(NumElems);
11253 APInt FrozenUndefMask = APInt::getZero(NumElems);
11254 APInt ZeroMask = APInt::getZero(NumElems);
11255 APInt NonZeroMask = APInt::getZero(NumElems);
11256 bool IsAllConstants = true;
11257 bool OneUseFrozenUndefs = true;
11258 SmallSet<SDValue, 8> Values;
11259 unsigned NumConstants = NumElems;
11260 for (unsigned i = 0; i < NumElems; ++i) {
11261 SDValue Elt = Op.getOperand(i);
11262 if (Elt.isUndef()) {
11263 UndefMask.setBit(i);
11264 continue;
11265 }
11266 if (ISD::isFreezeUndef(Elt.getNode())) {
11267 OneUseFrozenUndefs = OneUseFrozenUndefs && Elt->hasOneUse();
11268 FrozenUndefMask.setBit(i);
11269 continue;
11270 }
11271 Values.insert(Elt);
11272 if (!isIntOrFPConstant(Elt)) {
11273 IsAllConstants = false;
11274 NumConstants--;
11275 }
11276 if (X86::isZeroNode(Elt)) {
11277 ZeroMask.setBit(i);
11278 } else {
11279 NonZeroMask.setBit(i);
11280 }
11281 }
11282
11283 // All undef vector. Return an UNDEF.
11284 if (UndefMask.isAllOnes())
11285 return DAG.getUNDEF(VT);
11286
11287 // All undef/freeze(undef) vector. Return a FREEZE UNDEF.
11288 if (OneUseFrozenUndefs && (UndefMask | FrozenUndefMask).isAllOnes())
11289 return DAG.getFreeze(DAG.getUNDEF(VT));
11290
11291 // All undef/freeze(undef)/zero vector. Return a zero vector.
11292 if ((UndefMask | FrozenUndefMask | ZeroMask).isAllOnes())
11293 return getZeroVector(VT, Subtarget, DAG, dl);
11294
11295 // If we have multiple FREEZE-UNDEF operands, we are likely going to end up
11296 // lowering into a suboptimal insertion sequence. Instead, thaw the UNDEF in
11297 // our source BUILD_VECTOR, create another FREEZE-UNDEF splat BUILD_VECTOR,
11298 // and blend the FREEZE-UNDEF operands back in.
11299 // FIXME: is this worthwhile even for a single FREEZE-UNDEF operand?
11300 if (unsigned NumFrozenUndefElts = FrozenUndefMask.popcount();
11301 NumFrozenUndefElts >= 2 && NumFrozenUndefElts < NumElems) {
11302 SmallVector<int, 16> BlendMask(NumElems, -1);
11303 SmallVector<SDValue, 16> Elts(NumElems, DAG.getUNDEF(OpEltVT));
11304 for (unsigned i = 0; i < NumElems; ++i) {
11305 if (UndefMask[i]) {
11306 BlendMask[i] = -1;
11307 continue;
11308 }
11309 BlendMask[i] = i;
11310 if (!FrozenUndefMask[i])
11311 Elts[i] = Op.getOperand(i);
11312 else
11313 BlendMask[i] += NumElems;
11314 }
11315 SDValue EltsBV = DAG.getBuildVector(VT, dl, Elts);
11316 SDValue FrozenUndefElt = DAG.getFreeze(DAG.getUNDEF(OpEltVT));
11317 SDValue FrozenUndefBV = DAG.getSplatBuildVector(VT, dl, FrozenUndefElt);
11318 return DAG.getVectorShuffle(VT, dl, EltsBV, FrozenUndefBV, BlendMask);
11319 }
11320
11321 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
11322
11323 // If the upper elts of a ymm/zmm are undef/freeze(undef)/zero then we might
11324 // be better off lowering to a smaller build vector and padding with
11325 // undef/zero.
11326 if ((VT.is256BitVector() || VT.is512BitVector()) &&
11327 !isFoldableUseOfShuffle(BV)) {
11328 unsigned UpperElems = NumElems / 2;
11329 APInt UndefOrZeroMask = FrozenUndefMask | UndefMask | ZeroMask;
11330 unsigned NumUpperUndefsOrZeros = UndefOrZeroMask.countl_one();
11331 if (NumUpperUndefsOrZeros >= UpperElems) {
11332 if (VT.is512BitVector() &&
11333 NumUpperUndefsOrZeros >= (NumElems - (NumElems / 4)))
11334 UpperElems = NumElems - (NumElems / 4);
11335 // If freeze(undef) is in any upper elements, force to zero.
11336 bool UndefUpper = UndefMask.countl_one() >= UpperElems;
11337 MVT LowerVT = MVT::getVectorVT(EltVT, NumElems - UpperElems);
11338 SDValue NewBV =
11339 DAG.getBuildVector(LowerVT, dl, Op->ops().drop_back(UpperElems));
11340 return widenSubVector(VT, NewBV, !UndefUpper, Subtarget, DAG, dl);
11341 }
11342 }
11343
11344 if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
11345 return AddSub;
11346 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
11347 return HorizontalOp;
11348 if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
11349 return Broadcast;
11350 if (SDValue BitOp = lowerBuildVectorToBitOp(BV, Subtarget, DAG))
11351 return BitOp;
11352
11353 unsigned NumZero = ZeroMask.popcount();
11354 unsigned NumNonZero = NonZeroMask.popcount();
11355
11356 // If we are inserting one variable into a vector of non-zero constants, try
11357 // to avoid loading each constant element as a scalar. Load the constants as a
11358 // vector and then insert the variable scalar element. If insertion is not
11359 // supported, fall back to a shuffle to get the scalar blended with the
11360 // constants. Insertion into a zero vector is handled as a special-case
11361 // somewhere below here.
11362 if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
11363 (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
11364 isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
11365 // Create an all-constant vector. The variable element in the old
11366 // build vector is replaced by undef in the constant vector. Save the
11367 // variable scalar element and its index for use in the insertelement.
11368 LLVMContext &Context = *DAG.getContext();
11369 Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
11370 SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
11371 SDValue VarElt;
11372 SDValue InsIndex;
11373 for (unsigned i = 0; i != NumElems; ++i) {
11374 SDValue Elt = Op.getOperand(i);
11375 if (auto *C = dyn_cast<ConstantSDNode>(Elt))
11376 ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
11377 else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
11378 ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
11379 else if (!Elt.isUndef()) {
11380       assert(!VarElt.getNode() && !InsIndex.getNode() &&
11381              "Expected one variable element in this vector");
11382 VarElt = Elt;
11383 InsIndex = DAG.getVectorIdxConstant(i, dl);
11384 }
11385 }
11386 Constant *CV = ConstantVector::get(ConstVecOps);
11387 SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
11388
11389 // The constants we just created may not be legal (e.g., floating point). We
11390 // must lower the vector right here because we cannot guarantee that we'll
11391 // legalize it before loading it. This is also why we could not just create
11392 // a new build vector here. If the build vector contains illegal constants,
11393 // it could get split back up into a series of insert elements.
11394 // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
11395 SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
11396 MachineFunction &MF = DAG.getMachineFunction();
11397 MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
11398 SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
11399 unsigned InsertC = cast<ConstantSDNode>(InsIndex)->getZExtValue();
11400 unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();
11401 if (InsertC < NumEltsInLow128Bits)
11402 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
11403
11404 // There's no good way to insert into the high elements of a >128-bit
11405 // vector, so use shuffles to avoid an extract/insert sequence.
11406 assert(VT.getSizeInBits() > 128 && "Invalid insertion index?");
11407 assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
11408 SmallVector<int, 8> ShuffleMask;
11409 unsigned NumElts = VT.getVectorNumElements();
11410 for (unsigned i = 0; i != NumElts; ++i)
11411 ShuffleMask.push_back(i == InsertC ? NumElts : i);
11412 SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt);
11413 return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask);
11414 }
11415
11416 // Special case for single non-zero, non-undef, element.
11417 if (NumNonZero == 1) {
11418 unsigned Idx = NonZeroMask.countr_zero();
11419 SDValue Item = Op.getOperand(Idx);
11420
11421 // If we have a constant or non-constant insertion into the low element of
11422 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
11423 // the rest of the elements. This will be matched as movd/movq/movss/movsd
11424 // depending on what the source datatype is.
11425 if (Idx == 0) {
11426 if (NumZero == 0)
11427 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
11428
11429 if (EltVT == MVT::i32 || EltVT == MVT::f16 || EltVT == MVT::f32 ||
11430 EltVT == MVT::f64 || (EltVT == MVT::i64 && Subtarget.is64Bit()) ||
11431 (EltVT == MVT::i16 && Subtarget.hasFP16())) {
11432       assert((VT.is128BitVector() || VT.is256BitVector() ||
11433               VT.is512BitVector()) &&
11434              "Expected an SSE value type!");
11435 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
11436 // Turn it into a MOVL (i.e. movsh, movss, movsd, movw or movd) to a
11437 // zero vector.
11438 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
11439 }
11440
11441 // We can't directly insert an i8 or i16 into a vector, so zero extend
11442 // it to i32 first.
11443 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
11444 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
11445 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
11446 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
11447 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
11448 return DAG.getBitcast(VT, Item);
11449 }
11450 }
11451
11452 // Is it a vector logical left shift?
11453 if (NumElems == 2 && Idx == 1 &&
11454 X86::isZeroNode(Op.getOperand(0)) &&
11455 !X86::isZeroNode(Op.getOperand(1))) {
11456 unsigned NumBits = VT.getSizeInBits();
11457 return getVShift(true, VT,
11458 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
11459 VT, Op.getOperand(1)),
11460 NumBits/2, DAG, *this, dl);
11461 }
11462
11463 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
11464 return SDValue();
11465
11466 // Otherwise, if this is a vector with i32 or f32 elements, and the element
11467 // is a non-constant being inserted into an element other than the low one,
11468 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
11469 // movd/movss) to move this into the low element, then shuffle it into
11470 // place.
11471 if (EVTBits == 32) {
11472 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
11473 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
11474 }
11475 }
11476
11477 // Splat is obviously ok. Let legalizer expand it to a shuffle.
11478 if (Values.size() == 1) {
11479 if (EVTBits == 32) {
11480 // Instead of a shuffle like this:
11481 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
11482 // Check if it's possible to issue this instead.
11483 // shuffle (vload ptr), undef, <1, 1, 1, 1>
11484 unsigned Idx = NonZeroMask.countr_zero();
11485 SDValue Item = Op.getOperand(Idx);
11486 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
11487 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
11488 }
11489 return SDValue();
11490 }
11491
11492 // A vector full of immediates; various special cases are already
11493 // handled, so this is best done with a single constant-pool load.
11494 if (IsAllConstants)
11495 return SDValue();
11496
11497 if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, DAG, Subtarget))
11498 return V;
11499
11500 // See if we can use a vector load to get all of the elements.
11501 {
11502 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
11503 if (SDValue LD =
11504 EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
11505 return LD;
11506 }
11507
11508 // If this is a splat of pairs of 32-bit elements, we can use a narrower
11509 // build_vector and broadcast it.
11510 // TODO: We could probably generalize this more.
11511 if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
11512 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
11513 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
11514 auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
11515 // Make sure all the even/odd operands match.
11516 for (unsigned i = 2; i != NumElems; ++i)
11517 if (Ops[i % 2] != Op.getOperand(i))
11518 return false;
11519 return true;
11520 };
11521 if (CanSplat(Op, NumElems, Ops)) {
11522 MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
11523 MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
11524 // Create a new build vector and cast to v2i64/v2f64.
11525 SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
11526 DAG.getBuildVector(NarrowVT, dl, Ops));
11527 // Broadcast from v2i64/v2f64 and cast to final VT.
11528 MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems / 2);
11529 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
11530 NewBV));
11531 }
11532 }
11533
11534 // For AVX-length vectors, build the individual 128-bit pieces and use
11535 // shuffles to put them in place.
11536 if (VT.getSizeInBits() > 128) {
11537 MVT HVT = MVT::getVectorVT(EltVT, NumElems / 2);
11538
11539 // Build both the lower and upper subvector.
11540 SDValue Lower =
11541 DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
11542 SDValue Upper = DAG.getBuildVector(
11543 HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));
11544
11545 // Recreate the wider vector with the lower and upper part.
11546 return concatSubVectors(Lower, Upper, DAG, dl);
11547 }
11548
11549 // Let legalizer expand 2-wide build_vectors.
11550 if (EVTBits == 64) {
11551 if (NumNonZero == 1) {
11552 // One half is zero or undef.
11553 unsigned Idx = NonZeroMask.countr_zero();
11554 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
11555 Op.getOperand(Idx));
11556 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
11557 }
11558 return SDValue();
11559 }
11560
11561 // If element VT is < 32 bits, convert it to inserts into a zero vector.
11562 if (EVTBits == 8 && NumElems == 16)
11563 if (SDValue V = LowerBuildVectorv16i8(Op, NonZeroMask, NumNonZero, NumZero,
11564 DAG, Subtarget))
11565 return V;
11566
11567 if (EltVT == MVT::i16 && NumElems == 8)
11568 if (SDValue V = LowerBuildVectorv8i16(Op, NonZeroMask, NumNonZero, NumZero,
11569 DAG, Subtarget))
11570 return V;
11571
11572 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
11573 if (EVTBits == 32 && NumElems == 4)
11574 if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
11575 return V;
11576
11577 // If element VT is == 32 bits, turn it into a number of shuffles.
11578 if (NumElems == 4 && NumZero > 0) {
11579 SmallVector<SDValue, 8> Ops(NumElems);
11580 for (unsigned i = 0; i < 4; ++i) {
11581 bool isZero = !NonZeroMask[i];
11582 if (isZero)
11583 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
11584 else
11585 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
11586 }
11587
11588 for (unsigned i = 0; i < 2; ++i) {
11589 switch (NonZeroMask.extractBitsAsZExtValue(2, i * 2)) {
11590       default: llvm_unreachable("Unexpected NonZero count");
11591 case 0:
11592 Ops[i] = Ops[i*2]; // Must be a zero vector.
11593 break;
11594 case 1:
11595 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
11596 break;
11597 case 2:
11598 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
11599 break;
11600 case 3:
11601 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
11602 break;
11603 }
11604 }
11605
11606 bool Reverse1 = NonZeroMask.extractBitsAsZExtValue(2, 0) == 2;
11607 bool Reverse2 = NonZeroMask.extractBitsAsZExtValue(2, 2) == 2;
11608 int MaskVec[] = {
11609 Reverse1 ? 1 : 0,
11610 Reverse1 ? 0 : 1,
11611 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
11612 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
11613 };
11614 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
11615 }
11616
11617 assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
11618
11619 // Check for a build vector from mostly shuffle plus few inserting.
11620 if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
11621 return Sh;
11622
11623 // For SSE 4.1, use insertps to put the high elements into the low element.
11624 if (Subtarget.hasSSE41() && EltVT != MVT::f16) {
11625 SDValue Result;
11626 if (!Op.getOperand(0).isUndef())
11627 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
11628 else
11629 Result = DAG.getUNDEF(VT);
11630
11631 for (unsigned i = 1; i < NumElems; ++i) {
11632 if (Op.getOperand(i).isUndef()) continue;
11633 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
11634 Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
11635 }
11636 return Result;
11637 }
11638
11639 // Otherwise, expand into a number of unpckl*; start by extending each of
11640 // our (non-undef) elements to the full vector width with the element in the
11641 // bottom slot of the vector (which generates no code for SSE).
11642 SmallVector<SDValue, 8> Ops(NumElems);
11643 for (unsigned i = 0; i < NumElems; ++i) {
11644 if (!Op.getOperand(i).isUndef())
11645 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
11646 else
11647 Ops[i] = DAG.getUNDEF(VT);
11648 }
11649
11650 // Next, we iteratively mix elements, e.g. for v4f32:
11651 // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
11652 // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
11653 // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
11654 for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
11655 // Generate scaled UNPCKL shuffle mask.
11656 SmallVector<int, 16> Mask;
11657 for(unsigned i = 0; i != Scale; ++i)
11658 Mask.push_back(i);
11659 for (unsigned i = 0; i != Scale; ++i)
11660 Mask.push_back(NumElems+i);
11661 Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
11662
11663 for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
11664 Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
11665 }
11666 return Ops[0];
11667}
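// Illustrative standalone sketch (not part of this file), assuming only plain
// C++: the scaled UNPCKL masks used in the final expansion above, with
// SM_SentinelUndef written as -1. For NumElems == 4 the two rounds use
// {0, 4, -1, -1} (Scale = 1) and then {0, 1, 4, 5} (Scale = 2), matching the
// v4f32 unpcklps/unpcklpd sequence from the comment. The helper name is
// invented for the sketch; Mask must hold NumElems entries.
static void scaledUnpcklMaskSketch(int NumElems, int Scale, int *Mask) {
  int Pos = 0;
  for (int i = 0; i != Scale; ++i)
    Mask[Pos++] = i;            // low elements of the first operand
  for (int i = 0; i != Scale; ++i)
    Mask[Pos++] = NumElems + i; // low elements of the second operand
  while (Pos != NumElems)
    Mask[Pos++] = -1;           // remaining lanes are undef
}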
11668
11669// 256-bit AVX can use the vinsertf128 instruction
11670// to create 256-bit vectors from two other 128-bit ones.
11671// TODO: Detect subvector broadcast here instead of DAG combine?
11672static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
11673 const X86Subtarget &Subtarget) {
11674 SDLoc dl(Op);
11675 MVT ResVT = Op.getSimpleValueType();
11676
11677   assert((ResVT.is256BitVector() ||
11678           ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
11679
11680 unsigned NumOperands = Op.getNumOperands();
11681 unsigned NumFreezeUndef = 0;
11682 unsigned NumZero = 0;
11683 unsigned NumNonZero = 0;
11684 unsigned NonZeros = 0;
11685 for (unsigned i = 0; i != NumOperands; ++i) {
11686 SDValue SubVec = Op.getOperand(i);
11687 if (SubVec.isUndef())
11688 continue;
11689 if (ISD::isFreezeUndef(SubVec.getNode())) {
11690 // If the freeze(undef) has multiple uses then we must fold to zero.
11691 if (SubVec.hasOneUse())
11692 ++NumFreezeUndef;
11693 else
11694 ++NumZero;
11695 }
11696 else if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
11697 ++NumZero;
11698 else {
11699 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
11700 NonZeros |= 1 << i;
11701 ++NumNonZero;
11702 }
11703 }
11704
11705 // If we have more than 2 non-zeros, build each half separately.
11706 if (NumNonZero > 2) {
11707 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
11708 ArrayRef<SDUse> Ops = Op->ops();
11709 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
11710 Ops.slice(0, NumOperands/2));
11711 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
11712 Ops.slice(NumOperands/2));
11713 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
11714 }
11715
11716 // Otherwise, build it up through insert_subvectors.
11717 SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
11718 : (NumFreezeUndef ? DAG.getFreeze(DAG.getUNDEF(ResVT))
11719 : DAG.getUNDEF(ResVT));
11720
11721 MVT SubVT = Op.getOperand(0).getSimpleValueType();
11722 unsigned NumSubElems = SubVT.getVectorNumElements();
11723 for (unsigned i = 0; i != NumOperands; ++i) {
11724 if ((NonZeros & (1 << i)) == 0)
11725 continue;
11726
11727 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec,
11728 Op.getOperand(i),
11729 DAG.getIntPtrConstant(i * NumSubElems, dl));
11730 }
11731
11732 return Vec;
11733}
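To make the NonZeros bookkeeping above concrete, here is a minimal standalone C++ sketch (not LLVM code; the names and plain-integer inputs are hypothetical): each non-zero operand sets bit i after an assert keeps the shift in range, and more than two set bits is exactly the case the lowering splits into halves.

#include <cassert>
#include <climits>
#include <cstdio>
#include <vector>

int main() {
  // 1 = non-zero subvector, 0 = zero/undef subvector (hypothetical input).
  std::vector<int> SubVecIsNonZero = {1, 0, 1, 1};
  unsigned NonZeros = 0, NumNonZero = 0;
  for (unsigned i = 0; i != SubVecIsNonZero.size(); ++i) {
    if (!SubVecIsNonZero[i])
      continue;
    assert(i < sizeof(NonZeros) * CHAR_BIT); // keep the shift in range
    NonZeros |= 1u << i;
    ++NumNonZero;
  }
  // With three non-zero operands the lowering above would recurse on halves.
  std::printf("NonZeros mask = 0x%x, NumNonZero = %u\n", NonZeros, NumNonZero);
  assert(NonZeros == 0xD && NumNonZero == 3);
}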
11734
11735// Returns true if the given node is a type promotion (by concatenating i1
11736// zeros) of the result of a node that already zeros all upper bits of
11737// k-register.
11738// TODO: Merge this with LowerAVXCONCAT_VECTORS?
11739static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
11740 const X86Subtarget &Subtarget,
11741 SelectionDAG & DAG) {
11742 SDLoc dl(Op);
11743 MVT ResVT = Op.getSimpleValueType();
11744 unsigned NumOperands = Op.getNumOperands();
11745
11746 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
11747 "Unexpected number of operands in CONCAT_VECTORS");
11748
11749 uint64_t Zeros = 0;
11750 uint64_t NonZeros = 0;
11751 for (unsigned i = 0; i != NumOperands; ++i) {
11752 SDValue SubVec = Op.getOperand(i);
11753 if (SubVec.isUndef())
11754 continue;
11755 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
11756 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
11757 Zeros |= (uint64_t)1 << i;
11758 else
11759 NonZeros |= (uint64_t)1 << i;
11760 }
11761
11762 unsigned NumElems = ResVT.getVectorNumElements();
11763
11764 // If we are inserting a non-zero vector and there are zeros in the LSBs and
11765 // undefs in the MSBs, we need to emit a KSHIFTL. The generic lowering to
11766 // insert_subvector will give us two kshifts.
11767 if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&
11768 Log2_64(NonZeros) != NumOperands - 1) {
11769 MVT ShiftVT = ResVT;
11770 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
11771 ShiftVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
11772 unsigned Idx = Log2_64(NonZeros);
11773 SDValue SubVec = Op.getOperand(Idx);
11774 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
11775 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ShiftVT,
11776 DAG.getUNDEF(ShiftVT), SubVec,
11777 DAG.getIntPtrConstant(0, dl));
11778 Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, SubVec,
11779 DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8));
11780 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op,
11781 DAG.getIntPtrConstant(0, dl));
11782 }
11783
11784 // If there are zero or one non-zeros we can handle this very simply.
11785 if (NonZeros == 0 || isPowerOf2_64(NonZeros)) {
11786 SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT);
11787 if (!NonZeros)
11788 return Vec;
11789 unsigned Idx = Log2_64(NonZeros);
11790 SDValue SubVec = Op.getOperand(Idx);
11791 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
11792 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
11793 DAG.getIntPtrConstant(Idx * SubVecNumElts, dl));
11794 }
11795
11796 if (NumOperands > 2) {
11797 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
11798 ArrayRef<SDUse> Ops = Op->ops();
11799 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
11800 Ops.slice(0, NumOperands/2));
11801 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
11802 Ops.slice(NumOperands/2));
11803 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
11804 }
11805
11806 assert(llvm::popcount(NonZeros) == 2 && "Simple cases not handled?");
11807
11808 if (ResVT.getVectorNumElements() >= 16)
11809 return Op; // The operation is legal with KUNPCK
11810
11811 SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT,
11812 DAG.getUNDEF(ResVT), Op.getOperand(0),
11813 DAG.getIntPtrConstant(0, dl));
11814 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
11815 DAG.getIntPtrConstant(NumElems/2, dl));
11816}
11817
11818static SDValue LowerCONCAT_VECTORS(SDValue Op,
11819 const X86Subtarget &Subtarget,
11820 SelectionDAG &DAG) {
11821 MVT VT = Op.getSimpleValueType();
11822 if (VT.getVectorElementType() == MVT::i1)
11823 return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
11824
11825 assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
11826 (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
11827 Op.getNumOperands() == 4)));
11828
11829 // AVX can use the vinsertf128 instruction to create 256-bit vectors
11830 // from two other 128-bit ones.
11831
11832 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
11833 return LowerAVXCONCAT_VECTORS(Op, DAG, Subtarget);
11834}
11835
11836//===----------------------------------------------------------------------===//
11837// Vector shuffle lowering
11838//
11839// This is an experimental code path for lowering vector shuffles on x86. It is
11840// designed to handle arbitrary vector shuffles and blends, gracefully
11841// degrading performance as necessary. It works hard to recognize idiomatic
11842// shuffles and lower them to optimal instruction patterns without leaving
11843// a framework that allows reasonably efficient handling of all vector shuffle
11844// patterns.
11845//===----------------------------------------------------------------------===//
11846
11847/// Tiny helper function to identify a no-op mask.
11848///
11849/// This is a somewhat boring predicate function. It checks whether the mask
11850/// array input, which is assumed to be a single-input shuffle mask of the kind
11851/// used by the X86 shuffle instructions (not a fully general
11852/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
11853/// in-place shuffle are 'no-op's.
11854static bool isNoopShuffleMask(ArrayRef<int> Mask) {
11855 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11856 assert(Mask[i] >= -1 && "Out of bound mask element!");
11857 if (Mask[i] >= 0 && Mask[i] != i)
11858 return false;
11859 }
11860 return true;
11861}
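A small self-contained sketch of the same predicate, assuming a plain int mask with -1 for undef (the helper name is made up for illustration): an in-place mask with undefs is a no-op, any displaced element is not.

#include <cassert>
#include <vector>

static bool isNoopMask(const std::vector<int> &Mask) {
  for (int i = 0, Size = (int)Mask.size(); i < Size; ++i)
    if (Mask[i] >= 0 && Mask[i] != i)
      return false;
  return true;
}

int main() {
  assert(isNoopMask({0, -1, 2, 3}));  // in-place with an undef element
  assert(!isNoopMask({1, 0, 2, 3}));  // elements 0 and 1 are swapped
}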
11862
11863/// Test whether there are elements crossing LaneSizeInBits lanes in this
11864/// shuffle mask.
11865///
11866/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
11867/// and we routinely test for these.
11868static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits,
11869 unsigned ScalarSizeInBits,
11870 ArrayRef<int> Mask) {
11871 assert(LaneSizeInBits && ScalarSizeInBits &&
11872 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
11873 "Illegal shuffle lane size");
11874 int LaneSize = LaneSizeInBits / ScalarSizeInBits;
11875 int Size = Mask.size();
11876 for (int i = 0; i < Size; ++i)
11877 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
11878 return true;
11879 return false;
11880}
11881
11882/// Test whether there are elements crossing 128-bit lanes in this
11883/// shuffle mask.
11884static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
11885 return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask);
11886}
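For illustration, a hedged standalone sketch of the lane-crossing test with 32-bit elements in 128-bit lanes (helper name is hypothetical): the v8f32 lane swap below crosses lanes, while an in-lane reverse does not.

#include <cassert>
#include <vector>

static bool crossesLanes(unsigned LaneBits, unsigned EltBits,
                         const std::vector<int> &Mask) {
  int LaneSize = LaneBits / EltBits;
  int Size = (int)Mask.size();
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
      return true;
  return false;
}

int main() {
  // v8f32: swapping the two 128-bit halves crosses lanes...
  assert(crossesLanes(128, 32, {4, 5, 6, 7, 0, 1, 2, 3}));
  // ...but reversing elements within each half does not.
  assert(!crossesLanes(128, 32, {3, 2, 1, 0, 7, 6, 5, 4}));
}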
11887
11888/// Test whether elements in each LaneSizeInBits lane in this shuffle mask come
11889/// from multiple lanes - this is different to isLaneCrossingShuffleMask to
11890/// better support 'repeated mask + lane permute' style shuffles.
11891static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits,
11892 unsigned ScalarSizeInBits,
11893 ArrayRef<int> Mask) {
11894 assert(LaneSizeInBits && ScalarSizeInBits &&
11895 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
11896 "Illegal shuffle lane size");
11897 int NumElts = Mask.size();
11898 int NumEltsPerLane = LaneSizeInBits / ScalarSizeInBits;
11899 int NumLanes = NumElts / NumEltsPerLane;
11900 if (NumLanes > 1) {
11901 for (int i = 0; i != NumLanes; ++i) {
11902 int SrcLane = -1;
11903 for (int j = 0; j != NumEltsPerLane; ++j) {
11904 int M = Mask[(i * NumEltsPerLane) + j];
11905 if (M < 0)
11906 continue;
11907 int Lane = (M % NumElts) / NumEltsPerLane;
11908 if (SrcLane >= 0 && SrcLane != Lane)
11909 return true;
11910 SrcLane = Lane;
11911 }
11912 }
11913 }
11914 return false;
11915}
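A quick contrast with isLaneCrossingShuffleMask, again as a standalone sketch with a made-up helper name: the lane-swap mask crosses lanes element-by-element, yet each destination lane reads from a single source lane, so it is not "multi-lane"; a mask that mixes both source lanes inside one destination lane is.

#include <cassert>
#include <vector>

static bool isMultiLane(unsigned LaneBits, unsigned EltBits,
                        const std::vector<int> &Mask) {
  int NumElts = (int)Mask.size();
  int EltsPerLane = LaneBits / EltBits;
  int NumLanes = NumElts / EltsPerLane;
  for (int i = 0; i < NumLanes; ++i) {
    int SrcLane = -1;
    for (int j = 0; j < EltsPerLane; ++j) {
      int M = Mask[i * EltsPerLane + j];
      if (M < 0)
        continue;
      int Lane = (M % NumElts) / EltsPerLane;
      if (SrcLane >= 0 && SrcLane != Lane)
        return true;
      SrcLane = Lane;
    }
  }
  return false;
}

int main() {
  // Lane swap: crosses lanes, but each destination lane uses one source lane.
  assert(!isMultiLane(128, 32, {4, 5, 6, 7, 0, 1, 2, 3}));
  // Destination lane 0 mixes source lanes 0 and 1.
  assert(isMultiLane(128, 32, {0, 4, 1, 5, -1, -1, -1, -1}));
}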
11916
11917/// Test whether a shuffle mask is equivalent within each sub-lane.
11918///
11919/// This checks a shuffle mask to see if it is performing the same
11920/// lane-relative shuffle in each sub-lane. This trivially implies
11921/// that it is also not lane-crossing. It may however involve a blend from the
11922/// same lane of a second vector.
11923///
11924/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
11925/// non-trivial to compute in the face of undef lanes. The representation is
11926/// suitable for use with existing 128-bit shuffles as entries from the second
11927/// vector have been remapped to [LaneSize, 2*LaneSize).
11928static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
11929 ArrayRef<int> Mask,
11930 SmallVectorImpl<int> &RepeatedMask) {
11931 auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
11932 RepeatedMask.assign(LaneSize, -1);
11933 int Size = Mask.size();
11934 for (int i = 0; i < Size; ++i) {
11935 assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
11936 if (Mask[i] < 0)
11937 continue;
11938 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
11939 // This entry crosses lanes, so there is no way to model this shuffle.
11940 return false;
11941
11942 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
11943 // Adjust second vector indices to start at LaneSize instead of Size.
11944 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
11945 : Mask[i] % LaneSize + LaneSize;
11946 if (RepeatedMask[i % LaneSize] < 0)
11947 // This is the first non-undef entry in this slot of a 128-bit lane.
11948 RepeatedMask[i % LaneSize] = LocalM;
11949 else if (RepeatedMask[i % LaneSize] != LocalM)
11950 // Found a mismatch with the repeated mask.
11951 return false;
11952 }
11953 return true;
11954}
11955
11956/// Test whether a shuffle mask is equivalent within each 128-bit lane.
11957static bool
11958is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
11959 SmallVectorImpl<int> &RepeatedMask) {
11960 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
11961}
11962
11963static bool
11964is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
11965 SmallVector<int, 32> RepeatedMask;
11966 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
11967}
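To make the repeated-mask representation concrete, a hedged standalone sketch for 32-bit elements in 128-bit lanes (helper name invented for the example): the two-input v8f32 mask {0,1,8,9,4,5,12,13} repeats per lane as {0,1,4,5}, with second-input entries remapped to [LaneSize, 2*LaneSize).

#include <cassert>
#include <vector>

// Returns true and fills Repeated if Mask repeats in every 128-bit lane.
static bool repeatedMask(unsigned EltBits, const std::vector<int> &Mask,
                         std::vector<int> &Repeated) {
  int LaneSize = 128 / EltBits;
  int Size = (int)Mask.size();
  Repeated.assign(LaneSize, -1);
  for (int i = 0; i < Size; ++i) {
    if (Mask[i] < 0)
      continue;
    if ((Mask[i] % Size) / LaneSize != i / LaneSize)
      return false; // crosses lanes
    int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
                                : Mask[i] % LaneSize + LaneSize;
    if (Repeated[i % LaneSize] < 0)
      Repeated[i % LaneSize] = LocalM;
    else if (Repeated[i % LaneSize] != LocalM)
      return false;
  }
  return true;
}

int main() {
  std::vector<int> Repeated;
  assert(repeatedMask(32, {0, 1, 8, 9, 4, 5, 12, 13}, Repeated));
  assert((Repeated == std::vector<int>{0, 1, 4, 5}));
}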
11968
11969/// Test whether a shuffle mask is equivalent within each 256-bit lane.
11970static bool
11971is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
11972 SmallVectorImpl<int> &RepeatedMask) {
11973 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
11974}
11975
11976/// Test whether a target shuffle mask is equivalent within each sub-lane.
11977/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
11978static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits,
11979 unsigned EltSizeInBits,
11980 ArrayRef<int> Mask,
11981 SmallVectorImpl<int> &RepeatedMask) {
11982 int LaneSize = LaneSizeInBits / EltSizeInBits;
11983 RepeatedMask.assign(LaneSize, SM_SentinelUndef);
11984 int Size = Mask.size();
11985 for (int i = 0; i < Size; ++i) {
11986 assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
11987 if (Mask[i] == SM_SentinelUndef)
11988 continue;
11989 if (Mask[i] == SM_SentinelZero) {
11990 if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
11991 return false;
11992 RepeatedMask[i % LaneSize] = SM_SentinelZero;
11993 continue;
11994 }
11995 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
11996 // This entry crosses lanes, so there is no way to model this shuffle.
11997 return false;
11998
11999 // Handle the in-lane shuffles by detecting if and when they repeat. Adjust
12000 // later vector indices to start at multiples of LaneSize instead of Size.
12001 int LaneM = Mask[i] / Size;
12002 int LocalM = (Mask[i] % LaneSize) + (LaneM * LaneSize);
12003 if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
12004 // This is the first non-undef entry in this slot of a 128-bit lane.
12005 RepeatedMask[i % LaneSize] = LocalM;
12006 else if (RepeatedMask[i % LaneSize] != LocalM)
12007 // Found a mismatch with the repeated mask.
12008 return false;
12009 }
12010 return true;
12011}
12012
12013/// Test whether a target shuffle mask is equivalent within each sub-lane.
12014/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
12015static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
12016 ArrayRef<int> Mask,
12017 SmallVectorImpl<int> &RepeatedMask) {
12018 return isRepeatedTargetShuffleMask(LaneSizeInBits, VT.getScalarSizeInBits(),
12019 Mask, RepeatedMask);
12020}
12021
12022/// Checks whether the vector elements referenced by two shuffle masks are
12023/// equivalent.
12024static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,
12025 int Idx, int ExpectedIdx) {
12026 assert(0 <= Idx && Idx < MaskSize && 0 <= ExpectedIdx &&
12027 ExpectedIdx < MaskSize && "Out of range element index");
12028 if (!Op || !ExpectedOp || Op.getOpcode() != ExpectedOp.getOpcode())
12029 return false;
12030
12031 switch (Op.getOpcode()) {
12032 case ISD::BUILD_VECTOR:
12033 // If the values are build vectors, we can look through them to find
12034 // equivalent inputs that make the shuffles equivalent.
12035 // TODO: Handle MaskSize != Op.getNumOperands()?
12036 if (MaskSize == (int)Op.getNumOperands() &&
12037 MaskSize == (int)ExpectedOp.getNumOperands())
12038 return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx);
12039 break;
12040 case X86ISD::VBROADCAST:
12041 case X86ISD::VBROADCAST_LOAD:
12042 // TODO: Handle MaskSize != Op.getValueType().getVectorNumElements()?
12043 return (Op == ExpectedOp &&
12044 (int)Op.getValueType().getVectorNumElements() == MaskSize);
12045 case X86ISD::HADD:
12046 case X86ISD::HSUB:
12047 case X86ISD::FHADD:
12048 case X86ISD::FHSUB:
12049 case X86ISD::PACKSS:
12050 case X86ISD::PACKUS:
12051 // HOP(X,X) can refer to the elt from the lower/upper half of a lane.
12052 // TODO: Handle MaskSize != NumElts?
12053 // TODO: Handle HOP(X,Y) vs HOP(Y,X) equivalence cases.
12054 if (Op == ExpectedOp && Op.getOperand(0) == Op.getOperand(1)) {
12055 MVT VT = Op.getSimpleValueType();
12056 int NumElts = VT.getVectorNumElements();
12057 if (MaskSize == NumElts) {
12058 int NumLanes = VT.getSizeInBits() / 128;
12059 int NumEltsPerLane = NumElts / NumLanes;
12060 int NumHalfEltsPerLane = NumEltsPerLane / 2;
12061 bool SameLane =
12062 (Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane);
12063 bool SameElt =
12064 (Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane);
12065 return SameLane && SameElt;
12066 }
12067 }
12068 break;
12069 }
12070
12071 return false;
12072}
12073
12074/// Checks whether a shuffle mask is equivalent to an explicit list of
12075/// arguments.
12076///
12077/// This is a fast way to test a shuffle mask against a fixed pattern:
12078///
12079 /// if (isShuffleEquivalent(Mask, {3, 2, 1, 0})) { ... }
12080///
12081/// It returns true if the mask is exactly as wide as the argument list, and
12082/// each element of the mask is either -1 (signifying undef) or the value given
12083/// in the argument.
12084static bool isShuffleEquivalent(ArrayRef<int> Mask, ArrayRef<int> ExpectedMask,
12085 SDValue V1 = SDValue(),
12086 SDValue V2 = SDValue()) {
12087 int Size = Mask.size();
12088 if (Size != (int)ExpectedMask.size())
12089 return false;
12090
12091 for (int i = 0; i < Size; ++i) {
12092 assert(Mask[i] >= -1 && "Out of bound mask element!");
12093 int MaskIdx = Mask[i];
12094 int ExpectedIdx = ExpectedMask[i];
12095 if (0 <= MaskIdx && MaskIdx != ExpectedIdx) {
12096 SDValue MaskV = MaskIdx < Size ? V1 : V2;
12097 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
12098 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
12099 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
12100 if (!IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
12101 return false;
12102 }
12103 }
12104 return true;
12105}
12106
12107/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
12108///
12109/// The masks must be exactly the same width.
12110///
12111/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
12112/// value in ExpectedMask is always accepted. Otherwise the indices must match.
12113///
12114/// SM_SentinelZero is accepted as a valid negative index but must match in
12115/// both, or via a known bits test.
12116static bool isTargetShuffleEquivalent(MVT VT, ArrayRef<int> Mask,
12117 ArrayRef<int> ExpectedMask,
12118 const SelectionDAG &DAG,
12119 SDValue V1 = SDValue(),
12120 SDValue V2 = SDValue()) {
12121 int Size = Mask.size();
12122 if (Size != (int)ExpectedMask.size())
12123 return false;
12124 assert(llvm::all_of(ExpectedMask,
12125 [Size](int M) { return isInRange(M, 0, 2 * Size); }) &&
12126 "Illegal target shuffle mask");
12127
12128 // Check for out-of-range target shuffle mask indices.
12129 if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size))
12130 return false;
12131
12132 // Don't use V1/V2 if they're not the same size as the shuffle mask type.
12133 if (V1 && V1.getValueSizeInBits() != VT.getSizeInBits())
12134 V1 = SDValue();
12135 if (V2 && V2.getValueSizeInBits() != VT.getSizeInBits())
12136 V2 = SDValue();
12137
12138 APInt ZeroV1 = APInt::getZero(Size);
12139 APInt ZeroV2 = APInt::getZero(Size);
12140
12141 for (int i = 0; i < Size; ++i) {
12142 int MaskIdx = Mask[i];
12143 int ExpectedIdx = ExpectedMask[i];
12144 if (MaskIdx == SM_SentinelUndef || MaskIdx == ExpectedIdx)
12145 continue;
12146 if (MaskIdx == SM_SentinelZero) {
12147 // If we need this expected index to be a zero element, then update the
12148 // relevant zero mask and perform the known bits at the end to minimize
12149 // repeated computes.
12150 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
12151 if (ExpectedV &&
12152 Size == (int)ExpectedV.getValueType().getVectorNumElements()) {
12153 int BitIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
12154 APInt &ZeroMask = ExpectedIdx < Size ? ZeroV1 : ZeroV2;
12155 ZeroMask.setBit(BitIdx);
12156 continue;
12157 }
12158 }
12159 if (MaskIdx >= 0) {
12160 SDValue MaskV = MaskIdx < Size ? V1 : V2;
12161 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
12162 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
12163 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
12164 if (IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
12165 continue;
12166 }
12167 return false;
12168 }
12169 return (ZeroV1.isZero() || DAG.MaskedVectorIsZero(V1, ZeroV1)) &&
12170 (ZeroV2.isZero() || DAG.MaskedVectorIsZero(V2, ZeroV2));
12171}
12172
12173// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
12174// instructions.
12175static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT,
12176 const SelectionDAG &DAG) {
12177 if (VT != MVT::v8i32 && VT != MVT::v8f32)
12178 return false;
12179
12180 SmallVector<int, 8> Unpcklwd;
12181 createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
12182 /* Unary = */ false);
12183 SmallVector<int, 8> Unpckhwd;
12184 createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
12185 /* Unary = */ false);
12186 bool IsUnpackwdMask = (isTargetShuffleEquivalent(VT, Mask, Unpcklwd, DAG) ||
12187 isTargetShuffleEquivalent(VT, Mask, Unpckhwd, DAG));
12188 return IsUnpackwdMask;
12189}
12190
12191static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask,
12192 const SelectionDAG &DAG) {
12193 // Create 128-bit vector type based on mask size.
12194 MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
12195 MVT VT = MVT::getVectorVT(EltVT, Mask.size());
12196
12197 // We can't assume a canonical shuffle mask, so try the commuted version too.
12198 SmallVector<int, 4> CommutedMask(Mask);
12199 ShuffleVectorSDNode::commuteMask(CommutedMask);
12200
12201 // Match any of unary/binary or low/high.
12202 for (unsigned i = 0; i != 4; ++i) {
12203 SmallVector<int, 16> UnpackMask;
12204 createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
12205 if (isTargetShuffleEquivalent(VT, Mask, UnpackMask, DAG) ||
12206 isTargetShuffleEquivalent(VT, CommutedMask, UnpackMask, DAG))
12207 return true;
12208 }
12209 return false;
12210}
12211
12212/// Return true if a shuffle mask chooses elements identically in its top and
12213/// bottom halves. For example, any splat mask has the same top and bottom
12214/// halves. If an element is undefined in only one half of the mask, the halves
12215/// are not considered identical.
12216static bool hasIdenticalHalvesShuffleMask(ArrayRef<int> Mask) {
12217 assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask");
12218 unsigned HalfSize = Mask.size() / 2;
12219 for (unsigned i = 0; i != HalfSize; ++i) {
12220 if (Mask[i] != Mask[i + HalfSize])
12221 return false;
12222 }
12223 return true;
12224}
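A tiny standalone sketch of the identical-halves check, with -1 meaning undef and a hypothetical helper name: a splat qualifies, while an undef present in only one half does not.

#include <cassert>
#include <vector>

static bool identicalHalves(const std::vector<int> &Mask) {
  unsigned Half = Mask.size() / 2;
  for (unsigned i = 0; i != Half; ++i)
    if (Mask[i] != Mask[i + Half])
      return false;
  return true;
}

int main() {
  assert(identicalHalves({0, 0, 0, 0}));    // splat
  assert(!identicalHalves({0, 1, 0, -1}));  // undef in one half only
}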
12225
12226/// Get a 4-lane 8-bit shuffle immediate for a mask.
12227///
12228/// This helper function produces an 8-bit shuffle immediate corresponding to
12229/// the ubiquitous shuffle encoding scheme used in x86 instructions for
12230/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
12231/// example.
12232///
12233/// NB: We rely heavily on "undef" masks preserving the input lane.
12234static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
12235 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
12236 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
12237 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
12238 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
12239 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
12240
12241 // If the mask only uses one non-undef element, then fully 'splat' it to
12242 // improve later broadcast matching.
12243 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
12244 assert(0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask");
12245
12246 int FirstElt = Mask[FirstIndex];
12247 if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }))
12248 return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt;
12249
12250 unsigned Imm = 0;
12251 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
12252 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
12253 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
12254 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
12255 return Imm;
12256}
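A worked example of the 2-bits-per-lane immediate encoding, as a minimal sketch that skips the splat special case above and treats undef as "keep the input lane" (the helper name is made up): the identity mask encodes as 0xE4 and the full reverse as 0x1B.

#include <array>
#include <cassert>

static unsigned v4ShuffleImm(const std::array<int, 4> &Mask) {
  unsigned Imm = 0;
  for (unsigned i = 0; i != 4; ++i) {
    unsigned M = Mask[i] < 0 ? i : (unsigned)Mask[i]; // undef keeps lane i
    Imm |= M << (2 * i);
  }
  return Imm;
}

int main() {
  assert(v4ShuffleImm({0, 1, 2, 3}) == 0xE4);   // identity
  assert(v4ShuffleImm({3, 2, 1, 0}) == 0x1B);   // full reverse
  assert(v4ShuffleImm({2, -1, -1, 3}) == 0xE6); // undefs preserve lanes 1 and 2
}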
12257
12258static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
12259 SelectionDAG &DAG) {
12260 return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
12261}
12262
12263 // The shuffle result is as follows:
12264 // 0*a[0] 0*a[1] ... 0*a[n], n >= 0, where the a[] elements are in ascending order.
12265 // Each element of Zeroable corresponds to a particular element of Mask,
12266 // as described in the computeZeroableShuffleElements function.
12267 //
12268 // The function looks for a sub-mask whose nonzero elements are in
12269 // increasing order. If such a sub-mask exists, the function returns true.
12270static bool isNonZeroElementsInOrder(const APInt &Zeroable,
12271 ArrayRef<int> Mask, const EVT &VectorType,
12272 bool &IsZeroSideLeft) {
12273 int NextElement = -1;
12274 // Check if the Mask's nonzero elements are in increasing order.
12275 for (int i = 0, e = Mask.size(); i < e; i++) {
12276 // Checks if the mask's zeros elements are built from only zeros.
12277 assert(Mask[i] >= -1 && "Out of bound mask element!");
12278 if (Mask[i] < 0)
12279 return false;
12280 if (Zeroable[i])
12281 continue;
12282 // Find the lowest non zero element
12283 if (NextElement < 0) {
12284 NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
12285 IsZeroSideLeft = NextElement != 0;
12286 }
12287 // Exit if the mask's non zero elements are not in increasing order.
12288 if (NextElement != Mask[i])
12289 return false;
12290 NextElement++;
12291 }
12292 return true;
12293}
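As a hedged illustration of the pattern this predicate accepts (the VEXPAND shape used by lowerShuffleToEXPAND below): consecutive elements of one source interleaved with zeroable slots, in increasing order. The names in this standalone sketch are hypothetical.

#include <cassert>
#include <vector>

// Zeroable[i] == true means element i is known to be zero.
static bool nonZeroInOrder(const std::vector<bool> &Zeroable,
                           const std::vector<int> &Mask, int NumElts,
                           bool &ZeroSideLeft) {
  int Next = -1;
  for (size_t i = 0; i < Mask.size(); ++i) {
    if (Mask[i] < 0)
      return false;
    if (Zeroable[i])
      continue;
    if (Next < 0) {
      Next = Mask[i] != 0 ? NumElts : 0;
      ZeroSideLeft = Next != 0;
    }
    if (Next != Mask[i])
      return false;
    ++Next;
  }
  return true;
}

int main() {
  bool ZeroSideLeft = false;
  // v4i32: zeros in slots 0 and 2, V2 elements 4 and 5 expanded into slots 1 and 3.
  assert(nonZeroInOrder({true, false, true, false}, {0, 4, 0, 5}, 4,
                        ZeroSideLeft));
  assert(ZeroSideLeft);
}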
12294
12295/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
12296static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
12297 ArrayRef<int> Mask, SDValue V1,
12298 SDValue V2, const APInt &Zeroable,
12299 const X86Subtarget &Subtarget,
12300 SelectionDAG &DAG) {
12301 int Size = Mask.size();
12302 int LaneSize = 128 / VT.getScalarSizeInBits();
12303 const int NumBytes = VT.getSizeInBits() / 8;
12304 const int NumEltBytes = VT.getScalarSizeInBits() / 8;
12305
12306 assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
12307 (Subtarget.hasAVX2() && VT.is256BitVector()) ||
12308 (Subtarget.hasBWI() && VT.is512BitVector()));
12309
12310 SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
12311 // Sign bit set in i8 mask means zero element.
12312 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
12313
12314 SDValue V;
12315 for (int i = 0; i < NumBytes; ++i) {
12316 int M = Mask[i / NumEltBytes];
12317 if (M < 0) {
12318 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
12319 continue;
12320 }
12321 if (Zeroable[i / NumEltBytes]) {
12322 PSHUFBMask[i] = ZeroMask;
12323 continue;
12324 }
12325
12326 // We can only use a single input of V1 or V2.
12327 SDValue SrcV = (M >= Size ? V2 : V1);
12328 if (V && V != SrcV)
12329 return SDValue();
12330 V = SrcV;
12331 M %= Size;
12332
12333 // PSHUFB can't cross lanes, ensure this doesn't happen.
12334 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
12335 return SDValue();
12336
12337 M = M % LaneSize;
12338 M = M * NumEltBytes + (i % NumEltBytes);
12339 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
12340 }
12341 assert(V && "Failed to find a source input")(static_cast <bool> (V && "Failed to find a source input"
) ? void (0) : __assert_fail ("V && \"Failed to find a source input\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 12341, __extension__
__PRETTY_FUNCTION__))
;
12342
12343 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
12344 return DAG.getBitcast(
12345 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
12346 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
12347}
12348
12349static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
12350 const X86Subtarget &Subtarget, SelectionDAG &DAG,
12351 const SDLoc &dl);
12352
12353// X86 has dedicated shuffle that can be lowered to VEXPAND
12354static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT,
12355 const APInt &Zeroable,
12356 ArrayRef<int> Mask, SDValue &V1,
12357 SDValue &V2, SelectionDAG &DAG,
12358 const X86Subtarget &Subtarget) {
12359 bool IsLeftZeroSide = true;
12360 if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
12361 IsLeftZeroSide))
12362 return SDValue();
12363 unsigned VEXPANDMask = (~Zeroable).getZExtValue();
12364 MVT IntegerType =
12365 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
12366 SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
12367 unsigned NumElts = VT.getVectorNumElements();
12368 assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
12369 "Unexpected number of vector elements");
12370 SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
12371 Subtarget, DAG, DL);
12372 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
12373 SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
12374 return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
12375}
12376
12377static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
12378 unsigned &UnpackOpcode, bool IsUnary,
12379 ArrayRef<int> TargetMask, const SDLoc &DL,
12380 SelectionDAG &DAG,
12381 const X86Subtarget &Subtarget) {
12382 int NumElts = VT.getVectorNumElements();
12383
12384 bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
12385 for (int i = 0; i != NumElts; i += 2) {
12386 int M1 = TargetMask[i + 0];
12387 int M2 = TargetMask[i + 1];
12388 Undef1 &= (SM_SentinelUndef == M1);
12389 Undef2 &= (SM_SentinelUndef == M2);
12390 Zero1 &= isUndefOrZero(M1);
12391 Zero2 &= isUndefOrZero(M2);
12392 }
12393 assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
12394 "Zeroable shuffle detected");
12395
12396 // Attempt to match the target mask against the unpack lo/hi mask patterns.
12397 SmallVector<int, 64> Unpckl, Unpckh;
12398 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
12399 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG, V1,
12400 (IsUnary ? V1 : V2))) {
12401 UnpackOpcode = X86ISD::UNPCKL;
12402 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
12403 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
12404 return true;
12405 }
12406
12407 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
12408 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG, V1,
12409 (IsUnary ? V1 : V2))) {
12410 UnpackOpcode = X86ISD::UNPCKH;
12411 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
12412 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
12413 return true;
12414 }
12415
12416 // If a unary shuffle, attempt to match it as an unpack lo/hi with zero.
12417 if (IsUnary && (Zero1 || Zero2)) {
12418 // Don't bother if we can blend instead.
12419 if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
12420 isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
12421 return false;
12422
12423 bool MatchLo = true, MatchHi = true;
12424 for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
12425 int M = TargetMask[i];
12426
12427 // Ignore if the input is known to be zero or the index is undef.
12428 if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
12429 (M == SM_SentinelUndef))
12430 continue;
12431
12432 MatchLo &= (M == Unpckl[i]);
12433 MatchHi &= (M == Unpckh[i]);
12434 }
12435
12436 if (MatchLo || MatchHi) {
12437 UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
12438 V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
12439 V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
12440 return true;
12441 }
12442 }
12443
12444 // If a binary shuffle, commute and try again.
12445 if (!IsUnary) {
12446 ShuffleVectorSDNode::commuteMask(Unpckl);
12447 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG)) {
12448 UnpackOpcode = X86ISD::UNPCKL;
12449 std::swap(V1, V2);
12450 return true;
12451 }
12452
12453 ShuffleVectorSDNode::commuteMask(Unpckh);
12454 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG)) {
12455 UnpackOpcode = X86ISD::UNPCKH;
12456 std::swap(V1, V2);
12457 return true;
12458 }
12459 }
12460
12461 return false;
12462}
12463
12464// X86 has dedicated unpack instructions that can handle specific blend
12465// operations: UNPCKH and UNPCKL.
12466static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT,
12467 ArrayRef<int> Mask, SDValue V1, SDValue V2,
12468 SelectionDAG &DAG) {
12469 SmallVector<int, 8> Unpckl;
12470 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
12471 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
12472 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
12473
12474 SmallVector<int, 8> Unpckh;
12475 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
12476 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
12477 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
12478
12479 // Commute and try again.
12480 ShuffleVectorSDNode::commuteMask(Unpckl);
12481 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
12482 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
12483
12484 ShuffleVectorSDNode::commuteMask(Unpckh);
12485 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
12486 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
12487
12488 return SDValue();
12489}
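For reference, a standalone sketch of the lo/hi unpack mask shapes being matched (this mirrors the shape only, not the createUnpackShuffleMask helper itself): with four elements per 128-bit lane the interleave masks are {0,4,1,5} for UNPCKL and {2,6,3,7} for UNPCKH.

#include <cassert>
#include <vector>

// Build the binary unpack lo/hi mask for NumEltsPerLane * NumLanes elements;
// like the AVX instructions, the interleave happens within each 128-bit lane.
static std::vector<int> unpackMask(int NumEltsPerLane, int NumLanes, bool Lo) {
  std::vector<int> Mask;
  int NumElts = NumEltsPerLane * NumLanes;
  for (int Lane = 0; Lane != NumLanes; ++Lane) {
    int Base = Lane * NumEltsPerLane + (Lo ? 0 : NumEltsPerLane / 2);
    for (int i = 0; i != NumEltsPerLane / 2; ++i) {
      Mask.push_back(Base + i);           // element from the first input
      Mask.push_back(Base + i + NumElts); // element from the second input
    }
  }
  return Mask;
}

int main() {
  // v4i32 (one 128-bit lane of four elements):
  assert((unpackMask(4, 1, true) == std::vector<int>{0, 4, 1, 5}));
  assert((unpackMask(4, 1, false) == std::vector<int>{2, 6, 3, 7}));
  // v8i32 (two lanes): the unpack stays within each 128-bit lane.
  assert((unpackMask(4, 2, true) ==
          std::vector<int>{0, 8, 1, 9, 4, 12, 5, 13}));
}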
12490
12491/// Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit)
12492/// followed by unpack 256-bit.
12493static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT,
12494 ArrayRef<int> Mask, SDValue V1,
12495 SDValue V2, SelectionDAG &DAG) {
12496 SmallVector<int, 32> Unpckl, Unpckh;
12497 createSplat2ShuffleMask(VT, Unpckl, /* Lo */ true);
12498 createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false);
12499
12500 unsigned UnpackOpcode;
12501 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
12502 UnpackOpcode = X86ISD::UNPCKL;
12503 else if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
12504 UnpackOpcode = X86ISD::UNPCKH;
12505 else
12506 return SDValue();
12507
12508 // This is a "natural" unpack operation (rather than the 128-bit sectored
12509 // operation implemented by AVX). We need to rearrange 64-bit chunks of the
12510 // input in order to use the x86 instruction.
12511 V1 = DAG.getVectorShuffle(MVT::v4f64, DL, DAG.getBitcast(MVT::v4f64, V1),
12512 DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3});
12513 V1 = DAG.getBitcast(VT, V1);
12514 return DAG.getNode(UnpackOpcode, DL, VT, V1, V1);
12515}
12516
12517// Check if the mask can be mapped to a TRUNCATE or VTRUNC, truncating the
12518// source into the lower elements and zeroing the upper elements.
12519static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT,
12520 ArrayRef<int> Mask, const APInt &Zeroable,
12521 const X86Subtarget &Subtarget) {
12522 if (!VT.is512BitVector() && !Subtarget.hasVLX())
12523 return false;
12524
12525 unsigned NumElts = Mask.size();
12526 unsigned EltSizeInBits = VT.getScalarSizeInBits();
12527 unsigned MaxScale = 64 / EltSizeInBits;
12528
12529 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
12530 unsigned SrcEltBits = EltSizeInBits * Scale;
12531 if (SrcEltBits < 32 && !Subtarget.hasBWI())
12532 continue;
12533 unsigned NumSrcElts = NumElts / Scale;
12534 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale))
12535 continue;
12536 unsigned UpperElts = NumElts - NumSrcElts;
12537 if (!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
12538 continue;
12539 SrcVT = MVT::getIntegerVT(EltSizeInBits * Scale);
12540 SrcVT = MVT::getVectorVT(SrcVT, NumSrcElts);
12541 DstVT = MVT::getIntegerVT(EltSizeInBits);
12542 if ((NumSrcElts * EltSizeInBits) >= 128) {
12543 // ISD::TRUNCATE
12544 DstVT = MVT::getVectorVT(DstVT, NumSrcElts);
12545 } else {
12546 // X86ISD::VTRUNC
12547 DstVT = MVT::getVectorVT(DstVT, 128 / EltSizeInBits);
12548 }
12549 return true;
12550 }
12551
12552 return false;
12553}
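To illustrate the mask shape this matcher accepts, a small hedged sketch with invented names: for a v8i16 result and Scale = 2, the lower four mask elements must follow the strided sequence 0,2,4,6 and the upper four must be zeroable.

#include <cassert>
#include <vector>

// Check Mask[0..NumSrcElts) == {0, Scale, 2*Scale, ...}, allowing -1 (undef),
// and require every element at or above NumSrcElts to be zeroable.
static bool matchesTruncShape(const std::vector<int> &Mask,
                              const std::vector<bool> &Zeroable,
                              unsigned Scale) {
  unsigned NumElts = Mask.size();
  unsigned NumSrcElts = NumElts / Scale;
  for (unsigned i = 0; i != NumSrcElts; ++i)
    if (Mask[i] >= 0 && Mask[i] != int(i * Scale))
      return false;
  for (unsigned i = NumSrcElts; i != NumElts; ++i)
    if (!Zeroable[i])
      return false;
  return true;
}

int main() {
  std::vector<int> Mask = {0, 2, 4, 6, -1, -1, -1, -1};
  std::vector<bool> Zeroable = {false, false, false, false,
                                true,  true,  true,  true};
  assert(matchesTruncShape(Mask, Zeroable, 2)); // v8i16 result of a v4i32 truncate
  assert(!matchesTruncShape({0, 1, 2, 3, -1, -1, -1, -1}, Zeroable, 2));
}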
12554
12555// Helper to create TRUNCATE/VTRUNC nodes, optionally with zero/undef upper
12556// element padding to the final DstVT.
12557static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src,
12558 const X86Subtarget &Subtarget,
12559 SelectionDAG &DAG, bool ZeroUppers) {
12560 MVT SrcVT = Src.getSimpleValueType();
12561 MVT DstSVT = DstVT.getScalarType();
12562 unsigned NumDstElts = DstVT.getVectorNumElements();
12563 unsigned NumSrcElts = SrcVT.getVectorNumElements();
12564 unsigned DstEltSizeInBits = DstVT.getScalarSizeInBits();
12565
12566 if (!DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
12567 return SDValue();
12568
12569 // Perform a direct ISD::TRUNCATE if possible.
12570 if (NumSrcElts == NumDstElts)
12571 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Src);
12572
12573 if (NumSrcElts > NumDstElts) {
12574 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
12575 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
12576 return extractSubVector(Trunc, 0, DAG, DL, DstVT.getSizeInBits());
12577 }
12578
12579 if ((NumSrcElts * DstEltSizeInBits) >= 128) {
12580 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
12581 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
12582 return widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
12583 DstVT.getSizeInBits());
12584 }
12585
12586 // Non-VLX targets must truncate from a 512-bit type, so we need to
12587 // widen, truncate and then possibly extract the original subvector.
12588 if (!Subtarget.hasVLX() && !SrcVT.is512BitVector()) {
12589 SDValue NewSrc = widenSubVector(Src, ZeroUppers, Subtarget, DAG, DL, 512);
12590 return getAVX512TruncNode(DL, DstVT, NewSrc, Subtarget, DAG, ZeroUppers);
12591 }
12592
12593 // Fallback to a X86ISD::VTRUNC, padding if necessary.
12594 MVT TruncVT = MVT::getVectorVT(DstSVT, 128 / DstEltSizeInBits);
12595 SDValue Trunc = DAG.getNode(X86ISD::VTRUNC, DL, TruncVT, Src);
12596 if (DstVT != TruncVT)
12597 Trunc = widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
12598 DstVT.getSizeInBits());
12599 return Trunc;
12600}
12601
12602// Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
12603//
12604// An example is the following:
12605//
12606// t0: ch = EntryToken
12607// t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
12608// t25: v4i32 = truncate t2
12609// t41: v8i16 = bitcast t25
12610// t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
12611// Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
12612// t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
12613// t18: v2i64 = bitcast t51
12614//
12615 // One can just use a single vpmovdw instruction; without avx512vl we need to
12616 // use the zmm variant and extract the lower subvector, padding with zeroes.
12617// TODO: Merge with lowerShuffleAsVTRUNC.
12618static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1,
12619 SDValue V2, ArrayRef<int> Mask,
12620 const APInt &Zeroable,
12621 const X86Subtarget &Subtarget,
12622 SelectionDAG &DAG) {
12623 assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type");
12624 if (!Subtarget.hasAVX512())
12625 return SDValue();
12626
12627 unsigned NumElts = VT.getVectorNumElements();
12628 unsigned EltSizeInBits = VT.getScalarSizeInBits();
12629 unsigned MaxScale = 64 / EltSizeInBits;
12630 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
12631 unsigned SrcEltBits = EltSizeInBits * Scale;
12632 unsigned NumSrcElts = NumElts / Scale;
12633 unsigned UpperElts = NumElts - NumSrcElts;
12634 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
12635 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
12636 continue;
12637
12638 // Attempt to find a matching source truncation, but as a fall back VLX
12639 // cases can use the VPMOV directly.
12640 SDValue Src = peekThroughBitcasts(V1);
12641 if (Src.getOpcode() == ISD::TRUNCATE &&
12642 Src.getScalarValueSizeInBits() == SrcEltBits) {
12643 Src = Src.getOperand(0);
12644 } else if (Subtarget.hasVLX()) {
12645 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
12646 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
12647 Src = DAG.getBitcast(SrcVT, Src);
12648 // Don't do this if PACKSS/PACKUS could perform it cheaper.
12649 if (Scale == 2 &&
12650 ((DAG.ComputeNumSignBits(Src) > EltSizeInBits) ||
12651 (DAG.computeKnownBits(Src).countMinLeadingZeros() >= EltSizeInBits)))
12652 return SDValue();
12653 } else
12654 return SDValue();
12655
12656 // VPMOVWB is only available with avx512bw.
12657 if (!Subtarget.hasBWI() && Src.getScalarValueSizeInBits() < 32)
12658 return SDValue();
12659
12660 bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts);
12661 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
12662 }
12663
12664 return SDValue();
12665}
12666
12667// Attempt to match binary shuffle patterns as a truncate.
12668static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,
12669 SDValue V2, ArrayRef<int> Mask,
12670 const APInt &Zeroable,
12671 const X86Subtarget &Subtarget,
12672 SelectionDAG &DAG) {
12673 assert((VT.is128BitVector() || VT.is256BitVector()) &&
12674 "Unexpected VTRUNC type");
12675 if (!Subtarget.hasAVX512())
12676 return SDValue();
12677
12678 unsigned NumElts = VT.getVectorNumElements();
12679 unsigned EltSizeInBits = VT.getScalarSizeInBits();
12680 unsigned MaxScale = 64 / EltSizeInBits;
12681 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
12682 // TODO: Support non-BWI VPMOVWB truncations?
12683 unsigned SrcEltBits = EltSizeInBits * Scale;
12684 if (SrcEltBits < 32 && !Subtarget.hasBWI())
12685 continue;
12686
12687 // Match shuffle <Ofs,Ofs+Scale,Ofs+2*Scale,..,undef_or_zero,undef_or_zero>
12688 // Bail if the V2 elements are undef.
12689 unsigned NumHalfSrcElts = NumElts / Scale;
12690 unsigned NumSrcElts = 2 * NumHalfSrcElts;
12691 for (unsigned Offset = 0; Offset != Scale; ++Offset) {
12692 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, Offset, Scale) ||
12693 isUndefInRange(Mask, NumHalfSrcElts, NumHalfSrcElts))
12694 continue;
12695
12696 // The elements beyond the truncation must be undef/zero.
12697 unsigned UpperElts = NumElts - NumSrcElts;
12698 if (UpperElts > 0 &&
12699 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
12700 continue;
12701 bool UndefUppers =
12702 UpperElts > 0 && isUndefInRange(Mask, NumSrcElts, UpperElts);
12703
12704 // For offset truncations, ensure that the concat is cheap.
12705 if (Offset) {
12706 auto IsCheapConcat = [&](SDValue Lo, SDValue Hi) {
12707 if (Lo.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
12708 Hi.getOpcode() == ISD::EXTRACT_SUBVECTOR)
12709 return Lo.getOperand(0) == Hi.getOperand(0);
12710 if (ISD::isNormalLoad(Lo.getNode()) &&
12711 ISD::isNormalLoad(Hi.getNode())) {
12712 auto *LDLo = cast<LoadSDNode>(Lo);
12713 auto *LDHi = cast<LoadSDNode>(Hi);
12714 return DAG.areNonVolatileConsecutiveLoads(
12715 LDHi, LDLo, Lo.getValueType().getStoreSize(), 1);
12716 }
12717 return false;
12718 };
12719 if (!IsCheapConcat(V1, V2))
12720 continue;
12721 }
12722
12723 // As we're using both sources, we need to concat them together
12724 // and truncate from the double-sized src.
12725 MVT ConcatVT = MVT::getVectorVT(VT.getScalarType(), NumElts * 2);
12726 SDValue Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2);
12727
12728 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
12729 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
12730 Src = DAG.getBitcast(SrcVT, Src);
12731
12732 // Shift the offset'd elements into place for the truncation.
12733 // TODO: Use getTargetVShiftByConstNode.
12734 if (Offset)
12735 Src = DAG.getNode(
12736 X86ISD::VSRLI, DL, SrcVT, Src,
12737 DAG.getTargetConstant(Offset * EltSizeInBits, DL, MVT::i8));
12738
12739 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
12740 }
12741 }
12742
12743 return SDValue();
12744}
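
To make the mask pattern above concrete: the lowering looks for a mask that takes every Scale'th element starting at Offset, with everything past NumSrcElts undef or zero. A minimal stand-alone sketch of that check follows; matchesTruncPattern is a hypothetical name, a simplified stand-in for isSequentialOrUndefInRange, with -1 used for undef.

#include <vector>

// Illustrative only, not LLVM code: does Mask take elements
// Offset, Offset+Scale, Offset+2*Scale, ... (undefs allowed)?
static bool matchesTruncPattern(const std::vector<int> &Mask,
                                unsigned NumSrcElts, unsigned Offset,
                                unsigned Scale) {
  for (unsigned i = 0; i != NumSrcElts; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue; // undef matches anything
    if ((unsigned)M != Offset + i * Scale)
      return false;
  }
  return true;
}

// e.g. Mask = {1, 3, 5, 7} matches Offset = 1, Scale = 2, which is why the
// lowering above shifts the concatenated source right by Offset * EltSizeInBits
// before truncating.
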
12745
12746/// Check whether a compaction lowering can be done by dropping even/odd
12747/// elements and compute how many times even/odd elements must be dropped.
12748///
12749/// This handles shuffles which take every Nth element where N is a power of
12750/// two. Example shuffle masks:
12751///
12752/// (even)
12753/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
12754/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
12755/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
12756/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
12757/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
12758/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
12759///
12760/// (odd)
12761/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14
12762/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
12763///
12764/// Any of these lanes can of course be undef.
12765///
12766/// This routine only supports N <= 3.
12767 /// FIXME: Evaluate whether either AVX or AVX-512 has any opportunities here
12768/// for larger N.
12769///
12770/// \returns N above, or the number of times even/odd elements must be dropped
12771/// if there is such a number. Otherwise returns zero.
12772static int canLowerByDroppingElements(ArrayRef<int> Mask, bool MatchEven,
12773 bool IsSingleInput) {
12774 // The modulus for the shuffle vector entries is based on whether this is
12775 // a single input or not.
12776 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
12777 assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
12778 "We should only be called with masks with a power-of-2 size!");
12779
12780 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
12781 int Offset = MatchEven ? 0 : 1;
12782
12783 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
12784 // and 2^3 simultaneously. This is because we may have ambiguity with
12785 // partially undef inputs.
12786 bool ViableForN[3] = {true, true, true};
12787
12788 for (int i = 0, e = Mask.size(); i < e; ++i) {
12789 // Ignore undef lanes, we'll optimistically collapse them to the pattern we
12790 // want.
12791 if (Mask[i] < 0)
12792 continue;
12793
12794 bool IsAnyViable = false;
12795 for (unsigned j = 0; j != std::size(ViableForN); ++j)
12796 if (ViableForN[j]) {
12797 uint64_t N = j + 1;
12798
12799 // The shuffle mask must be equal to (i * 2^N) % M.
12800 if ((uint64_t)(Mask[i] - Offset) == (((uint64_t)i << N) & ModMask))
12801 IsAnyViable = true;
12802 else
12803 ViableForN[j] = false;
12804 }
12805 // Early exit if we exhaust the possible powers of two.
12806 if (!IsAnyViable)
12807 break;
12808 }
12809
12810 for (unsigned j = 0; j != std::size(ViableForN); ++j)
12811 if (ViableForN[j])
12812 return j + 1;
12813
12814 // Return 0 as there is no viable power of two.
12815 return 0;
12816}
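
A minimal constexpr sketch of the `(i * 2^N) % M` relation documented above, checked against the even N = 1 single-input example mask; the helper name dropCheck is hypothetical, not from the source.

#include <cstdint>

// Illustrative only: mask element i must equal ((i << N) & (Modulus - 1)) + Offset,
// where Offset is 0 for the even variant and 1 for the odd variant.
constexpr bool dropCheck(const int *Mask, int Size, int ShuffleModulus, int N,
                         int Offset) {
  uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
  for (int i = 0; i < Size; ++i) {
    if (Mask[i] < 0)
      continue;
    if ((uint64_t)(Mask[i] - Offset) != (((uint64_t)i << N) & ModMask))
      return false;
  }
  return true;
}

constexpr int EvenN1[16] = {0, 2, 4, 6, 8, 10, 12, 14,
                            0, 2, 4, 6, 8, 10, 12, 14};
static_assert(dropCheck(EvenN1, 16, 16, 1, 0), "even, N = 1, single input");
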
12817
12818// X86 has dedicated pack instructions that can handle specific truncation
12819// operations: PACKSS and PACKUS.
12820// Checks for compaction shuffle masks if MaxStages > 1.
12821// TODO: Add support for matching multiple PACKSS/PACKUS stages.
12822static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
12823 unsigned &PackOpcode, ArrayRef<int> TargetMask,
12824 const SelectionDAG &DAG,
12825 const X86Subtarget &Subtarget,
12826 unsigned MaxStages = 1) {
12827 unsigned NumElts = VT.getVectorNumElements();
12828 unsigned BitSize = VT.getScalarSizeInBits();
12829 assert(0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 &&
12830 "Illegal maximum compaction");
12831
12832 auto MatchPACK = [&](SDValue N1, SDValue N2, MVT PackVT) {
12833 unsigned NumSrcBits = PackVT.getScalarSizeInBits();
12834 unsigned NumPackedBits = NumSrcBits - BitSize;
12835 N1 = peekThroughBitcasts(N1);
12836 N2 = peekThroughBitcasts(N2);
12837 unsigned NumBits1 = N1.getScalarValueSizeInBits();
12838 unsigned NumBits2 = N2.getScalarValueSizeInBits();
12839 bool IsZero1 = llvm::isNullOrNullSplat(N1, /*AllowUndefs*/ false);
12840 bool IsZero2 = llvm::isNullOrNullSplat(N2, /*AllowUndefs*/ false);
12841 if ((!N1.isUndef() && !IsZero1 && NumBits1 != NumSrcBits) ||
12842 (!N2.isUndef() && !IsZero2 && NumBits2 != NumSrcBits))
12843 return false;
12844 if (Subtarget.hasSSE41() || BitSize == 8) {
12845 APInt ZeroMask = APInt::getHighBitsSet(NumSrcBits, NumPackedBits);
12846 if ((N1.isUndef() || IsZero1 || DAG.MaskedValueIsZero(N1, ZeroMask)) &&
12847 (N2.isUndef() || IsZero2 || DAG.MaskedValueIsZero(N2, ZeroMask))) {
12848 V1 = N1;
12849 V2 = N2;
12850 SrcVT = PackVT;
12851 PackOpcode = X86ISD::PACKUS;
12852 return true;
12853 }
12854 }
12855 bool IsAllOnes1 = llvm::isAllOnesOrAllOnesSplat(N1, /*AllowUndefs*/ false);
12856 bool IsAllOnes2 = llvm::isAllOnesOrAllOnesSplat(N2, /*AllowUndefs*/ false);
12857 if ((N1.isUndef() || IsZero1 || IsAllOnes1 ||
12858 DAG.ComputeNumSignBits(N1) > NumPackedBits) &&
12859 (N2.isUndef() || IsZero2 || IsAllOnes2 ||
12860 DAG.ComputeNumSignBits(N2) > NumPackedBits)) {
12861 V1 = N1;
12862 V2 = N2;
12863 SrcVT = PackVT;
12864 PackOpcode = X86ISD::PACKSS;
12865 return true;
12866 }
12867 return false;
12868 };
12869
12870 // Attempt to match against wider and wider compaction patterns.
12871 for (unsigned NumStages = 1; NumStages <= MaxStages; ++NumStages) {
12872 MVT PackSVT = MVT::getIntegerVT(BitSize << NumStages);
12873 MVT PackVT = MVT::getVectorVT(PackSVT, NumElts >> NumStages);
12874
12875 // Try binary shuffle.
12876 SmallVector<int, 32> BinaryMask;
12877 createPackShuffleMask(VT, BinaryMask, false, NumStages);
12878 if (isTargetShuffleEquivalent(VT, TargetMask, BinaryMask, DAG, V1, V2))
12879 if (MatchPACK(V1, V2, PackVT))
12880 return true;
12881
12882 // Try unary shuffle.
12883 SmallVector<int, 32> UnaryMask;
12884 createPackShuffleMask(VT, UnaryMask, true, NumStages);
12885 if (isTargetShuffleEquivalent(VT, TargetMask, UnaryMask, DAG, V1))
12886 if (MatchPACK(V1, V1, PackVT))
12887 return true;
12888 }
12889
12890 return false;
12891}
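
The MatchPACK predicate above accepts inputs for which the saturating pack behaves like a plain truncation: PACKUS needs the upper bits known zero (the MaskedValueIsZero test), PACKSS needs more sign bits than get packed away (the ComputeNumSignBits test). A scalar model of that reasoning for i32 -> i16, with hypothetical helper names:

#include <algorithm>
#include <cstdint>

// Illustrative only: scalar semantics of PACKSSDW / PACKUSDW on one element.
static int16_t packss32(int32_t V) {
  return (int16_t)std::min<int32_t>(std::max<int32_t>(V, -32768), 32767);
}
static uint16_t packus32(int32_t V) {
  return (uint16_t)std::min<int32_t>(std::max<int32_t>(V, 0), 65535);
}

// If V has at least 17 sign bits (ComputeNumSignBits > NumPackedBits), then
// packss32(V) is just the truncated value; if the top 16 bits are known zero
// (MaskedValueIsZero), the same holds for packus32(V).
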
12892
12893static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
12894 SDValue V1, SDValue V2, SelectionDAG &DAG,
12895 const X86Subtarget &Subtarget) {
12896 MVT PackVT;
12897 unsigned PackOpcode;
12898 unsigned SizeBits = VT.getSizeInBits();
12899 unsigned EltBits = VT.getScalarSizeInBits();
12900 unsigned MaxStages = Log2_32(64 / EltBits);
12901 if (!matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
12902 Subtarget, MaxStages))
12903 return SDValue();
12904
12905 unsigned CurrentEltBits = PackVT.getScalarSizeInBits();
12906 unsigned NumStages = Log2_32(CurrentEltBits / EltBits);
12907
12908 // Don't lower multi-stage packs on AVX512, truncation is better.
12909 if (NumStages != 1 && SizeBits == 128 && Subtarget.hasVLX())
12910 return SDValue();
12911
12912 // Pack to the largest type possible:
12913 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
12914 unsigned MaxPackBits = 16;
12915 if (CurrentEltBits > 16 &&
12916 (PackOpcode == X86ISD::PACKSS || Subtarget.hasSSE41()))
12917 MaxPackBits = 32;
12918
12919 // Repeatedly pack down to the target size.
12920 SDValue Res;
12921 for (unsigned i = 0; i != NumStages; ++i) {
12922 unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits);
12923 unsigned NumSrcElts = SizeBits / SrcEltBits;
12924 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
12925 MVT DstSVT = MVT::getIntegerVT(SrcEltBits / 2);
12926 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
12927 MVT DstVT = MVT::getVectorVT(DstSVT, NumSrcElts * 2);
12928 Res = DAG.getNode(PackOpcode, DL, DstVT, DAG.getBitcast(SrcVT, V1),
12929 DAG.getBitcast(SrcVT, V2));
12930 V1 = V2 = Res;
12931 CurrentEltBits /= 2;
12932 }
12933 assert(Res && Res.getValueType() == VT &&
12934 "Failed to lower compaction shuffle");
12935 return Res;
12936}
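
Each pack stage halves the element width, so the NumStages used above is log2(source element bits / destination element bits); packing i32 elements down to i8 takes two stages (32 -> 16 -> 8). A tiny hypothetical sketch:

// Illustrative only: number of PACK stages needed to narrow SrcBits to DstBits.
constexpr unsigned packStages(unsigned SrcBits, unsigned DstBits) {
  unsigned Stages = 0;
  while (SrcBits > DstBits) {
    SrcBits /= 2;
    ++Stages;
  }
  return Stages;
}
static_assert(packStages(32, 8) == 2, "i32 -> i16 -> i8");
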
12937
12938/// Try to emit a bitmask instruction for a shuffle.
12939///
12940/// This handles cases where we can model a blend exactly as a bitmask due to
12941/// one of the inputs being zeroable.
12942static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
12943 SDValue V2, ArrayRef<int> Mask,
12944 const APInt &Zeroable,
12945 const X86Subtarget &Subtarget,
12946 SelectionDAG &DAG) {
12947 MVT MaskVT = VT;
12948 MVT EltVT = VT.getVectorElementType();
12949 SDValue Zero, AllOnes;
12950 // Use f64 if i64 isn't legal.
12951 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
12952 EltVT = MVT::f64;
12953 MaskVT = MVT::getVectorVT(EltVT, Mask.size());
12954 }
12955
12956 MVT LogicVT = VT;
12957 if (EltVT == MVT::f32 || EltVT == MVT::f64) {
12958 Zero = DAG.getConstantFP(0.0, DL, EltVT);
12959 APFloat AllOnesValue =
12960 APFloat::getAllOnesValue(SelectionDAG::EVTToAPFloatSemantics(EltVT));
12961 AllOnes = DAG.getConstantFP(AllOnesValue, DL, EltVT);
12962 LogicVT =
12963 MVT::getVectorVT(EltVT == MVT::f64 ? MVT::i64 : MVT::i32, Mask.size());
12964 } else {
12965 Zero = DAG.getConstant(0, DL, EltVT);
12966 AllOnes = DAG.getAllOnesConstant(DL, EltVT);
12967 }
12968
12969 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
12970 SDValue V;
12971 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
12972 if (Zeroable[i])
12973 continue;
12974 if (Mask[i] % Size != i)
12975 return SDValue(); // Not a blend.
12976 if (!V)
12977 V = Mask[i] < Size ? V1 : V2;
12978 else if (V != (Mask[i] < Size ? V1 : V2))
12979 return SDValue(); // Can only let one input through the mask.
12980
12981 VMaskOps[i] = AllOnes;
12982 }
12983 if (!V)
12984 return SDValue(); // No non-zeroable elements!
12985
12986 SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);
12987 VMask = DAG.getBitcast(LogicVT, VMask);
12988 V = DAG.getBitcast(LogicVT, V);
12989 SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask);
12990 return DAG.getBitcast(VT, And);
12991}
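
The effect of the bitmask lowering is simply an AND with a vector that is all-ones where an element is kept and zero where the shuffle result is zeroable. A scalar model, assuming 32-bit lanes; bitMaskBlend is a hypothetical name:

#include <cstdint>
#include <vector>

// Illustrative only: AND-mask form of a blend with zero.
static std::vector<uint32_t> bitMaskBlend(const std::vector<uint32_t> &V,
                                          const std::vector<bool> &Zeroable) {
  std::vector<uint32_t> R(V.size());
  for (size_t i = 0; i != V.size(); ++i) {
    uint32_t M = Zeroable[i] ? 0u : ~0u; // all-ones lets the element through
    R[i] = V[i] & M;
  }
  return R;
}

// e.g. V = {1, 2, 3, 4}, Zeroable = {false, true, false, true} -> {1, 0, 3, 0}.
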
12992
12993/// Try to emit a blend instruction for a shuffle using bit math.
12994///
12995/// This is used as a fallback approach when first class blend instructions are
12996/// unavailable. Currently it is only suitable for integer vectors, but could
12997/// be generalized for floating point vectors if desirable.
12998static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
12999 SDValue V2, ArrayRef<int> Mask,
13000 SelectionDAG &DAG) {
13001 assert(VT.isInteger() && "Only supports integer vector types!");
13002 MVT EltVT = VT.getVectorElementType();
13003 SDValue Zero = DAG.getConstant(0, DL, EltVT);
13004 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
13005 SmallVector<SDValue, 16> MaskOps;
13006 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
13007 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
13008 return SDValue(); // Shuffled input!
13009 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
13010 }
13011
13012 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
13013 V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
13014 V2 = DAG.getNode(X86ISD::ANDNP, DL, VT, V1Mask, V2);
13015 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
13016}
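
The bit-math blend above is the classic (V1 & M) | (~M & V2) identity, with M all-ones for elements taken from V1. A per-element scalar sketch under the same assumption (hypothetical helper, 32-bit lanes):

#include <cstdint>
#include <vector>

// Illustrative only: AND + ANDNP + OR blend, one element at a time.
static std::vector<uint32_t> bitBlend(const std::vector<uint32_t> &V1,
                                      const std::vector<uint32_t> &V2,
                                      const std::vector<bool> &FromV1) {
  std::vector<uint32_t> R(V1.size());
  for (size_t i = 0; i != V1.size(); ++i) {
    uint32_t M = FromV1[i] ? ~0u : 0u;
    R[i] = (V1[i] & M) | (~M & V2[i]);
  }
  return R;
}
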
13017
13018static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
13019 SDValue PreservedSrc,
13020 const X86Subtarget &Subtarget,
13021 SelectionDAG &DAG);
13022
13023static bool matchShuffleAsBlend(MVT VT, SDValue V1, SDValue V2,
13024 MutableArrayRef<int> Mask,
13025 const APInt &Zeroable, bool &ForceV1Zero,
13026 bool &ForceV2Zero, uint64_t &BlendMask) {
13027 bool V1IsZeroOrUndef =
13028 V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
13029 bool V2IsZeroOrUndef =
13030 V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
13031
13032 BlendMask = 0;
13033 ForceV1Zero = false, ForceV2Zero = false;
13034 assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask");
13035
13036 int NumElts = Mask.size();
13037 int NumLanes = VT.getSizeInBits() / 128;
13038 int NumEltsPerLane = NumElts / NumLanes;
13039 assert((NumLanes * NumEltsPerLane) == NumElts && "Value type mismatch");
13040
13041 // For 32/64-bit elements, if we only reference one input (plus any undefs),
13042 // then ensure the blend mask part for that lane just references that input.
13043 bool ForceWholeLaneMasks =
13044 VT.is256BitVector() && VT.getScalarSizeInBits() >= 32;
13045
13046 // Attempt to generate the binary blend mask. If an input is zero then
13047 // we can use any lane.
13048 for (int Lane = 0; Lane != NumLanes; ++Lane) {
13049 // Keep track of the inputs used per lane.
13050 bool LaneV1InUse = false;
13051 bool LaneV2InUse = false;
13052 uint64_t LaneBlendMask = 0;
13053 for (int LaneElt = 0; LaneElt != NumEltsPerLane; ++LaneElt) {
13054 int Elt = (Lane * NumEltsPerLane) + LaneElt;
13055 int M = Mask[Elt];
13056 if (M == SM_SentinelUndef)
13057 continue;
13058 if (M == Elt || (0 <= M && M < NumElts &&
13059 IsElementEquivalent(NumElts, V1, V1, M, Elt))) {
13060 Mask[Elt] = Elt;
13061 LaneV1InUse = true;
13062 continue;
13063 }
13064 if (M == (Elt + NumElts) ||
13065 (NumElts <= M &&
13066 IsElementEquivalent(NumElts, V2, V2, M - NumElts, Elt))) {
13067 LaneBlendMask |= 1ull << LaneElt;
13068 Mask[Elt] = Elt + NumElts;
13069 LaneV2InUse = true;
13070 continue;
13071 }
13072 if (Zeroable[Elt]) {
13073 if (V1IsZeroOrUndef) {
13074 ForceV1Zero = true;
13075 Mask[Elt] = Elt;
13076 LaneV1InUse = true;
13077 continue;
13078 }
13079 if (V2IsZeroOrUndef) {
13080 ForceV2Zero = true;
13081 LaneBlendMask |= 1ull << LaneElt;
13082 Mask[Elt] = Elt + NumElts;
13083 LaneV2InUse = true;
13084 continue;
13085 }
13086 }
13087 return false;
13088 }
13089
13090 // If we only used V2 then splat the lane blend mask to avoid any demanded
13091 // elts from V1 in this lane (the V1 equivalent is implicit with a zero
13092 // blend mask bit).
13093 if (ForceWholeLaneMasks && LaneV2InUse && !LaneV1InUse)
13094 LaneBlendMask = (1ull << NumEltsPerLane) - 1;
13095
13096 BlendMask |= LaneBlendMask << (Lane * NumEltsPerLane);
13097 }
13098 return true;
13099}
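
Stripped of the zeroable handling and the whole-lane splat above, the blend immediate is one bit per element, set when that element is taken from V2. A simplified hypothetical sketch plus a worked v4 example:

#include <cstdint>
#include <optional>
#include <vector>

// Illustrative only: derive a BLENDI-style immediate from a mask that is
// already a blend (element i must come from position i of V1 or of V2).
static std::optional<uint64_t> simpleBlendMask(const std::vector<int> &Mask) {
  int NumElts = (int)Mask.size();
  uint64_t Imm = 0;
  for (int i = 0; i < NumElts; ++i) {
    int M = Mask[i];
    if (M < 0 || M == i)
      continue;                 // undef or from V1: bit stays clear
    if (M == i + NumElts)
      Imm |= 1ull << i;         // from V2: set the bit
    else
      return std::nullopt;      // not a blend
  }
  return Imm;
}

// e.g. a v4i32 mask {0, 5, 2, 7} yields the immediate 0b1010.
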
13100
13101static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size,
13102 int Scale) {
13103 uint64_t ScaledMask = 0;
13104 for (int i = 0; i != Size; ++i)
13105 if (BlendMask & (1ull << i))
13106 ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
13107 return ScaledMask;
13108}
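
For reference, scaling just replicates each selected bit Scale times, e.g. when a vXi16 blend mask is re-expressed on bytes. A self-checking copy of the arithmetic above:

#include <cstdint>

// Illustrative only: same math as scaleVectorShuffleBlendMask.
constexpr uint64_t scaleBlendMask(uint64_t BlendMask, int Size, int Scale) {
  uint64_t ScaledMask = 0;
  for (int i = 0; i != Size; ++i)
    if (BlendMask & (1ull << i))
      ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
  return ScaledMask;
}

// A 4-element mask 0b0101 scaled by 2 (i16 lanes on bytes) becomes 0b00110011.
static_assert(scaleBlendMask(0b0101, 4, 2) == 0b00110011, "");
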
13109
13110/// Try to emit a blend instruction for a shuffle.
13111///
13112/// This doesn't do any checks for the availability of instructions for blending
13113/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
13114/// be matched in the backend with the type given. What it does check for is
13115/// that the shuffle mask is a blend, or convertible into a blend with zero.
13116static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
13117 SDValue V2, ArrayRef<int> Original,
13118 const APInt &Zeroable,
13119 const X86Subtarget &Subtarget,
13120 SelectionDAG &DAG) {
13121 uint64_t BlendMask = 0;
13122 bool ForceV1Zero = false, ForceV2Zero = false;
13123 SmallVector<int, 64> Mask(Original);
13124 if (!matchShuffleAsBlend(VT, V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
13125 BlendMask))
13126 return SDValue();
13127
13128 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
13129 if (ForceV1Zero)
13130 V1 = getZeroVector(VT, Subtarget, DAG, DL);
13131 if (ForceV2Zero)
13132 V2 = getZeroVector(VT, Subtarget, DAG, DL);
13133
13134 unsigned NumElts = VT.getVectorNumElements();
13135
13136 switch (VT.SimpleTy) {
13137 case MVT::v4i64:
13138 case MVT::v8i32:
13139 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
13140 [[fallthrough]];
13141 case MVT::v4f64:
13142 case MVT::v8f32:
13143 assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
13144 [[fallthrough]];
13145 case MVT::v2f64:
13146 case MVT::v2i64:
13147 case MVT::v4f32:
13148 case MVT::v4i32:
13149 case MVT::v8i16:
13150 assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
13151 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
13152 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
13153 case MVT::v16i16: {
13154 assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
13155 SmallVector<int, 8> RepeatedMask;
13156 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
13157 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
13158 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
13159 BlendMask = 0;
13160 for (int i = 0; i < 8; ++i)
13161 if (RepeatedMask[i] >= 8)
13162 BlendMask |= 1ull << i;
13163 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
13164 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
13165 }
13166 // Use PBLENDW for lower/upper lanes and then blend lanes.
13167 // TODO - we should allow 2 PBLENDW here and leave shuffle combine to
13168 // merge to VSELECT where useful.
13169 uint64_t LoMask = BlendMask & 0xFF;
13170 uint64_t HiMask = (BlendMask >> 8) & 0xFF;
13171 if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
13172 SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
13173 DAG.getTargetConstant(LoMask, DL, MVT::i8));
13174 SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
13175 DAG.getTargetConstant(HiMask, DL, MVT::i8));
13176 return DAG.getVectorShuffle(
13177 MVT::v16i16, DL, Lo, Hi,
13178 {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
13179 }
13180 [[fallthrough]];
13181 }
13182 case MVT::v32i8:
13183 assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
13184 [[fallthrough]];
13185 case MVT::v16i8: {
13186 assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
13187
13188 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
13189 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
13190 Subtarget, DAG))
13191 return Masked;
13192
13193 if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
13194 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
13195 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
13196 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
13197 }
13198
13199 // If we have VPTERNLOG, we can use that as a bit blend.
13200 if (Subtarget.hasVLX())
13201 if (SDValue BitBlend =
13202 lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
13203 return BitBlend;
13204
13205 // Scale the blend by the number of bytes per element.
13206 int Scale = VT.getScalarSizeInBits() / 8;
13207
13208 // This form of blend is always done on bytes. Compute the byte vector
13209 // type.
13210 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
13211
13212 // x86 allows load folding with blendvb from the 2nd source operand. But
13213 // we are still using LLVM select here (see comment below), so that's V1.
13214 // If V2 can be load-folded and V1 cannot be load-folded, then commute to
13215 // allow that load-folding possibility.
13216 if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
13217 ShuffleVectorSDNode::commuteMask(Mask);
13218 std::swap(V1, V2);
13219 }
13220
13221 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
13222 // mix of LLVM's code generator and the x86 backend. We tell the code
13223 // generator that boolean values in the elements of an x86 vector register
13224 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
13225 // mapping a select to operand #1, and 'false' mapping to operand #2. The
13226 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
13227 // of the element (the remaining are ignored) and 0 in that high bit would
13228 // mean operand #1 while 1 in the high bit would mean operand #2. So while
13229 // the LLVM model for boolean values in vector elements gets the relevant
13230 // bit set, it is set backwards and over constrained relative to x86's
13231 // actual model.
13232 SmallVector<SDValue, 32> VSELECTMask;
13233 for (int i = 0, Size = Mask.size(); i < Size; ++i)
13234 for (int j = 0; j < Scale; ++j)
13235 VSELECTMask.push_back(
13236 Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
13237 : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
13238 MVT::i8));
13239
13240 V1 = DAG.getBitcast(BlendVT, V1);
13241 V2 = DAG.getBitcast(BlendVT, V2);
13242 return DAG.getBitcast(
13243 VT,
13244 DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
13245 V1, V2));
13246 }
13247 case MVT::v16f32:
13248 case MVT::v8f64:
13249 case MVT::v8i64:
13250 case MVT::v16i32:
13251 case MVT::v32i16:
13252 case MVT::v64i8: {
13253 // Attempt to lower to a bitmask if we can. Only if not optimizing for size.
13254 bool OptForSize = DAG.shouldOptForSize();
13255 if (!OptForSize) {
13256 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
13257 Subtarget, DAG))
13258 return Masked;
13259 }
13260
13261 // Otherwise load an immediate into a GPR, cast to k-register, and use a
13262 // masked move.
13263 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
13264 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
13265 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
13266 }
13267 default:
13268 llvm_unreachable("Not a supported integer vector type!");
13269 }
13270}
13271
13272/// Try to lower as a blend of elements from two inputs followed by
13273/// a single-input permutation.
13274///
13275/// This matches the pattern where we can blend elements from two inputs and
13276/// then reduce the shuffle to a single-input permutation.
13277static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
13278 SDValue V1, SDValue V2,
13279 ArrayRef<int> Mask,
13280 SelectionDAG &DAG,
13281 bool ImmBlends = false) {
13282 // We build up the blend mask while checking whether a blend is a viable way
13283 // to reduce the shuffle.
13284 SmallVector<int, 32> BlendMask(Mask.size(), -1);
13285 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
13286
13287 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
13288 if (Mask[i] < 0)
13289 continue;
13290
13291 assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
13292
13293 if (BlendMask[Mask[i] % Size] < 0)
13294 BlendMask[Mask[i] % Size] = Mask[i];
13295 else if (BlendMask[Mask[i] % Size] != Mask[i])
13296 return SDValue(); // Can't blend in the needed input!
13297
13298 PermuteMask[i] = Mask[i] % Size;
13299 }
13300
13301 // If only immediate blends, then bail if the blend mask can't be widened to
13302 // i16.
13303 unsigned EltSize = VT.getScalarSizeInBits();
13304 if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask))
13305 return SDValue();
13306
13307 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
13308 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
13309}
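
The decomposition above, on its own, splits one two-input mask into a blend (which source owns each slot) followed by a single-input permute. A hypothetical sketch with a worked v4 example (indices 0..3 refer to V1, 4..7 to V2):

#include <optional>
#include <utility>
#include <vector>

// Illustrative only: mirrors the loop in lowerShuffleAsBlendAndPermute.
static std::optional<std::pair<std::vector<int>, std::vector<int>>>
splitBlendPermute(const std::vector<int> &Mask) {
  int Size = (int)Mask.size();
  std::vector<int> Blend(Size, -1), Permute(Size, -1);
  for (int i = 0; i < Size; ++i) {
    if (Mask[i] < 0)
      continue;
    int Slot = Mask[i] % Size;
    if (Blend[Slot] >= 0 && Blend[Slot] != Mask[i])
      return std::nullopt;      // both inputs demanded in the same slot
    Blend[Slot] = Mask[i];
    Permute[i] = Slot;
  }
  return std::make_pair(Blend, Permute);
}

// e.g. Mask = {5, 0, 7, 2} gives Blend = {0, 5, 2, 7}, Permute = {1, 0, 3, 2}.
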
13310
13311/// Try to lower as an unpack of elements from two inputs followed by
13312/// a single-input permutation.
13313///
13314/// This matches the pattern where we can unpack elements from two inputs and
13315/// then reduce the shuffle to a single-input (wider) permutation.
13316static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
13317 SDValue V1, SDValue V2,
13318 ArrayRef<int> Mask,
13319 SelectionDAG &DAG) {
13320 int NumElts = Mask.size();
13321 int NumLanes = VT.getSizeInBits() / 128;
13322 int NumLaneElts = NumElts / NumLanes;
13323 int NumHalfLaneElts = NumLaneElts / 2;
13324
13325 bool MatchLo = true, MatchHi = true;
13326 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
13327
13328 // Determine UNPCKL/UNPCKH type and operand order.
13329 for (int Elt = 0; Elt != NumElts; ++Elt) {
13330 int M = Mask[Elt];
13331 if (M < 0)
13332 continue;
13333
13334 // Normalize the mask value depending on whether it's V1 or V2.
13335 int NormM = M;
13336 SDValue &Op = Ops[Elt & 1];
13337 if (M < NumElts && (Op.isUndef() || Op == V1))
13338 Op = V1;
13339 else if (NumElts <= M && (Op.isUndef() || Op == V2)) {
13340 Op = V2;
13341 NormM -= NumElts;
13342 } else
13343 return SDValue();
13344
13345 bool MatchLoAnyLane = false, MatchHiAnyLane = false;
13346 for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
13347 int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;
13348 MatchLoAnyLane |= isUndefOrInRange(NormM, Lo, Mid);
13349 MatchHiAnyLane |= isUndefOrInRange(NormM, Mid, Hi);
13350 if (MatchLoAnyLane || MatchHiAnyLane) {
13351 assert((MatchLoAnyLane ^ MatchHiAnyLane) &&
13352 "Failed to match UNPCKLO/UNPCKHI");
13353 break;
13354 }
13355 }
13356 MatchLo &= MatchLoAnyLane;
13357 MatchHi &= MatchHiAnyLane;
13358 if (!MatchLo && !MatchHi)
13359 return SDValue();
13360 }
13361 assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");
13362
13363 // Element indices have changed after unpacking. Calculate permute mask
13364 // so that they will be put back to the position as dictated by the
13365 // original shuffle mask indices.
13366 SmallVector<int, 32> PermuteMask(NumElts, -1);
13367 for (int Elt = 0; Elt != NumElts; ++Elt) {
13368 int M = Mask[Elt];
13369 if (M < 0)
13370 continue;
13371 int NormM = M;
13372 if (NumElts <= M)
13373 NormM -= NumElts;
13374 bool IsFirstOp = M < NumElts;
13375 int BaseMaskElt =
13376 NumLaneElts * (NormM / NumLaneElts) + (2 * (NormM % NumHalfLaneElts));
13377 if ((IsFirstOp && V1 == Ops[0]) || (!IsFirstOp && V2 == Ops[0]))
13378 PermuteMask[Elt] = BaseMaskElt;
13379 else if ((IsFirstOp && V1 == Ops[1]) || (!IsFirstOp && V2 == Ops[1]))
13380 PermuteMask[Elt] = BaseMaskElt + 1;
13381 assert(PermuteMask[Elt] != -1 &&
13382 "Input mask element is defined but failed to assign permute mask");
13383 }
13384
13385 unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
13386 SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops);
13387 return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
13388}
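
For readers less familiar with the unpack instructions: UNPCKL/UNPCKH interleave the low (respectively high) halves of each 128-bit lane of the two operands. A scalar model of a single 4-element lane (illustrative only):

#include <array>

// unpckLane(A, B, true)  == {A[0], B[0], A[1], B[1]}   (UNPCKL)
// unpckLane(A, B, false) == {A[2], B[2], A[3], B[3]}   (UNPCKH)
template <typename T>
static std::array<T, 4> unpckLane(const std::array<T, 4> &A,
                                  const std::array<T, 4> &B, bool Lo) {
  unsigned Base = Lo ? 0 : 2;
  return {A[Base], B[Base], A[Base + 1], B[Base + 1]};
}
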
13389
13390/// Try to lower a shuffle as a permute of the inputs followed by an
13391/// UNPCK instruction.
13392///
13393/// This specifically targets cases where we end up with alternating between
13394/// the two inputs, and so can permute them into something that feeds a single
13395/// UNPCK instruction. Note that this routine only targets integer vectors
13396/// because for floating point vectors we have a generalized SHUFPS lowering
13397/// strategy that handles everything that doesn't *exactly* match an unpack,
13398/// making this clever lowering unnecessary.
13399static SDValue lowerShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
13400 SDValue V1, SDValue V2,
13401 ArrayRef<int> Mask,
13402 const X86Subtarget &Subtarget,
13403 SelectionDAG &DAG) {
13404 int Size = Mask.size();
13405 assert(Mask.size() >= 2 && "Single element masks are invalid.");
13406
13407 // This routine only supports 128-bit integer dual input vectors.
13408 if (VT.isFloatingPoint() || !VT.is128BitVector() || V2.isUndef())
13409 return SDValue();
13410
13411 int NumLoInputs =
13412 count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
13413 int NumHiInputs =
13414 count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
13415
13416 bool UnpackLo = NumLoInputs >= NumHiInputs;
13417
13418 auto TryUnpack = [&](int ScalarSize, int Scale) {
13419 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
13420 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
13421
13422 for (int i = 0; i < Size; ++i) {
13423 if (Mask[i] < 0)
13424 continue;
13425
13426 // Each element of the unpack contains Scale elements from this mask.
13427 int UnpackIdx = i / Scale;
13428
13429 // We only handle the case where V1 feeds the first slots of the unpack.
13430 // We rely on canonicalization to ensure this is the case.
13431 if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
13432 return SDValue();
13433
13434 // Setup the mask for this input. The indexing is tricky as we have to
13435 // handle the unpack stride.
13436 SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
13437 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
13438 Mask[i] % Size;
13439 }
13440
13441 // If we will have to shuffle both inputs to use the unpack, check whether
13442 // we can just unpack first and shuffle the result. If so, skip this unpack.
13443 if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
13444 !isNoopShuffleMask(V2Mask))
13445 return SDValue();
13446
13447 // Shuffle the inputs into place.
13448 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
13449 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
13450
13451 // Cast the inputs to the type we will use to unpack them.
13452 MVT UnpackVT =
13453 MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
13454 V1 = DAG.getBitcast(UnpackVT, V1);
13455 V2 = DAG.getBitcast(UnpackVT, V2);
13456
13457 // Unpack the inputs and cast the result back to the desired type.
13458 return DAG.getBitcast(
13459 VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
13460 UnpackVT, V1, V2));
13461 };
13462
13463 // We try each unpack from the largest to the smallest to try and find one
13464 // that fits this mask.
13465 int OrigScalarSize = VT.getScalarSizeInBits();
13466 for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
13467 if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
13468 return Unpack;
13469
13470 // If we're shuffling with a zero vector then we're better off not doing
13471 // VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements.
13472 if (ISD::isBuildVectorAllZeros(V1.getNode()) ||
13473 ISD::isBuildVectorAllZeros(V2.getNode()))
13474 return SDValue();
13475
13476 // If none of the unpack-rooted lowerings worked (or were profitable) try an
13477 // initial unpack.
13478 if (NumLoInputs == 0 || NumHiInputs == 0) {
13479 assert((NumLoInputs > 0 || NumHiInputs > 0) &&
13480 "We have to have *some* inputs!");
13481 int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
13482
13483 // FIXME: We could consider the total complexity of the permute of each
13484 // possible unpacking. Or at the least we should consider how many
13485 // half-crossings are created.
13486 // FIXME: We could consider commuting the unpacks.
13487
13488 SmallVector<int, 32> PermMask((unsigned)Size, -1);
13489 for (int i = 0; i < Size; ++i) {
13490 if (Mask[i] < 0)
13491 continue;
13492
13493 assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
13494
13495 PermMask[i] =
13496 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
13497 }
13498 return DAG.getVectorShuffle(
13499 VT, DL,
13500 DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL, DL, VT,
13501 V1, V2),
13502 DAG.getUNDEF(VT), PermMask);
13503 }
13504
13505 return SDValue();
13506}
13507
13508/// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
13509/// permuting the elements of the result in place.
13510static SDValue lowerShuffleAsByteRotateAndPermute(
13511 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13512 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13513 if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
13514 (VT.is256BitVector() && !Subtarget.hasAVX2()) ||
13515 (VT.is512BitVector() && !Subtarget.hasBWI()))
13516 return SDValue();
13517
13518 // We don't currently support lane crossing permutes.
13519 if (is128BitLaneCrossingShuffleMask(VT, Mask))
13520 return SDValue();
13521
13522 int Scale = VT.getScalarSizeInBits() / 8;
13523 int NumLanes = VT.getSizeInBits() / 128;
13524 int NumElts = VT.getVectorNumElements();
13525 int NumEltsPerLane = NumElts / NumLanes;
13526
13527 // Determine range of mask elts.
13528 bool Blend1 = true;
13529 bool Blend2 = true;
13530 std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
13531 std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
13532 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
13533 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
13534 int M = Mask[Lane + Elt];
13535 if (M < 0)
13536 continue;
13537 if (M < NumElts) {
13538 Blend1 &= (M == (Lane + Elt));
13539 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
13540 M = M % NumEltsPerLane;
13541 Range1.first = std::min(Range1.first, M);
13542 Range1.second = std::max(Range1.second, M);
13543 } else {
13544 M -= NumElts;
13545 Blend2 &= (M == (Lane + Elt));
13546 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
13547 M = M % NumEltsPerLane;
13548 Range2.first = std::min(Range2.first, M);
13549 Range2.second = std::max(Range2.second, M);
13550 }
13551 }
13552 }
13553
13554 // Bail if we don't need both elements.
13555 // TODO - it might be worth doing this for unary shuffles if the permute
13556 // can be widened.
13557 if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
13558 !(0 <= Range2.first && Range2.second < NumEltsPerLane))
13559 return SDValue();
13560
13561 if (VT.getSizeInBits() > 128 && (Blend1 || Blend2))
13562 return SDValue();
13563
13564 // Rotate the 2 ops so we can access both ranges, then permute the result.
13565 auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) {
13566 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
13567 SDValue Rotate = DAG.getBitcast(
13568 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
13569 DAG.getBitcast(ByteVT, Lo),
13570 DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8)));
13571 SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
13572 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
13573 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
13574 int M = Mask[Lane + Elt];
13575 if (M < 0)
13576 continue;
13577 if (M < NumElts)
13578 PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
13579 else
13580 PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
13581 }
13582 }
13583 return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask);
13584 };
13585
13586 // Check if the ranges are small enough to rotate from either direction.
13587 if (Range2.second < Range1.first)
13588 return RotateAndPermute(V1, V2, Range1.first, 0);
13589 if (Range1.second < Range2.first)
13590 return RotateAndPermute(V2, V1, Range2.first, NumElts);
13591 return SDValue();
13592}
13593
13594static bool isBroadcastShuffleMask(ArrayRef<int> Mask) {
13595 return isUndefOrEqual(Mask, 0);
13596}
13597
13598static bool isNoopOrBroadcastShuffleMask(ArrayRef<int> Mask) {
13599 return isNoopShuffleMask(Mask) || isBroadcastShuffleMask(Mask);
13600}
13601
13602/// Check if the Mask consists of the same element repeated multiple times.
13603static bool isSingleElementRepeatedMask(ArrayRef<int> Mask) {
13604 size_t NumUndefs = 0;
13605 std::optional<int> UniqueElt;
13606 for (int Elt : Mask) {
13607 if (Elt == SM_SentinelUndef) {
13608 NumUndefs++;
13609 continue;
13610 }
13611 if (UniqueElt.has_value() && UniqueElt.value() != Elt)
13612 return false;
13613 UniqueElt = Elt;
13614 }
13615 // Make sure the element is repeated enough times by checking the number of
13616 // undefs is small.
13617 return NumUndefs <= Mask.size() / 2 && UniqueElt.has_value();
13618}
13619
13620/// Generic routine to decompose a shuffle and blend into independent
13621/// blends and permutes.
13622///
13623/// This matches the extremely common pattern for handling combined
13624/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
13625/// operations. It will try to pick the best arrangement of shuffles and
13626/// blends. For vXi8/vXi16 shuffles we may use unpack instead of blend.
13627static SDValue lowerShuffleAsDecomposedShuffleMerge(
13628 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13629 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13630 int NumElts = Mask.size();
13631 int NumLanes = VT.getSizeInBits() / 128;
13632 int NumEltsPerLane = NumElts / NumLanes;
13633
13634 // Shuffle the input elements into the desired positions in V1 and V2 and
13635 // unpack/blend them together.
13636 bool IsAlternating = true;
13637 SmallVector<int, 32> V1Mask(NumElts, -1);
13638 SmallVector<int, 32> V2Mask(NumElts, -1);
13639 SmallVector<int, 32> FinalMask(NumElts, -1);
13640 for (int i = 0; i < NumElts; ++i) {
13641 int M = Mask[i];
13642 if (M >= 0 && M < NumElts) {
13643 V1Mask[i] = M;
13644 FinalMask[i] = i;
13645 IsAlternating &= (i & 1) == 0;
13646 } else if (M >= NumElts) {
13647 V2Mask[i] = M - NumElts;
13648 FinalMask[i] = i + NumElts;
13649 IsAlternating &= (i & 1) == 1;
13650 }
13651 }
13652
13653 // If we effectively only demand the 0'th element of \p Input, and not only
13654 // as 0'th element, then broadcast said input,
13655 // and change \p InputMask to be a no-op (identity) mask.
13656 auto canonicalizeBroadcastableInput = [DL, VT, &Subtarget,
13657 &DAG](SDValue &Input,
13658 MutableArrayRef<int> InputMask) {
13659 unsigned EltSizeInBits = Input.getScalarValueSizeInBits();
13660 if (!Subtarget.hasAVX2() && (!Subtarget.hasAVX() || EltSizeInBits < 32 ||
13661 !X86::mayFoldLoad(Input, Subtarget)))
13662 return;
13663 if (isNoopShuffleMask(InputMask))
13664 return;
13665 assert(isBroadcastShuffleMask(InputMask) &&
13666 "Expected to demand only the 0'th element.");
13667 Input = DAG.getNode(X86ISD::VBROADCAST, DL, VT, Input);
13668 for (auto I : enumerate(InputMask)) {
13669 int &InputMaskElt = I.value();
13670 if (InputMaskElt >= 0)
13671 InputMaskElt = I.index();
13672 }
13673 };
13674
13675 // Currently, we may need to produce one shuffle per input, and blend results.
13676 // It is possible that the shuffle for one of the inputs is already a no-op.
13677 // See if we can simplify non-no-op shuffles into broadcasts,
13678 // which we consider to be strictly better than an arbitrary shuffle.
13679 if (isNoopOrBroadcastShuffleMask(V1Mask) &&
13680 isNoopOrBroadcastShuffleMask(V2Mask)) {
13681 canonicalizeBroadcastableInput(V1, V1Mask);
13682 canonicalizeBroadcastableInput(V2, V2Mask);
13683 }
13684
13685 // Try to lower with the simpler initial blend/unpack/rotate strategies unless
13686 // one of the input shuffles would be a no-op. We prefer to shuffle inputs as
13687 // the shuffle may be able to fold with a load or other benefit. However, when
13688 // we'll have to do 2x as many shuffles in order to achieve this, a 2-input
13689 // pre-shuffle first is a better strategy.
13690 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
13691 // Only prefer immediate blends to unpack/rotate.
13692 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
13693 DAG, true))
13694 return BlendPerm;
13695 // If either input vector provides only a single element which is repeated
13696 // multiple times, unpacking from both input vectors would generate worse
13697 // code. e.g. for
13698 // t5: v16i8 = vector_shuffle<16,0,16,1,16,2,16,3,16,4,16,5,16,6,16,7> t2, t4
13699 // it is better to process t4 first to create a vector of t4[0], then unpack
13700 // that vector with t2.
13701 if (!isSingleElementRepeatedMask(V1Mask) &&
13702 !isSingleElementRepeatedMask(V2Mask))
13703 if (SDValue UnpackPerm =
13704 lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask, DAG))
13705 return UnpackPerm;
13706 if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute(
13707 DL, VT, V1, V2, Mask, Subtarget, DAG))
13708 return RotatePerm;
13709 // Unpack/rotate failed - try again with variable blends.
13710 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
13711 DAG))
13712 return BlendPerm;
13713 if (VT.getScalarSizeInBits() >= 32)
13714 if (SDValue PermUnpack = lowerShuffleAsPermuteAndUnpack(
13715 DL, VT, V1, V2, Mask, Subtarget, DAG))
13716 return PermUnpack;
13717 }
13718
13719 // If the final mask is an alternating blend of vXi8/vXi16, convert to an
13720 // UNPCKL(SHUFFLE, SHUFFLE) pattern.
13721 // TODO: It doesn't have to be alternating - but each lane mustn't have more
13722 // than half the elements coming from each source.
13723 if (IsAlternating && VT.getScalarSizeInBits() < 32) {
13724 V1Mask.assign(NumElts, -1);
13725 V2Mask.assign(NumElts, -1);
13726 FinalMask.assign(NumElts, -1);
13727 for (int i = 0; i != NumElts; i += NumEltsPerLane)
13728 for (int j = 0; j != NumEltsPerLane; ++j) {
13729 int M = Mask[i + j];
13730 if (M >= 0 && M < NumElts) {
13731 V1Mask[i + (j / 2)] = M;
13732 FinalMask[i + j] = i + (j / 2);
13733 } else if (M >= NumElts) {
13734 V2Mask[i + (j / 2)] = M - NumElts;
13735 FinalMask[i + j] = i + (j / 2) + NumElts;
13736 }
13737 }
13738 }
13739
13740 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
13741 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
13742 return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask);
13743}
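
The first loop of the lowering above routes every V1 element to its final position through V1Mask, every V2 element through V2Mask, and then blends the two pre-shuffled vectors with a mask that only distinguishes the sources. A compact sketch of that split (hypothetical names):

#include <vector>

struct DecomposedShuffle {
  std::vector<int> V1Mask, V2Mask, FinalMask;
};

// Illustrative only: split a two-input mask into two single-input masks plus
// the final source-selecting blend mask.
static DecomposedShuffle decompose(const std::vector<int> &Mask) {
  int NumElts = (int)Mask.size();
  DecomposedShuffle D{std::vector<int>(NumElts, -1),
                      std::vector<int>(NumElts, -1),
                      std::vector<int>(NumElts, -1)};
  for (int i = 0; i < NumElts; ++i) {
    int M = Mask[i];
    if (M >= 0 && M < NumElts) {
      D.V1Mask[i] = M;          // shuffle the element into place within V1
      D.FinalMask[i] = i;       // then the blend takes position i from V1
    } else if (M >= NumElts) {
      D.V2Mask[i] = M - NumElts;
      D.FinalMask[i] = i + NumElts;
    }
  }
  return D;
}
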
13744
13745/// Try to lower a vector shuffle as a bit rotation.
13746///
13747/// Look for a repeated rotation pattern in each sub group.
13748/// Returns a ISD::ROTL element rotation amount or -1 if failed.
13749static int matchShuffleAsBitRotate(ArrayRef<int> Mask, int NumSubElts) {
13750 int NumElts = Mask.size();
13751 assert((NumElts % NumSubElts) == 0 && "Illegal shuffle mask");
13752
13753 int RotateAmt = -1;
13754 for (int i = 0; i != NumElts; i += NumSubElts) {
13755 for (int j = 0; j != NumSubElts; ++j) {
13756 int M = Mask[i + j];
13757 if (M < 0)
13758 continue;
13759 if (!isInRange(M, i, i + NumSubElts))
13760 return -1;
13761 int Offset = (NumSubElts - (M - (i + j))) % NumSubElts;
13762 if (0 <= RotateAmt && Offset != RotateAmt)
13763 return -1;
13764 RotateAmt = Offset;
13765 }
13766 }
13767 return RotateAmt;
13768}
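
A constexpr restatement of the matcher, checked on a mask that rotates each group of four elements by one (for vXi8 elements grouped as i32, that is an 8-bit VROTLI). The helper name is illustrative:

// Illustrative only: per-group rotation amount, or -1 if the mask doesn't
// describe the same in-group rotation everywhere.
constexpr int bitRotateAmt(const int *Mask, int NumElts, int NumSubElts) {
  int RotateAmt = -1;
  for (int i = 0; i != NumElts; i += NumSubElts)
    for (int j = 0; j != NumSubElts; ++j) {
      int M = Mask[i + j];
      if (M < 0)
        continue;
      if (M < i || M >= i + NumSubElts)
        return -1;                            // crosses the sub group
      int Offset = (NumSubElts - (M - (i + j))) % NumSubElts;
      if (RotateAmt >= 0 && Offset != RotateAmt)
        return -1;
      RotateAmt = Offset;
    }
  return RotateAmt;
}

constexpr int RotMask[8] = {3, 0, 1, 2, 7, 4, 5, 6};
static_assert(bitRotateAmt(RotMask, 8, 4) == 1, "rotate left by one element");
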
13769
13770static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits,
13771 const X86Subtarget &Subtarget,
13772 ArrayRef<int> Mask) {
13773 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
13774 assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
13775
13776 // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
13777 int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
13778 int MaxSubElts = 64 / EltSizeInBits;
13779 for (int NumSubElts = MinSubElts; NumSubElts <= MaxSubElts; NumSubElts *= 2) {
13780 int RotateAmt = matchShuffleAsBitRotate(Mask, NumSubElts);
13781 if (RotateAmt < 0)
13782 continue;
13783
13784 int NumElts = Mask.size();
13785 MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
13786 RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
13787 return RotateAmt * EltSizeInBits;
13788 }
13789
13790 return -1;
13791}
13792
13793/// Lower shuffle using X86ISD::VROTLI rotations.
13794static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1,
13795 ArrayRef<int> Mask,
13796 const X86Subtarget &Subtarget,
13797 SelectionDAG &DAG) {
13798 // Only XOP + AVX512 targets have bit rotation instructions.
13799 // If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this.
13800 bool IsLegal =
13801 (VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512();
13802 if (!IsLegal && Subtarget.hasSSE3())
13803 return SDValue();
13804
13805 MVT RotateVT;
13806 int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(),
13807 Subtarget, Mask);
13808 if (RotateAmt < 0)
13809 return SDValue();
13810
13811 // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
13812 // expanded to OR(SRL,SHL), will be more efficient, but if they can
13813 // widen to vXi16 or more then the existing lowering will be better.
13814 if (!IsLegal) {
13815 if ((RotateAmt % 16) == 0)
13816 return SDValue();
13817 // TODO: Use getTargetVShiftByConstNode.
13818 unsigned ShlAmt = RotateAmt;
13819 unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt;
13820 V1 = DAG.getBitcast(RotateVT, V1);
13821 SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,
13822 DAG.getTargetConstant(ShlAmt, DL, MVT::i8));
13823 SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,
13824 DAG.getTargetConstant(SrlAmt, DL, MVT::i8));
13825 SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
13826 return DAG.getBitcast(VT, Rot);
13827 }
13828
13829 SDValue Rot =
13830 DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
13831 DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
13832 return DAG.getBitcast(VT, Rot);
13833}
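
The pre-XOP/AVX512 fallback above expands the rotate as OR(SHL, SRL). The underlying identity, as a scalar sketch assuming 32-bit lanes and 0 < RotateAmt < 32 (so neither shift is by the full width):

#include <cstdint>

// Illustrative only: rotl(x, r) == (x << r) | (x >> (w - r)) for 0 < r < w.
constexpr uint32_t rotl32(uint32_t X, unsigned R) {
  return (X << R) | (X >> (32 - R));
}
static_assert(rotl32(0x80000001u, 1) == 0x00000003u, "");
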
13834
13835/// Try to match a vector shuffle as an element rotation.
13836///
13837/// This is used for support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
13838static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2,
13839 ArrayRef<int> Mask) {
13840 int NumElts = Mask.size();
13841
13842 // We need to detect various ways of spelling a rotation:
13843 // [11, 12, 13, 14, 15, 0, 1, 2]
13844 // [-1, 12, 13, 14, -1, -1, 1, -1]
13845 // [-1, -1, -1, -1, -1, -1, 1, 2]
13846 // [ 3, 4, 5, 6, 7, 8, 9, 10]
13847 // [-1, 4, 5, 6, -1, -1, 9, -1]
13848 // [-1, 4, 5, 6, -1, -1, -1, -1]
13849 int Rotation = 0;
13850 SDValue Lo, Hi;
13851 for (int i = 0; i < NumElts; ++i) {
13852 int M = Mask[i];
13853 assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
13854 "Unexpected mask index.");
13855 if (M < 0)
13856 continue;
13857
13858 // Determine where a rotated vector would have started.
13859 int StartIdx = i - (M % NumElts);
13860 if (StartIdx == 0)
13861 // The identity rotation isn't interesting, stop.
13862 return -1;
13863
13864 // If we found the tail of a vector the rotation must be the missing
13865 // front. If we found the head of a vector, it must be how much of the
13866 // head.
13867 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
13868
13869 if (Rotation == 0)
13870 Rotation = CandidateRotation;
13871 else if (Rotation != CandidateRotation)
13872 // The rotations don't match, so we can't match this mask.
13873 return -1;
13874
13875 // Compute which value this mask is pointing at.
13876 SDValue MaskV = M < NumElts ? V1 : V2;
13877
13878 // Compute which of the two target values this index should be assigned
13879 // to. This reflects whether the high elements are remaining or the low
13880 // elements are remaining.
13881 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
13882
13883 // Either set up this value if we've not encountered it before, or check
13884 // that it remains consistent.
13885 if (!TargetV)
13886 TargetV = MaskV;
13887 else if (TargetV != MaskV)
13888 // This may be a rotation, but it pulls from the inputs in some
13889 // unsupported interleaving.
13890 return -1;
13891 }
13892
13893 // Check that we successfully analyzed the mask, and normalize the results.
13894 assert(Rotation != 0 && "Failed to locate a viable rotation!");
13895 assert((Lo || Hi) && "Failed to find a rotated input vector!");
13896 if (!Lo)
13897 Lo = Hi;
13898 else if (!Hi)
13899 Hi = Lo;
13900
13901 V1 = Lo;
13902 V2 = Hi;
13903
13904 return Rotation;
13905}
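
To make the rotation detection above concrete, here is a minimal standalone sketch (a hypothetical helper, not part of this file) that reproduces the StartIdx/CandidateRotation arithmetic on a plain integer mask, ignoring the Lo/Hi source-vector bookkeeping:

#include <cassert>
#include <vector>

// Sketch only: returns the rotation amount encoded by a shuffle mask over two
// concatenated NumElts-wide inputs, or -1 if the mask is not a rotation.
static int matchRotationSketch(const std::vector<int> &Mask) {
  int NumElts = (int)Mask.size();
  int Rotation = 0;
  for (int i = 0; i < NumElts; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;                       // undef lanes are compatible with anything
    int StartIdx = i - (M % NumElts); // where a rotated vector would have started
    if (StartIdx == 0)
      return -1;                      // identity rotation, not interesting
    int Candidate = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
    if (Rotation == 0)
      Rotation = Candidate;
    else if (Rotation != Candidate)
      return -1;                      // inconsistent rotation amounts
  }
  return Rotation;
}

int main() {
  // Both spellings from the comment above encode a rotation by 3 elements.
  assert(matchRotationSketch({11, 12, 13, 14, 15, 0, 1, 2}) == 3);
  assert(matchRotationSketch({-1, 4, 5, 6, -1, -1, 9, -1}) == 3);
}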
13906
13907/// Try to lower a vector shuffle as a byte rotation.
13908///
13909/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
13910/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
13911/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
13912/// try to generically lower a vector shuffle through such a pattern. It
13913/// does not check for the profitability of lowering either as PALIGNR or
13914/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
13915/// This matches shuffle vectors that look like:
13916///
13917/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
13918///
13919/// Essentially it concatenates V1 and V2, shifts right by some number of
13920/// elements, and takes the low elements as the result. Note that while this is
13921/// specified as a *right shift* because x86 is little-endian, it is a *left
13922/// rotate* of the vector lanes.
13923static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
13924 ArrayRef<int> Mask) {
13925 // Don't accept any shuffles with zero elements.
13926 if (isAnyZero(Mask))
13927 return -1;
13928
13929 // PALIGNR works on 128-bit lanes.
13930 SmallVector<int, 16> RepeatedMask;
13931 if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
13932 return -1;
13933
13934 int Rotation = matchShuffleAsElementRotate(V1, V2, RepeatedMask);
13935 if (Rotation <= 0)
13936 return -1;
13937
13938 // PALIGNR rotates bytes, so we need to scale the
13939 // rotation based on how many bytes are in the vector lane.
13940 int NumElts = RepeatedMask.size();
13941 int Scale = 16 / NumElts;
13942 return Rotation * Scale;
13943}
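
A small hedged sketch of the element-to-byte scaling done above (byteRotationSketch is a hypothetical helper, for illustration only):

#include <cassert>

// PALIGNR takes its rotation as a byte immediate, so an element rotation is
// scaled by the element size within a 16-byte (128-bit) lane.
static int byteRotationSketch(int EltRotation, int NumEltsPerLane) {
  int Scale = 16 / NumEltsPerLane; // bytes per element
  return EltRotation * Scale;
}

int main() {
  assert(byteRotationSketch(3, 8) == 6); // v8i16: rotate by 3 elements -> 6 bytes
  assert(byteRotationSketch(1, 4) == 4); // v4i32: rotate by 1 element  -> 4 bytes
}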
13944
13945static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
13946 SDValue V2, ArrayRef<int> Mask,
13947 const X86Subtarget &Subtarget,
13948 SelectionDAG &DAG) {
13949 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
13950
13951 SDValue Lo = V1, Hi = V2;
13952 int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask);
13953 if (ByteRotation <= 0)
13954 return SDValue();
13955
13956 // Cast the inputs to i8 vector of correct length to match PALIGNR or
13957 // PSLLDQ/PSRLDQ.
13958 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
13959 Lo = DAG.getBitcast(ByteVT, Lo);
13960 Hi = DAG.getBitcast(ByteVT, Hi);
13961
13962 // SSSE3 targets can use the palignr instruction.
13963 if (Subtarget.hasSSSE3()) {
13964 assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
13965 "512-bit PALIGNR requires BWI instructions");
13966 return DAG.getBitcast(
13967 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
13968 DAG.getTargetConstant(ByteRotation, DL, MVT::i8)));
13969 }
13970
13971 assert(VT.is128BitVector() &&
13972 "Rotate-based lowering only supports 128-bit lowering!");
13973 assert(Mask.size() <= 16 &&
13974 "Can shuffle at most 16 bytes in a 128-bit vector!");
13975 assert(ByteVT == MVT::v16i8 &&
13976 "SSE2 rotate lowering only needed for v16i8!");
13977
13978 // Default SSE2 implementation
13979 int LoByteShift = 16 - ByteRotation;
13980 int HiByteShift = ByteRotation;
13981
13982 SDValue LoShift =
13983 DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
13984 DAG.getTargetConstant(LoByteShift, DL, MVT::i8));
13985 SDValue HiShift =
13986 DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
13987 DAG.getTargetConstant(HiByteShift, DL, MVT::i8));
13988 return DAG.getBitcast(VT,
13989 DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
13990}
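
As a rough scalar analogy for the pre-SSSE3 VSHLDQ/VSRLDQ/OR sequence above (an illustrative sketch, not the DAG code itself), assuming 0 < Bytes < 8:

#include <cassert>
#include <cstdint>

// Rotating the concatenation Hi:Lo right by N bytes within one 64-bit "lane":
// (Lo << (64 - N*8)) | (Hi >> N*8), i.e. shift the two halves toward each
// other and OR them together. Shift amounts of 0 or 64 would be undefined, so
// this assumes 0 < Bytes < 8, just as the lowering keeps 0 < ByteRotation < 16.
static uint64_t rotateConcatRight(uint64_t Lo, uint64_t Hi, unsigned Bytes) {
  unsigned N = Bytes * 8;
  return (Lo << (64 - N)) | (Hi >> N);
}

int main() {
  // Result is bytes [3..7] of Hi followed by bytes [0..2] of Lo (little-endian).
  assert(rotateConcatRight(0x0807060504030201ULL, 0x1817161514131211ULL, 3) ==
         0x0302011817161514ULL);
}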
13991
13992/// Try to lower a vector shuffle as a dword/qword rotation.
13993///
13994/// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
13995/// rotation of the concatenation of two vectors; this routine will
13996/// try to generically lower a vector shuffle through such a pattern.
13997///
13998/// Essentially it concatenates V1 and V2, shifts right by some number of
13999/// elements, and takes the low elements as the result. Note that while this is
14000/// specified as a *right shift* because x86 is little-endian, it is a *left
14001/// rotate* of the vector lanes.
14002static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1,
14003 SDValue V2, ArrayRef<int> Mask,
14004 const X86Subtarget &Subtarget,
14005 SelectionDAG &DAG) {
14006 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
14007 "Only 32-bit and 64-bit elements are supported!");
14008
14009 // 128/256-bit vectors are only supported with VLX.
14010 assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
14011 && "VLX required for 128/256-bit vectors");
14012
14013 SDValue Lo = V1, Hi = V2;
14014 int Rotation = matchShuffleAsElementRotate(Lo, Hi, Mask);
14015 if (Rotation <= 0)
14016 return SDValue();
14017
14018 return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
14019 DAG.getTargetConstant(Rotation, DL, MVT::i8));
14020}
14021
14022/// Try to lower a vector shuffle as a byte shift sequence.
14023static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1,
14024 SDValue V2, ArrayRef<int> Mask,
14025 const APInt &Zeroable,
14026 const X86Subtarget &Subtarget,
14027 SelectionDAG &DAG) {
14028 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
14029 assert(VT.is128BitVector() && "Only 128-bit vectors supported");
14030
14031 // We need a shuffle that has zeros at one/both ends and a sequential
14032 // shuffle from one source within.
14033 unsigned ZeroLo = Zeroable.countr_one();
14034 unsigned ZeroHi = Zeroable.countl_one();
14035 if (!ZeroLo && !ZeroHi)
14036 return SDValue();
14037
14038 unsigned NumElts = Mask.size();
14039 unsigned Len = NumElts - (ZeroLo + ZeroHi);
14040 if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))
14041 return SDValue();
14042
14043 unsigned Scale = VT.getScalarSizeInBits() / 8;
14044 ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);
14045 if (!isUndefOrInRange(StubMask, 0, NumElts) &&
14046 !isUndefOrInRange(StubMask, NumElts, 2 * NumElts))
14047 return SDValue();
14048
14049 SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
14050 Res = DAG.getBitcast(MVT::v16i8, Res);
14051
14052 // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an
14053 // inner sequential set of elements, possibly offset:
14054 // 01234567 --> zzzzzz01 --> 1zzzzzzz
14055 // 01234567 --> 4567zzzz --> zzzzz456
14056 // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
14057 if (ZeroLo == 0) {
14058 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
14059 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
14060 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
14061 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
14062 DAG.getTargetConstant(Scale * ZeroHi, DL, MVT::i8));
14063 } else if (ZeroHi == 0) {
14064 unsigned Shift = Mask[ZeroLo] % NumElts;
14065 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
14066 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
14067 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
14068 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
14069 } else if (!Subtarget.hasSSSE3()) {
14070 // If we don't have PSHUFB then it's worth avoiding an AND constant mask
14071 // by performing 3 byte shifts. Shuffle combining can kick in above that.
14072 // TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
14073 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
14074 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
14075 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
14076 Shift += Mask[ZeroLo] % NumElts;
14077 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
14078 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
14079 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
14080 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
14081 } else
14082 return SDValue();
14083
14084 return DAG.getBitcast(VT, Res);
14085}
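
A scalar analogy for the shift-based zero masking above, simplified to the case where the kept run stays in place (illustrative sketch only; assumes ZeroLo + ZeroHi < 8 so no shift reaches the full width):

#include <cassert>
#include <cstdint>

// Clearing ZeroHi high bytes and ZeroLo low bytes of a 64-bit value with three
// shifts, avoiding an AND with a constant mask.
static uint64_t keepMiddleBytes(uint64_t V, unsigned ZeroLo, unsigned ZeroHi) {
  V <<= ZeroHi * 8;            // drop the high ZeroHi bytes
  V >>= (ZeroHi + ZeroLo) * 8; // drop the low ZeroLo bytes as well
  V <<= ZeroLo * 8;            // move the kept run back into place
  return V;
}

int main() {
  // Keep bytes [2..5] of an 8-byte value, zeroing two bytes at each end.
  assert(keepMiddleBytes(0x8877665544332211ULL, 2, 2) == 0x0000665544330000ULL);
}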
14086
14087/// Try to lower a vector shuffle as a bit shift (shifts in zeros).
14088///
14089/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
14090/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
14091/// matches elements from one of the input vectors shuffled to the left or
14092/// right with zeroable elements 'shifted in'. It handles both the strictly
14093/// bit-wise element shifts and the byte shift across an entire 128-bit double
14094/// quad word lane.
14095///
14096/// PSLL : (little-endian) left bit shift.
14097/// [ zz, 0, zz, 2 ]
14098/// [ -1, 4, zz, -1 ]
14099/// PSRL : (little-endian) right bit shift.
14100/// [ 1, zz, 3, zz]
14101/// [ -1, -1, 7, zz]
14102/// PSLLDQ : (little-endian) left byte shift
14103/// [ zz, 0, 1, 2, 3, 4, 5, 6]
14104/// [ zz, zz, -1, -1, 2, 3, 4, -1]
14105/// [ zz, zz, zz, zz, zz, zz, -1, 1]
14106/// PSRLDQ : (little-endian) right byte shift
14107/// [ 5, 6, 7, zz, zz, zz, zz, zz]
14108/// [ -1, 5, 6, 7, zz, zz, zz, zz]
14109/// [ 1, 2, -1, -1, -1, -1, zz, zz]
14110static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
14111 unsigned ScalarSizeInBits, ArrayRef<int> Mask,
14112 int MaskOffset, const APInt &Zeroable,
14113 const X86Subtarget &Subtarget) {
14114 int Size = Mask.size();
14115 unsigned SizeInBits = Size * ScalarSizeInBits;
14116
14117 auto CheckZeros = [&](int Shift, int Scale, bool Left) {
14118 for (int i = 0; i < Size; i += Scale)
14119 for (int j = 0; j < Shift; ++j)
14120 if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
14121 return false;
14122
14123 return true;
14124 };
14125
14126 auto MatchShift = [&](int Shift, int Scale, bool Left) {
14127 for (int i = 0; i != Size; i += Scale) {
14128 unsigned Pos = Left ? i + Shift : i;
14129 unsigned Low = Left ? i : i + Shift;
14130 unsigned Len = Scale - Shift;
14131 if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
14132 return -1;
14133 }
14134
14135 int ShiftEltBits = ScalarSizeInBits * Scale;
14136 bool ByteShift = ShiftEltBits > 64;
14137 Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
14138 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
14139 int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
14140
14141 // Normalize the scale for byte shifts to still produce an i64 element
14142 // type.
14143 Scale = ByteShift ? Scale / 2 : Scale;
14144
14145 // We need to round trip through the appropriate type for the shift.
14146 MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
14147 ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
14148 : MVT::getVectorVT(ShiftSVT, Size / Scale);
14149 return (int)ShiftAmt;
14150 };
14151
14152 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
14153 // keep doubling the size of the integer elements up to that. We can
14154 // then shift the elements of the integer vector by whole multiples of
14155 // their width within the elements of the larger integer vector. Test each
14156 // multiple to see if we can find a match with the moved element indices
14157 // and that the shifted in elements are all zeroable.
14158 unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
14159 for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
14160 for (int Shift = 1; Shift != Scale; ++Shift)
14161 for (bool Left : {true, false})
14162 if (CheckZeros(Shift, Scale, Left)) {
14163 int ShiftAmt = MatchShift(Shift, Scale, Left);
14164 if (0 < ShiftAmt)
14165 return ShiftAmt;
14166 }
14167
14168 // no match
14169 return -1;
14170}
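
The immediate returned by MatchShift is in bits for VSHLI/VSRLI and in bytes for VSHLDQ/VSRLDQ. A minimal sketch of that computation (hypothetical helper, for illustration):

#include <cassert>

// Mirrors the ShiftAmt computation above: once the grouped element width
// exceeds 64 bits the shift becomes a whole-lane byte shift.
static int shiftAmountSketch(int Shift, int ScalarSizeInBits, int Scale) {
  bool ByteShift = ScalarSizeInBits * Scale > 64;
  return Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
}

int main() {
  // v4i32 mask [1, zz, 3, zz]: pair i32s into i64s (Scale = 2) and VSRLI by 32.
  assert(shiftAmountSketch(/*Shift=*/1, /*ScalarSizeInBits=*/32, /*Scale=*/2) == 32);
  // v8i16 grouped into a full 128-bit lane (Scale = 8): VSRLDQ by 4 bytes.
  assert(shiftAmountSketch(/*Shift=*/2, /*ScalarSizeInBits=*/16, /*Scale=*/8) == 4);
}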
14171
14172static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
14173 SDValue V2, ArrayRef<int> Mask,
14174 const APInt &Zeroable,
14175 const X86Subtarget &Subtarget,
14176 SelectionDAG &DAG, bool BitwiseOnly) {
14177 int Size = Mask.size();
14178 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
14179
14180 MVT ShiftVT;
14181 SDValue V = V1;
14182 unsigned Opcode;
14183
14184 // Try to match shuffle against V1 shift.
14185 int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
14186 Mask, 0, Zeroable, Subtarget);
14187
14188 // If V1 failed, try to match shuffle against V2 shift.
14189 if (ShiftAmt < 0) {
14190 ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
14191 Mask, Size, Zeroable, Subtarget);
14192 V = V2;
14193 }
14194
14195 if (ShiftAmt < 0)
14196 return SDValue();
14197
14198 if (BitwiseOnly && (Opcode == X86ISD::VSHLDQ || Opcode == X86ISD::VSRLDQ))
14199 return SDValue();
14200
14201 assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
14202 "Illegal integer vector type");
14203 V = DAG.getBitcast(ShiftVT, V);
14204 V = DAG.getNode(Opcode, DL, ShiftVT, V,
14205 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
14206 return DAG.getBitcast(VT, V);
14207}
14208
14209// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
14210// Remainder of lower half result is zero and upper half is all undef.
14211static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
14212 ArrayRef<int> Mask, uint64_t &BitLen,
14213 uint64_t &BitIdx, const APInt &Zeroable) {
14214 int Size = Mask.size();
14215 int HalfSize = Size / 2;
14216 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
14217 assert(!Zeroable.isAllOnes() && "Fully zeroable shuffle mask");
14218
14219 // Upper half must be undefined.
14220 if (!isUndefUpperHalf(Mask))
14221 return false;
14222
14223 // Determine the extraction length from the part of the
14224 // lower half that isn't zeroable.
14225 int Len = HalfSize;
14226 for (; Len > 0; --Len)
14227 if (!Zeroable[Len - 1])
14228 break;
14229 assert(Len > 0 && "Zeroable shuffle mask");
14230
14231 // Attempt to match first Len sequential elements from the lower half.
14232 SDValue Src;
14233 int Idx = -1;
14234 for (int i = 0; i != Len; ++i) {
14235 int M = Mask[i];
14236 if (M == SM_SentinelUndef)
14237 continue;
14238 SDValue &V = (M < Size ? V1 : V2);
14239 M = M % Size;
14240
14241 // The extracted elements must start at a valid index and all mask
14242 // elements must be in the lower half.
14243 if (i > M || M >= HalfSize)
14244 return false;
14245
14246 if (Idx < 0 || (Src == V && Idx == (M - i))) {
14247 Src = V;
14248 Idx = M - i;
14249 continue;
14250 }
14251 return false;
14252 }
14253
14254 if (!Src || Idx < 0)
14255 return false;
14256
14257 assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
14258 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
14259 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
14260 V1 = Src;
14261 return true;
14262}
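
The EXTRQ immediates are expressed in bits and wrap at 64 (a full 64-bit length encodes as 0). A hedged sketch of that encoding (illustrative helper, not the LLVM API):

#include <cassert>
#include <cstdint>

static void extrqImmSketch(int Len, int Idx, int ScalarBits, uint64_t &BitLen,
                           uint64_t &BitIdx) {
  BitLen = (uint64_t)(Len * ScalarBits) & 0x3f; // length in bits, mod 64
  BitIdx = (uint64_t)(Idx * ScalarBits) & 0x3f; // start position in bits, mod 64
}

int main() {
  uint64_t BitLen, BitIdx;
  // v8i16: extract 3 elements starting at element 1 -> 48 bits from bit 16.
  extrqImmSketch(3, 1, 16, BitLen, BitIdx);
  assert(BitLen == 48 && BitIdx == 16);
  // Extracting the whole lower half (64 bits) encodes a length of 0.
  extrqImmSketch(4, 0, 16, BitLen, BitIdx);
  assert(BitLen == 0 && BitIdx == 0);
}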
14263
14264// INSERTQ: Extract lowest Len elements from lower half of second source and
14265// insert over first source, starting at Idx.
14266// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
14267static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
14268 ArrayRef<int> Mask, uint64_t &BitLen,
14269 uint64_t &BitIdx) {
14270 int Size = Mask.size();
14271 int HalfSize = Size / 2;
14272 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
14273
14274 // Upper half must be undefined.
14275 if (!isUndefUpperHalf(Mask))
14276 return false;
14277
14278 for (int Idx = 0; Idx != HalfSize; ++Idx) {
14279 SDValue Base;
14280
14281 // Attempt to match first source from mask before insertion point.
14282 if (isUndefInRange(Mask, 0, Idx)) {
14283 /* EMPTY */
14284 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
14285 Base = V1;
14286 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
14287 Base = V2;
14288 } else {
14289 continue;
14290 }
14291
14292 // Extend the extraction length looking to match both the insertion of
14293 // the second source and the remaining elements of the first.
14294 for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
14295 SDValue Insert;
14296 int Len = Hi - Idx;
14297
14298 // Match insertion.
14299 if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
14300 Insert = V1;
14301 } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
14302 Insert = V2;
14303 } else {
14304 continue;
14305 }
14306
14307 // Match the remaining elements of the lower half.
14308 if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
14309 /* EMPTY */
14310 } else if ((!Base || (Base == V1)) &&
14311 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
14312 Base = V1;
14313 } else if ((!Base || (Base == V2)) &&
14314 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
14315 Size + Hi)) {
14316 Base = V2;
14317 } else {
14318 continue;
14319 }
14320
14321 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
14322 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
14323 V1 = Base;
14324 V2 = Insert;
14325 return true;
14326 }
14327 }
14328
14329 return false;
14330}
14331
14332/// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
14333static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
14334 SDValue V2, ArrayRef<int> Mask,
14335 const APInt &Zeroable, SelectionDAG &DAG) {
14336 uint64_t BitLen, BitIdx;
14337 if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
14338 return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
14339 DAG.getTargetConstant(BitLen, DL, MVT::i8),
14340 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
14341
14342 if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
14343 return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
14344 V2 ? V2 : DAG.getUNDEF(VT),
14345 DAG.getTargetConstant(BitLen, DL, MVT::i8),
14346 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
14347
14348 return SDValue();
14349}
14350
14351/// Lower a vector shuffle as a zero or any extension.
14352///
14353/// Given a specific number of elements, element bit width, and extension
14354/// stride, produce either a zero or any extension based on the available
14355/// features of the subtarget. The extended elements are consecutive and
14356/// begin and can start from an offsetted element index in the input; to
14357/// avoid excess shuffling the offset must either being in the bottom lane
14358/// or at the start of a higher lane. All extended elements must be from
14359/// the same lane.
14360static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(
14361 const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
14362 ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
14363 assert(Scale > 1 && "Need a scale to extend.");
14364 int EltBits = VT.getScalarSizeInBits();
14365 int NumElements = VT.getVectorNumElements();
14366 int NumEltsPerLane = 128 / EltBits;
14367 int OffsetLane = Offset / NumEltsPerLane;
14368 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
14369 "Only 8, 16, and 32 bit elements can be extended.");
14370 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
14371 assert(0 <= Offset && "Extension offset must be positive.");
14372 assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
14373 "Extension offset must be in the first lane or start an upper lane.");
14374
14375 // Check that an index is in same lane as the base offset.
14376 auto SafeOffset = [&](int Idx) {
14377 return OffsetLane == (Idx / NumEltsPerLane);
14378 };
14379
14380 // Shift along an input so that the offset base moves to the first element.
14381 auto ShuffleOffset = [&](SDValue V) {
14382 if (!Offset)
14383 return V;
14384
14385 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
14386 for (int i = 0; i * Scale < NumElements; ++i) {
14387 int SrcIdx = i + Offset;
14388 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
14389 }
14390 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
14391 };
14392
14393 // Found a valid a/zext mask! Try various lowering strategies based on the
14394 // input type and available ISA extensions.
14395 if (Subtarget.hasSSE41()) {
14396 // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
14397 // PUNPCK will catch this in a later shuffle match.
14398 if (Offset && Scale == 2 && VT.is128BitVector())
14399 return SDValue();
14400 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
14401 NumElements / Scale);
14402 InputV = DAG.getBitcast(VT, InputV);
14403 InputV = ShuffleOffset(InputV);
14404 InputV = getEXTEND_VECTOR_INREG(AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND,
14405 DL, ExtVT, InputV, DAG);
14406 return DAG.getBitcast(VT, InputV);
14407 }
14408
14409 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
14410 InputV = DAG.getBitcast(VT, InputV);
14411
14412 // For any extends we can cheat for larger element sizes and use shuffle
14413 // instructions that can fold with a load and/or copy.
14414 if (AnyExt && EltBits == 32) {
14415 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
14416 -1};
14417 return DAG.getBitcast(
14418 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
14419 DAG.getBitcast(MVT::v4i32, InputV),
14420 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
14421 }
14422 if (AnyExt && EltBits == 16 && Scale > 2) {
14423 int PSHUFDMask[4] = {Offset / 2, -1,
14424 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
14425 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
14426 DAG.getBitcast(MVT::v4i32, InputV),
14427 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
14428 int PSHUFWMask[4] = {1, -1, -1, -1};
14429 unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
14430 return DAG.getBitcast(
14431 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
14432 DAG.getBitcast(MVT::v8i16, InputV),
14433 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
14434 }
14435
14436 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
14437 // to 64-bits.
14438 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
14439 assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
14440 assert(VT.is128BitVector() && "Unexpected vector width!");
14441
14442 int LoIdx = Offset * EltBits;
14443 SDValue Lo = DAG.getBitcast(
14444 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
14445 DAG.getTargetConstant(EltBits, DL, MVT::i8),
14446 DAG.getTargetConstant(LoIdx, DL, MVT::i8)));
14447
14448 if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1))
14449 return DAG.getBitcast(VT, Lo);
14450
14451 int HiIdx = (Offset + 1) * EltBits;
14452 SDValue Hi = DAG.getBitcast(
14453 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
14454 DAG.getTargetConstant(EltBits, DL, MVT::i8),
14455 DAG.getTargetConstant(HiIdx, DL, MVT::i8)));
14456 return DAG.getBitcast(VT,
14457 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
14458 }
14459
14460 // If this would require more than 2 unpack instructions to expand, use
14461 // pshufb when available. We can only use more than 2 unpack instructions
14462 // when zero extending i8 elements which also makes it easier to use pshufb.
14463 if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
14464 assert(NumElements == 16 && "Unexpected byte vector width!");
14465 SDValue PSHUFBMask[16];
14466 for (int i = 0; i < 16; ++i) {
14467 int Idx = Offset + (i / Scale);
14468 if ((i % Scale == 0 && SafeOffset(Idx))) {
14469 PSHUFBMask[i] = DAG.getConstant(Idx, DL, MVT::i8);
14470 continue;
14471 }
14472 PSHUFBMask[i] =
14473 AnyExt ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(0x80, DL, MVT::i8);
14474 }
14475 InputV = DAG.getBitcast(MVT::v16i8, InputV);
14476 return DAG.getBitcast(
14477 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
14478 DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
14479 }
14480
14481 // If we are extending from an offset, ensure we start on a boundary that
14482 // we can unpack from.
14483 int AlignToUnpack = Offset % (NumElements / Scale);
14484 if (AlignToUnpack) {
14485 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
14486 for (int i = AlignToUnpack; i < NumElements; ++i)
14487 ShMask[i - AlignToUnpack] = i;
14488 InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
14489 Offset -= AlignToUnpack;
14490 }
14491
14492 // Otherwise emit a sequence of unpacks.
14493 do {
14494 unsigned UnpackLoHi = X86ISD::UNPCKL;
14495 if (Offset >= (NumElements / 2)) {
14496 UnpackLoHi = X86ISD::UNPCKH;
14497 Offset -= (NumElements / 2);
14498 }
14499
14500 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
14501 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
14502 : getZeroVector(InputVT, Subtarget, DAG, DL);
14503 InputV = DAG.getBitcast(InputVT, InputV);
14504 InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
14505 Scale /= 2;
14506 EltBits *= 2;
14507 NumElements /= 2;
14508 } while (Scale > 1);
14509 return DAG.getBitcast(VT, InputV);
14510}
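
A quick trace of the unpack loop above as a sketch (plain arithmetic, no DAG nodes): each interleave with zero doubles the element width and halves both the remaining scale and the element count.

#include <cassert>

int main() {
  // Zero-extending v16i8 elements to i32 (Scale = 4).
  int Scale = 4, EltBits = 8, NumElements = 16, Unpacks = 0;
  while (Scale > 1) {
    Scale /= 2;        // each unpack halves the remaining extension factor
    EltBits *= 2;      // ...while doubling the element width
    NumElements /= 2;  // ...and halving the element count
    ++Unpacks;
  }
  // Two unpack steps: i8 -> i16 -> i32, leaving a v4i32 result.
  assert(Unpacks == 2 && EltBits == 32 && NumElements == 4);
}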
14511
14512/// Try to lower a vector shuffle as a zero extension on any microarch.
14513///
14514/// This routine will try to do everything in its power to cleverly lower
14515/// a shuffle which happens to match the pattern of a zero extend. It doesn't
14516/// check for the profitability of this lowering, it tries to aggressively
14517/// match this pattern. It will use all of the micro-architectural details it
14518/// can to emit an efficient lowering. It handles both blends with all-zero
14519/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
14520/// masking out later).
14521///
14522/// The reason we have dedicated lowering for zext-style shuffles is that they
14523/// are both incredibly common and often quite performance sensitive.
14524static SDValue lowerShuffleAsZeroOrAnyExtend(
14525 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14526 const APInt &Zeroable, const X86Subtarget &Subtarget,
14527 SelectionDAG &DAG) {
14528 int Bits = VT.getSizeInBits();
14529 int NumLanes = Bits / 128;
14530 int NumElements = VT.getVectorNumElements();
14531 int NumEltsPerLane = NumElements / NumLanes;
14532 assert(VT.getScalarSizeInBits() <= 32 &&
14533 "Exceeds 32-bit integer zero extension limit");
14534 assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
14535
14536 // Define a helper function to check a particular ext-scale and lower to it if
14537 // valid.
14538 auto Lower = [&](int Scale) -> SDValue {
14539 SDValue InputV;
14540 bool AnyExt = true;
14541 int Offset = 0;
14542 int Matches = 0;
14543 for (int i = 0; i < NumElements; ++i) {
14544 int M = Mask[i];
14545 if (M < 0)
14546 continue; // Valid anywhere but doesn't tell us anything.
14547 if (i % Scale != 0) {
14548 // Each of the extended elements need to be zeroable.
14549 if (!Zeroable[i])
14550 return SDValue();
14551
14552 // We no longer are in the anyext case.
14553 AnyExt = false;
14554 continue;
14555 }
14556
14557 // Each of the base elements needs to be consecutive indices into the
14558 // same input vector.
14559 SDValue V = M < NumElements ? V1 : V2;
14560 M = M % NumElements;
14561 if (!InputV) {
14562 InputV = V;
14563 Offset = M - (i / Scale);
14564 } else if (InputV != V)
14565 return SDValue(); // Flip-flopping inputs.
14566
14567 // Offset must start in the lowest 128-bit lane or at the start of an
14568 // upper lane.
14569 // FIXME: Is it ever worth allowing a negative base offset?
14570 if (!((0 <= Offset && Offset < NumEltsPerLane) ||
14571 (Offset % NumEltsPerLane) == 0))
14572 return SDValue();
14573
14574 // If we are offsetting, all referenced entries must come from the same
14575 // lane.
14576 if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
14577 return SDValue();
14578
14579 if ((M % NumElements) != (Offset + (i / Scale)))
14580 return SDValue(); // Non-consecutive strided elements.
14581 Matches++;
14582 }
14583
14584 // If we fail to find an input, we have a zero-shuffle which should always
14585 // have already been handled.
14586 // FIXME: Maybe handle this here in case during blending we end up with one?
14587 if (!InputV)
14588 return SDValue();
14589
14590 // If we are offsetting, don't extend if we only match a single input, we
14591 // can always do better by using a basic PSHUF or PUNPCK.
14592 if (Offset != 0 && Matches < 2)
14593 return SDValue();
14594
14595 return lowerShuffleAsSpecificZeroOrAnyExtend(DL, VT, Scale, Offset, AnyExt,
14596 InputV, Mask, Subtarget, DAG);
14597 };
14598
14599 // The widest scale possible for extending is to a 64-bit integer.
14600 assert(Bits % 64 == 0 &&
14601 "The number of bits in a vector must be divisible by 64 on x86!");
14602 int NumExtElements = Bits / 64;
14603
14604 // Each iteration, try extending the elements half as much, but into twice as
14605 // many elements.
14606 for (; NumExtElements < NumElements; NumExtElements *= 2) {
14607 assert(NumElements % NumExtElements == 0 &&
14608 "The input vector size must be divisible by the extended size.");
14609 if (SDValue V = Lower(NumElements / NumExtElements))
14610 return V;
14611 }
14612
14613 // General extends failed, but 128-bit vectors may be able to use MOVQ.
14614 if (Bits != 128)
14615 return SDValue();
14616
14617 // Returns one of the source operands if the shuffle can be reduced to a
14618 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
14619 auto CanZExtLowHalf = [&]() {
14620 for (int i = NumElements / 2; i != NumElements; ++i)
14621 if (!Zeroable[i])
14622 return SDValue();
14623 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
14624 return V1;
14625 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
14626 return V2;
14627 return SDValue();
14628 };
14629
14630 if (SDValue V = CanZExtLowHalf()) {
14631 V = DAG.getBitcast(MVT::v2i64, V);
14632 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
14633 return DAG.getBitcast(VT, V);
14634 }
14635
14636 // No viable ext lowering found.
14637 return SDValue();
14638}
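
For reference, a sketch of the ext-scale search driven by the loop above: starting from the widest 64-bit extension, each iteration doubles the number of extended elements, i.e. halves the extension scale, until the original element count is reached.

#include <cassert>
#include <vector>

int main() {
  const int Bits = 128, NumElements = 8; // e.g. a v8i16 shuffle
  std::vector<int> ScalesTried;
  for (int NumExtElements = Bits / 64; NumExtElements < NumElements;
       NumExtElements *= 2)
    ScalesTried.push_back(NumElements / NumExtElements);
  // For v8i16 this tries i16->i64 (Scale 4) and then i16->i32 (Scale 2).
  assert((ScalesTried == std::vector<int>{4, 2}));
}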
14639
14640/// Try to get a scalar value for a specific element of a vector.
14641///
14642/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
14643static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
14644 SelectionDAG &DAG) {
14645 MVT VT = V.getSimpleValueType();
14646 MVT EltVT = VT.getVectorElementType();
14647 V = peekThroughBitcasts(V);
14648
14649 // If the bitcasts shift the element size, we can't extract an equivalent
14650 // element from it.
14651 MVT NewVT = V.getSimpleValueType();
14652 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
14653 return SDValue();
14654
14655 if (V.getOpcode() == ISD::BUILD_VECTOR ||
14656 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
14657 // Ensure the scalar operand is the same size as the destination.
14658 // FIXME: Add support for scalar truncation where possible.
14659 SDValue S = V.getOperand(Idx);
14660 if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
14661 return DAG.getBitcast(EltVT, S);
14662 }
14663
14664 return SDValue();
14665}
14666
14667/// Helper to test for a load that can be folded with x86 shuffles.
14668///
14669/// This is particularly important because the set of instructions varies
14670/// significantly based on whether the operand is a load or not.
14671static bool isShuffleFoldableLoad(SDValue V) {
14672 return V->hasOneUse() &&
14673 ISD::isNON_EXTLoad(peekThroughOneUseBitcasts(V).getNode());
14674}
14675
14676template<typename T>
14677static bool isSoftFP16(T VT, const X86Subtarget &Subtarget) {
14678 return VT.getScalarType() == MVT::f16 && !Subtarget.hasFP16();
14679}
14680
14681template<typename T>
14682bool X86TargetLowering::isSoftFP16(T VT) const {
14683 return ::isSoftFP16(VT, Subtarget);
14684}
14685
14686/// Try to lower insertion of a single element into a zero vector.
14687///
14688/// This is a common pattern for which we have especially efficient lowerings
14689/// across all subtarget feature sets.
14690static SDValue lowerShuffleAsElementInsertion(
14691 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14692 const APInt &Zeroable, const X86Subtarget &Subtarget,
14693 SelectionDAG &DAG) {
14694 MVT ExtVT = VT;
14695 MVT EltVT = VT.getVectorElementType();
14696 unsigned NumElts = VT.getVectorNumElements();
14697 unsigned EltBits = VT.getScalarSizeInBits();
14698
14699 if (isSoftFP16(EltVT, Subtarget))
14700 return SDValue();
14701
14702 int V2Index =
14703 find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
14704 Mask.begin();
14705 bool IsV1Constant = getTargetConstantFromNode(V1) != nullptr;
14706 bool IsV1Zeroable = true;
14707 for (int i = 0, Size = Mask.size(); i < Size; ++i)
14708 if (i != V2Index && !Zeroable[i]) {
14709 IsV1Zeroable = false;
14710 break;
14711 }
14712
14713 // Bail if a non-zero V1 isn't used in place.
14714 if (!IsV1Zeroable) {
14715 SmallVector<int, 8> V1Mask(Mask);
14716 V1Mask[V2Index] = -1;
14717 if (!isNoopShuffleMask(V1Mask))
14718 return SDValue();
14719 }
14720
14721 // Check for a single input from a SCALAR_TO_VECTOR node.
14722 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
14723 // all the smarts here sunk into that routine. However, the current
14724 // lowering of BUILD_VECTOR makes that nearly impossible until the old
14725 // vector shuffle lowering is dead.
14726 SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
14727 DAG);
14728 if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
14729 // We need to zext the scalar if it is smaller than an i32.
14730 V2S = DAG.getBitcast(EltVT, V2S);
14731 if (EltVT == MVT::i8 || (EltVT == MVT::i16 && !Subtarget.hasFP16())) {
14732 // Using zext to expand a narrow element won't work for non-zero
14733 // insertions. But we can use a masked constant vector if we're
14734 // inserting V2 into the bottom of V1.
14735 if (!IsV1Zeroable && !(IsV1Constant && V2Index == 0))
14736 return SDValue();
14737
14738 // Zero-extend directly to i32.
14739 ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
14740 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
14741
14742 // If we're inserting into a constant, mask off the inserted index
14743 // and OR with the zero-extended scalar.
14744 if (!IsV1Zeroable) {
14745 SmallVector<APInt> Bits(NumElts, APInt::getAllOnes(EltBits));
14746 Bits[V2Index] = APInt::getZero(EltBits);
14747 SDValue BitMask = getConstVector(Bits, VT, DAG, DL);
14748 V1 = DAG.getNode(ISD::AND, DL, VT, V1, BitMask);
14749 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
14750 V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2));
14751 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
14752 }
14753 }
14754 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
14755 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
14756 EltVT == MVT::i16) {
14757 // Either not inserting from the low element of the input or the input
14758 // element size is too small to use VZEXT_MOVL to clear the high bits.
14759 return SDValue();
14760 }
14761
14762 if (!IsV1Zeroable) {
14763 // If V1 can't be treated as a zero vector we have fewer options to lower
14764 // this. We can't support integer vectors or non-zero targets cheaply.
14765 assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
14766 if (!VT.isFloatingPoint() || V2Index != 0)
14767 return SDValue();
14768 if (!VT.is128BitVector())
14769 return SDValue();
14770
14771 // Otherwise, use MOVSD, MOVSS or MOVSH.
14772 unsigned MovOpc = 0;
14773 if (EltVT == MVT::f16)
14774 MovOpc = X86ISD::MOVSH;
14775 else if (EltVT == MVT::f32)
14776 MovOpc = X86ISD::MOVSS;
14777 else if (EltVT == MVT::f64)
14778 MovOpc = X86ISD::MOVSD;
14779 else
14780 llvm_unreachable("Unsupported floating point element type to handle!")::llvm::llvm_unreachable_internal("Unsupported floating point element type to handle!"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 14780)
;
14781 return DAG.getNode(MovOpc, DL, ExtVT, V1, V2);
14782 }
14783
14784 // This lowering only works for the low element with floating point vectors.
14785 if (VT.isFloatingPoint() && V2Index != 0)
14786 return SDValue();
14787
14788 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
14789 if (ExtVT != VT)
14790 V2 = DAG.getBitcast(VT, V2);
14791
14792 if (V2Index != 0) {
14793 // If we have 4 or fewer lanes we can cheaply shuffle the element into
14794 // the desired position. Otherwise it is more efficient to do a vector
14795 // shift left. We know that we can do a vector shift left because all
14796 // the inputs are zero.
14797 if (VT.isFloatingPoint() || NumElts <= 4) {
14798 SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
14799 V2Shuffle[V2Index] = 0;
14800 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
14801 } else {
14802 V2 = DAG.getBitcast(MVT::v16i8, V2);
14803 V2 = DAG.getNode(
14804 X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
14805 DAG.getTargetConstant(V2Index * EltBits / 8, DL, MVT::i8));
14806 V2 = DAG.getBitcast(VT, V2);
14807 }
14808 }
14809 return V2;
14810}
14811
14812/// Try to lower broadcast of a single - truncated - integer element,
14813/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
14814///
14815/// This assumes we have AVX2.
14816static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0,
14817 int BroadcastIdx,
14818 const X86Subtarget &Subtarget,
14819 SelectionDAG &DAG) {
14820 assert(Subtarget.hasAVX2() &&
14821 "We can only lower integer broadcasts with AVX2!");
14822
14823 MVT EltVT = VT.getVectorElementType();
14824 MVT V0VT = V0.getSimpleValueType();
14825
14826 assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
14827 assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
14828
14829 MVT V0EltVT = V0VT.getVectorElementType();
14830 if (!V0EltVT.isInteger())
14831 return SDValue();
14832
14833 const unsigned EltSize = EltVT.getSizeInBits();
14834 const unsigned V0EltSize = V0EltVT.getSizeInBits();
14835
14836 // This is only a truncation if the original element type is larger.
14837 if (V0EltSize <= EltSize)
14838 return SDValue();
14839
14840 assert(((V0EltSize % EltSize) == 0) &&
14841 "Scalar type sizes must all be powers of 2 on x86!");
14842
14843 const unsigned V0Opc = V0.getOpcode();
14844 const unsigned Scale = V0EltSize / EltSize;
14845 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
14846
14847 if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
14848 V0Opc != ISD::BUILD_VECTOR)
14849 return SDValue();
14850
14851 SDValue Scalar = V0.getOperand(V0BroadcastIdx);
14852
14853 // If we're extracting non-least-significant bits, shift so we can truncate.
14854 // Hopefully, we can fold away the trunc/srl/load into the broadcast.
14855 // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
14856 // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
14857 if (const int OffsetIdx = BroadcastIdx % Scale)
14858 Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
14859 DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8));
14860
14861 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
14862 DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
14863}
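
A minimal scalar sketch of the index arithmetic above (hypothetical values, for illustration): to broadcast an i8 element out of a build_vector of i32s, pick the containing i32 and shift it right so the wanted byte lands in the low bits before truncating.

#include <cassert>
#include <cstdint>

int main() {
  const unsigned EltSize = 8, V0EltSize = 32;
  const unsigned Scale = V0EltSize / EltSize;           // 4 i8 elements per i32
  const unsigned BroadcastIdx = 6;                      // byte index to broadcast
  const unsigned V0BroadcastIdx = BroadcastIdx / Scale; // lives in i32 element 1
  const unsigned OffsetIdx = BroadcastIdx % Scale;      // as its byte 2
  uint32_t Scalar = 0xDDCCBBAA;                         // stand-in for V0[1]
  uint8_t Byte = (uint8_t)(Scalar >> (OffsetIdx * EltSize));
  assert(V0BroadcastIdx == 1 && OffsetIdx == 2 && Byte == 0xCC);
}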
14864
14865/// Test whether this can be lowered with a single SHUFPS instruction.
14866///
14867/// This is used to disable more specialized lowerings when the shufps lowering
14868/// will happen to be efficient.
14869static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
14870 // This routine only handles 128-bit shufps.
14871 assert(Mask.size() == 4 && "Unsupported mask size!");
14872 assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
14873 assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
14874 assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
14875 assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
14876
14877 // To lower with a single SHUFPS we need to have the low half and high half
14878 // each requiring a single input.
14879 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
14880 return false;
14881 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
14882 return false;
14883
14884 return true;
14885}
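
The halves test above can be illustrated with a small sketch (hypothetical helper; mask indices 0-3 select V1 and 4-7 select V2):

#include <array>
#include <cassert>

// Each half of the 4-element mask must draw from a single input.
static bool singleShufpsSketch(std::array<int, 4> M) {
  if (M[0] >= 0 && M[1] >= 0 && (M[0] < 4) != (M[1] < 4))
    return false;
  if (M[2] >= 0 && M[3] >= 0 && (M[2] < 4) != (M[3] < 4))
    return false;
  return true;
}

int main() {
  assert(singleShufpsSketch({0, 3, 6, 5}));  // low half from V1, high half from V2
  assert(!singleShufpsSketch({0, 4, 2, 6})); // low half mixes V1 and V2
}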
14886
14887/// Test whether the specified input (0 or 1) is in-place blended by the
14888/// given mask.
14889///
14890/// This returns true if the elements from a particular input are already in the
14891/// slot required by the given mask and require no permutation.
14892static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
14893 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
14894 int Size = Mask.size();
14895 for (int i = 0; i < Size; ++i)
14896 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
14897 return false;
14898
14899 return true;
14900}
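
An illustrative standalone mirror of the in-place test above (inputInPlace is a made-up name for this sketch): every element taken from the chosen input (0 = first source, 1 = second source) must already sit in its own slot.

#include <cassert>
#include <vector>

static bool inputInPlace(int Input, const std::vector<int> &Mask) {
  int Size = (int)Mask.size();
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
      return false;
  return true;
}

int main() {
  assert(inputInPlace(0, {0, 5, 2, 7}));  // V1's elements 0 and 2 already sit in slots 0 and 2
  assert(!inputInPlace(1, {0, 4, 2, 7})); // V2's element 0 would have to move into slot 1
  return 0;
}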
14901
14902/// If we are extracting two 128-bit halves of a vector and shuffling the
14903/// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
14904/// multi-shuffle lowering.
14905static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
14906 SDValue N1, ArrayRef<int> Mask,
14907 SelectionDAG &DAG) {
14908 MVT VT = N0.getSimpleValueType();
14909 assert((VT.is128BitVector() &&
14910 (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
14911 "VPERM* family of shuffles requires 32-bit or 64-bit elements");
14912
14913 // Check that both sources are extracts of the same source vector.
14914 if (N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
14915 N1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
14916 N0.getOperand(0) != N1.getOperand(0) ||
14917 !N0.hasOneUse() || !N1.hasOneUse())
14918 return SDValue();
14919
14920 SDValue WideVec = N0.getOperand(0);
14921 MVT WideVT = WideVec.getSimpleValueType();
14922 if (!WideVT.is256BitVector())
14923 return SDValue();
14924
14925 // Match extracts of each half of the wide source vector. Commute the shuffle
14926 // if the extract of the low half is N1.
14927 unsigned NumElts = VT.getVectorNumElements();
14928 SmallVector<int, 4> NewMask(Mask);
14929 const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1);
14930 const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1);
14931 if (ExtIndex1 == 0 && ExtIndex0 == NumElts)
14932 ShuffleVectorSDNode::commuteMask(NewMask);
14933 else if (ExtIndex0 != 0 || ExtIndex1 != NumElts)
14934 return SDValue();
14935
14936 // Final bailout: if the mask is simple, we are better off using an extract
14937 // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
14938 // because that avoids a constant load from memory.
14939 if (NumElts == 4 &&
14940 (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask, DAG)))
14941 return SDValue();
14942
14943 // Extend the shuffle mask with undef elements.
14944 NewMask.append(NumElts, -1);
14945
14946 // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
14947 SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT),
14948 NewMask);
14949 // This is free: ymm -> xmm.
14950 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf,
14951 DAG.getIntPtrConstant(0, DL));
14952}
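
A scalar model (illustrative only; the shuffle helper below is not LLVM code) of the rewrite performed above for a 256-bit source viewed as 8 x i32: shuffling the two extracted 128-bit halves with mask M gives the same lanes as shuffling the wide vector with M extended by undefs and then keeping lanes 0-3.

#include <cassert>
#include <vector>

// Generic two-input shuffle on plain integer "lanes"; -1 means undef.
static std::vector<int> shuffle(const std::vector<int> &A,
                                const std::vector<int> &B,
                                const std::vector<int> &M) {
  std::vector<int> R;
  for (int Idx : M)
    R.push_back(Idx < 0 ? -1
                        : (Idx < (int)A.size() ? A[Idx] : B[Idx - A.size()]));
  return R;
}

int main() {
  std::vector<int> Wide = {10, 11, 12, 13, 14, 15, 16, 17}; // 8 x i32 source
  std::vector<int> Lo(Wide.begin(), Wide.begin() + 4);      // extract at index 0
  std::vector<int> Hi(Wide.begin() + 4, Wide.end());        // extract at index 4
  std::vector<int> Mask = {0, 5, 2, 7};

  std::vector<int> Narrow = shuffle(Lo, Hi, Mask);

  std::vector<int> WideMask = Mask;
  WideMask.insert(WideMask.end(), 4, -1);                   // extend with undef
  std::vector<int> Undef(8, -1);
  std::vector<int> WideShuf = shuffle(Wide, Undef, WideMask);
  std::vector<int> First4(WideShuf.begin(), WideShuf.begin() + 4);

  assert(Narrow == First4); // same result, one wide shuffle instead of two extracts + shuffle
  return 0;
}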
14953
14954/// Try to lower broadcast of a single element.
14955///
14956/// For convenience, this code also bundles all of the subtarget feature set
14957/// filtering. While a little annoying to re-dispatch on type here, there isn't
14958/// a convenient way to factor it out.
14959static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
14960 SDValue V2, ArrayRef<int> Mask,
14961 const X86Subtarget &Subtarget,
14962 SelectionDAG &DAG) {
14963 if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
14964 (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
14965 (Subtarget.hasAVX2() && VT.isInteger())))
14966 return SDValue();
14967
14968 // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
14969 // we can only broadcast from a register with AVX2.
14970 unsigned NumEltBits = VT.getScalarSizeInBits();
14971 unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
14972 ? X86ISD::MOVDDUP
14973 : X86ISD::VBROADCAST;
14974 bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
14975
14976 // Check that the mask is a broadcast.
14977 int BroadcastIdx = getSplatIndex(Mask);
14978 if (BroadcastIdx < 0)
14979 return SDValue();
14980 assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
14981 "a sorted mask where the broadcast "
14982 "comes from V1.");
14983
14984 // Go up the chain of (vector) values to find a scalar load that we can
14985 // combine with the broadcast.
14986 // TODO: Combine this logic with findEltLoadSrc() used by
14987 // EltsFromConsecutiveLoads().
14988 int BitOffset = BroadcastIdx * NumEltBits;
14989 SDValue V = V1;
14990 for (;;) {
14991 switch (V.getOpcode()) {
14992 case ISD::BITCAST: {
14993 V = V.getOperand(0);
14994 continue;
14995 }
14996 case ISD::CONCAT_VECTORS: {
14997 int OpBitWidth = V.getOperand(0).getValueSizeInBits();
14998 int OpIdx = BitOffset / OpBitWidth;
14999 V = V.getOperand(OpIdx);
15000 BitOffset %= OpBitWidth;
15001 continue;
15002 }
15003 case ISD::EXTRACT_SUBVECTOR: {
15004 // The extraction index adds to the existing offset.
15005 unsigned EltBitWidth = V.getScalarValueSizeInBits();
15006 unsigned Idx = V.getConstantOperandVal(1);
15007 unsigned BeginOffset = Idx * EltBitWidth;
15008 BitOffset += BeginOffset;
15009 V = V.getOperand(0);
15010 continue;
15011 }
15012 case ISD::INSERT_SUBVECTOR: {
15013 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
15014 int EltBitWidth = VOuter.getScalarValueSizeInBits();
15015 int Idx = (int)V.getConstantOperandVal(2);
15016 int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
15017 int BeginOffset = Idx * EltBitWidth;
15018 int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
15019 if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
15020 BitOffset -= BeginOffset;
15021 V = VInner;
15022 } else {
15023 V = VOuter;
15024 }
15025 continue;
15026 }
15027 }
15028 break;
15029 }
15030 assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
15031 BroadcastIdx = BitOffset / NumEltBits;
15032
15033 // Do we need to bitcast the source to retrieve the original broadcast index?
15034 bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;
15035
15036 // Check if this is a broadcast of a scalar. We special case lowering
15037 // for scalars so that we can more effectively fold with loads.
15038 // If the original value has a larger element type than the shuffle, the
15039 // broadcast element is in essence truncated. Make that explicit to ease
15040 // folding.
15041 if (BitCastSrc && VT.isInteger())
15042 if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast(
15043 DL, VT, V, BroadcastIdx, Subtarget, DAG))
15044 return TruncBroadcast;
15045
15046 // Also check the simpler case, where we can directly reuse the scalar.
15047 if (!BitCastSrc &&
15048 ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
15049 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {
15050 V = V.getOperand(BroadcastIdx);
15051
15052 // If we can't broadcast from a register, check that the input is a load.
15053 if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
15054 return SDValue();
15055 } else if (ISD::isNormalLoad(V.getNode()) &&
15056 cast<LoadSDNode>(V)->isSimple()) {
15057 // We do not check for one-use of the vector load because a broadcast load
15058 // is expected to be a win for code size, register pressure, and possibly
15059 // uops even if the original vector load is not eliminated.
15060
15061 // Reduce the vector load and shuffle to a broadcasted scalar load.
15062 LoadSDNode *Ld = cast<LoadSDNode>(V);
15063 SDValue BaseAddr = Ld->getOperand(1);
15064 MVT SVT = VT.getScalarType();
15065 unsigned Offset = BroadcastIdx * SVT.getStoreSize();
15066 assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
15067 SDValue NewAddr =
15068 DAG.getMemBasePlusOffset(BaseAddr, TypeSize::Fixed(Offset), DL);
15069
15070 // Directly form VBROADCAST_LOAD if we're using VBROADCAST opcode rather
15071 // than MOVDDUP.
15072 // FIXME: Should we add VBROADCAST_LOAD isel patterns for pre-AVX?
15073 if (Opcode == X86ISD::VBROADCAST) {
15074 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
15075 SDValue Ops[] = {Ld->getChain(), NewAddr};
15076 V = DAG.getMemIntrinsicNode(
15077 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SVT,
15078 DAG.getMachineFunction().getMachineMemOperand(
15079 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
15080 DAG.makeEquivalentMemoryOrdering(Ld, V);
15081 return DAG.getBitcast(VT, V);
15082 }
15083 assert(SVT == MVT::f64 && "Unexpected VT!");
15084 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
15085 DAG.getMachineFunction().getMachineMemOperand(
15086 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
15087 DAG.makeEquivalentMemoryOrdering(Ld, V);
15088 } else if (!BroadcastFromReg) {
15089 // We can't broadcast from a vector register.
15090 return SDValue();
15091 } else if (BitOffset != 0) {
15092 // We can only broadcast from the zero-element of a vector register,
15093 // but it can be advantageous to broadcast from the zero-element of a
15094 // subvector.
15095 if (!VT.is256BitVector() && !VT.is512BitVector())
15096 return SDValue();
15097
15098 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
15099 if (VT == MVT::v4f64 || VT == MVT::v4i64)
15100 return SDValue();
15101
15102 // Only broadcast the zero-element of a 128-bit subvector.
15103 if ((BitOffset % 128) != 0)
15104 return SDValue();
15105
15106 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
15107 "Unexpected bit-offset");
15108 assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
15109 "Unexpected vector size");
15110 unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
15111 V = extract128BitVector(V, ExtractIdx, DAG, DL);
15112 }
15113
15114 // On AVX we can use VBROADCAST directly for scalar sources.
15115 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) {
15116 V = DAG.getBitcast(MVT::f64, V);
15117 if (Subtarget.hasAVX()) {
15118 V = DAG.getNode(X86ISD::VBROADCAST, DL, MVT::v2f64, V);
15119 return DAG.getBitcast(VT, V);
15120 }
15121 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V);
15122 }
15123
15124 // If this is a scalar, do the broadcast on this type and bitcast.
15125 if (!V.getValueType().isVector()) {
15126 assert(V.getScalarValueSizeInBits() == NumEltBits &&
15127 "Unexpected scalar size");
15128 MVT BroadcastVT = MVT::getVectorVT(V.getSimpleValueType(),
15129 VT.getVectorNumElements());
15130 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
15131 }
15132
15133 // We only support broadcasting from 128-bit vectors to minimize the
15134 // number of patterns we need to deal with in isel. So extract down to
15135 // 128-bits, removing as many bitcasts as possible.
15136 if (V.getValueSizeInBits() > 128)
15137 V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);
15138
15139 // Otherwise cast V to a vector with the same element type as VT, but
15140 // possibly narrower than VT. Then perform the broadcast.
15141 unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
15142 MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), NumSrcElts);
15143 return DAG.getNode(Opcode, DL, VT, DAG.getBitcast(CastVT, V));
15144}
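
An illustrative sketch (not the LLVM implementation; splatIndex here only mirrors the intent of getSplatIndex) of the splat test that gates the broadcast lowering above: every defined mask element must name the same source element, and the bit offset of that element is simply its index times the element width.

#include <cassert>
#include <vector>

// Return the common index if the mask is a splat, otherwise -1.
static int splatIndex(const std::vector<int> &Mask) {
  int Idx = -1;
  for (int M : Mask) {
    if (M < 0)
      continue;               // undef lanes do not constrain the splat
    if (Idx >= 0 && M != Idx)
      return -1;              // two different source elements -> not a broadcast
    Idx = M;
  }
  return Idx;
}

int main() {
  assert(splatIndex({2, 2, -1, 2}) == 2); // broadcast of element 2
  assert(splatIndex({0, 1, 0, 1}) == -1); // not a splat
  int NumEltBits = 32;                    // e.g. v4i32
  assert(splatIndex({2, 2, -1, 2}) * NumEltBits == 64); // starting bit offset of the element
  return 0;
}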
15145
15146// Check for whether we can use INSERTPS to perform the shuffle. We only use
15147// INSERTPS when the V1 elements are already in the correct locations
15148// because otherwise we can just always use two SHUFPS instructions which
15149// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
15150// perform INSERTPS if a single V1 element is out of place and all V2
15151// elements are zeroable.
15152static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2,
15153 unsigned &InsertPSMask,
15154 const APInt &Zeroable,
15155 ArrayRef<int> Mask, SelectionDAG &DAG) {
15156 assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
15157 assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
15158 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
15159
15160 // Attempt to match INSERTPS with one element from VA or VB being
15161 // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
15162 // are updated.
15163 auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
15164 ArrayRef<int> CandidateMask) {
15165 unsigned ZMask = 0;
15166 int VADstIndex = -1;
15167 int VBDstIndex = -1;
15168 bool VAUsedInPlace = false;
15169
15170 for (int i = 0; i < 4; ++i) {
15171 // Synthesize a zero mask from the zeroable elements (includes undefs).
15172 if (Zeroable[i]) {
15173 ZMask |= 1 << i;
15174 continue;
15175 }
15176
15177 // Flag if we use any VA inputs in place.
15178 if (i == CandidateMask[i]) {
15179 VAUsedInPlace = true;
15180 continue;
15181 }
15182
15183 // We can only insert a single non-zeroable element.
15184 if (VADstIndex >= 0 || VBDstIndex >= 0)
15185 return false;
15186
15187 if (CandidateMask[i] < 4) {
15188 // VA input out of place for insertion.
15189 VADstIndex = i;
15190 } else {
15191 // VB input for insertion.
15192 VBDstIndex = i;
15193 }
15194 }
15195
15196 // Don't bother if we have no (non-zeroable) element for insertion.
15197 if (VADstIndex < 0 && VBDstIndex < 0)
15198 return false;
15199
15200 // Determine element insertion src/dst indices. The src index is from the
15201 // start of the inserted vector, not the start of the concatenated vector.
15202 unsigned VBSrcIndex = 0;
15203 if (VADstIndex >= 0) {
15204 // If we have a VA input out of place, we use VA as the V2 element
15205 // insertion and don't use the original V2 at all.
15206 VBSrcIndex = CandidateMask[VADstIndex];
15207 VBDstIndex = VADstIndex;
15208 VB = VA;
15209 } else {
15210 VBSrcIndex = CandidateMask[VBDstIndex] - 4;
15211 }
15212
15213 // If no V1 inputs are used in place, then the result is created only from
15214 // the zero mask and the V2 insertion - so remove V1 dependency.
15215 if (!VAUsedInPlace)
15216 VA = DAG.getUNDEF(MVT::v4f32);
15217
15218 // Update V1, V2 and InsertPSMask accordingly.
15219 V1 = VA;
15220 V2 = VB;
15221
15222 // Insert the V2 element into the desired position.
15223 InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
15224 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
15225 return true;
15226 };
15227
15228 if (matchAsInsertPS(V1, V2, Mask))
15229 return true;
15230
15231 // Commute and try again.
15232 SmallVector<int, 4> CommutedMask(Mask);
15233 ShuffleVectorSDNode::commuteMask(CommutedMask);
15234 if (matchAsInsertPS(V2, V1, CommutedMask))
15235 return true;
15236
15237 return false;
15238}
15239
15240static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2,
15241 ArrayRef<int> Mask, const APInt &Zeroable,
15242 SelectionDAG &DAG) {
15243 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
15244 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
15245
15246 // Attempt to match the insertps pattern.
15247 unsigned InsertPSMask = 0;
15248 if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
15249 return SDValue();
15250
15251 // Insert the V2 element into the desired position.
15252 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
15253 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
15254}
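
A scalar model (illustrative, not LLVM code) of the INSERTPS immediate built above: bits [7:6] select the source element of V2, bits [5:4] select the destination slot in V1, and bits [3:0] are a zero mask applied to the result after the insertion.

#include <array>
#include <cassert>

static std::array<float, 4> insertps(std::array<float, 4> V1,
                                     const std::array<float, 4> &V2,
                                     unsigned Imm) {
  unsigned SrcIdx = (Imm >> 6) & 0x3;
  unsigned DstIdx = (Imm >> 4) & 0x3;
  unsigned ZMask = Imm & 0xF;
  V1[DstIdx] = V2[SrcIdx];       // insert the chosen V2 element
  for (unsigned i = 0; i < 4; ++i)
    if (ZMask & (1u << i))
      V1[i] = 0.0f;              // then zero the masked lanes
  return V1;
}

int main() {
  std::array<float, 4> A = {1, 2, 3, 4}, B = {5, 6, 7, 8};
  // Insert B[2] into slot 1 and zero slot 3: Imm = (2 << 6) | (1 << 4) | 0b1000.
  std::array<float, 4> R = insertps(A, B, (2u << 6) | (1u << 4) | 0x8);
  assert(R[0] == 1 && R[1] == 7 && R[2] == 3 && R[3] == 0);
  return 0;
}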
15255
15256/// Handle lowering of 2-lane 64-bit floating point shuffles.
15257///
15258/// This is the basis function for the 2-lane 64-bit shuffles as we have full
15259/// support for floating point shuffles but not integer shuffles. These
15260/// instructions will incur a domain crossing penalty on some chips though so
15261/// it is better to avoid lowering through this for integer vectors where
15262/// possible.
15263static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15264 const APInt &Zeroable, SDValue V1, SDValue V2,
15265 const X86Subtarget &Subtarget,
15266 SelectionDAG &DAG) {
15267 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
15268 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
15269 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
15270
15271 if (V2.isUndef()) {
15272 // Check for being able to broadcast a single element.
15273 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2,
15274 Mask, Subtarget, DAG))
15275 return Broadcast;
15276
15277 // Straight shuffle of a single input vector. Simulate this by using the
15278 // single input as both of the "inputs" to this instruction.
15279 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
15280
15281 if (Subtarget.hasAVX()) {
15282 // If we have AVX, we can use VPERMILPS which will allow folding a load
15283 // into the shuffle.
15284 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
15285 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
15286 }
15287
15288 return DAG.getNode(
15289 X86ISD::SHUFP, DL, MVT::v2f64,
15290 Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
15291 Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
15292 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
15293 }
15294 assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
15295 assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
15296 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
15297 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
15298
15299 if (Subtarget.hasAVX2())
15300 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
15301 return Extract;
15302
15303 // When loading a scalar and then shuffling it into a vector we can often do
15304 // the insertion cheaply.
15305 if (SDValue Insertion = lowerShuffleAsElementInsertion(
15306 DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
15307 return Insertion;
15308 // Try inverting the insertion since for v2 masks it is easy to do and we
15309 // can't reliably sort the mask one way or the other.
15310 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
15311 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
15312 if (SDValue Insertion = lowerShuffleAsElementInsertion(
15313 DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
15314 return Insertion;
15315
15316 // Try to use one of the special instruction patterns to handle two common
15317 // blend patterns if a zero-blend above didn't work.
15318 if (isShuffleEquivalent(Mask, {0, 3}, V1, V2) ||
15319 isShuffleEquivalent(Mask, {1, 3}, V1, V2))
15320 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
15321 // We can either use a special instruction to load over the low double or
15322 // to move just the low double.
15323 return DAG.getNode(
15324 X86ISD::MOVSD, DL, MVT::v2f64, V2,
15325 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
15326
15327 if (Subtarget.hasSSE41())
15328 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
15329 Zeroable, Subtarget, DAG))
15330 return Blend;
15331
15332 // Use dedicated unpack instructions for masks that match their pattern.
15333 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
15334 return V;
15335
15336 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
15337 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
15338 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
15339}
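
A scalar model (illustrative only) of the SHUFPD immediate computed above: bit 0 picks which element of the first source goes to lane 0, bit 1 picks which element of the second source goes to lane 1, which matches both the single-input and the final two-input immediate calculations in this function.

#include <array>
#include <cassert>

static std::array<double, 2> shufpd(const std::array<double, 2> &A,
                                    const std::array<double, 2> &B,
                                    unsigned Imm) {
  return {A[Imm & 1], B[(Imm >> 1) & 1]};
}

int main() {
  std::array<double, 2> V1 = {1.0, 2.0}, V2 = {3.0, 4.0};
  // Two-input mask {1, 2}: Imm = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1) = 1.
  std::array<double, 2> R = shufpd(V1, V2, 0x1);
  assert(R[0] == 2.0 && R[1] == 3.0);
  // Single-input mask {1, 0} with V1 used as both operands: Imm = 1.
  std::array<double, 2> S = shufpd(V1, V1, 0x1);
  assert(S[0] == 2.0 && S[1] == 1.0);
  return 0;
}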
15340
15341/// Handle lowering of 2-lane 64-bit integer shuffles.
15342///
15343/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
15344/// the integer unit to minimize domain crossing penalties. However, for blends
15345/// it falls back to the floating point shuffle operation with appropriate bit
15346/// casting.
15347static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15348 const APInt &Zeroable, SDValue V1, SDValue V2,
15349 const X86Subtarget &Subtarget,
15350 SelectionDAG &DAG) {
15351 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
15352 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
15353 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
15354
15355 if (V2.isUndef()) {
15356 // Check for being able to broadcast a single element.
15357 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2,
15358 Mask, Subtarget, DAG))
15359 return Broadcast;
15360
15361 // Straight shuffle of a single input vector. For everything from SSE2
15362 // onward this has a single fast instruction with no scary immediates.
15363 // We have to map the mask as it is actually a v4i32 shuffle instruction.
15364 V1 = DAG.getBitcast(MVT::v4i32, V1);
15365 int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2),
15366 Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1),
15367 Mask[1] < 0 ? -1 : (Mask[1] * 2),
15368 Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)};
15369 return DAG.getBitcast(
15370 MVT::v2i64,
15371 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
15372 getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
15373 }
15374 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
15375 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
15376 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
15377 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
15378
15379 if (Subtarget.hasAVX2())
15380 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
15381 return Extract;
15382
15383 // Try to use shift instructions.
15384 if (SDValue Shift =
15385 lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget,
15386 DAG, /*BitwiseOnly*/ false))
15387 return Shift;
15388
15389 // When loading a scalar and then shuffling it into a vector we can often do
15390 // the insertion cheaply.
15391 if (SDValue Insertion = lowerShuffleAsElementInsertion(
15392 DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
15393 return Insertion;
15394 // Try inverting the insertion since for v2 masks it is easy to do and we
15395 // can't reliably sort the mask one way or the other.
15396 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
15397 if (SDValue Insertion = lowerShuffleAsElementInsertion(
15398 DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
15399 return Insertion;
15400
15401 // We have different paths for blend lowering, but they all must use the
15402 // *exact* same predicate.
15403 bool IsBlendSupported = Subtarget.hasSSE41();
15404 if (IsBlendSupported)
15405 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
15406 Zeroable, Subtarget, DAG))
15407 return Blend;
15408
15409 // Use dedicated unpack instructions for masks that match their pattern.
15410 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
15411 return V;
15412
15413 // Try to use byte rotation instructions.
15414 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
15415 if (Subtarget.hasSSSE3()) {
15416 if (Subtarget.hasVLX())
15417 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v2i64, V1, V2, Mask,
15418 Subtarget, DAG))
15419 return Rotate;
15420
15421 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask,
15422 Subtarget, DAG))
15423 return Rotate;
15424 }
15425
15426 // If we have direct support for blends, we should lower by decomposing into
15427 // a permute. That will be faster than the domain cross.
15428 if (IsBlendSupported)
15429 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v2i64, V1, V2, Mask,
15430 Subtarget, DAG);
15431
15432 // We implement this with SHUFPD which is pretty lame because it will likely
15433 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
15434 // However, all the alternatives are still more cycles and newer chips don't
15435 // have this problem. It would be really nice if x86 had better shuffles here.
15436 V1 = DAG.getBitcast(MVT::v2f64, V1);
15437 V2 = DAG.getBitcast(MVT::v2f64, V2);
15438 return DAG.getBitcast(MVT::v2i64,
15439 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
15440}
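
A scalar check (illustrative, not LLVM code) of the single-input path above: a 64-bit lane shuffle is mapped to a 32-bit PSHUFD mask by widening each index m to the pair {2m, 2m+1}.

#include <array>
#include <cassert>
#include <cstdint>

int main() {
  std::array<int, 2> Mask = {1, 0};                          // swap the two 64-bit lanes
  std::array<int, 4> Widened = {Mask[0] * 2, Mask[0] * 2 + 1,
                                Mask[1] * 2, Mask[1] * 2 + 1}; // {2, 3, 0, 1}

  // View a v2i64 as v4i32, apply the widened mask, and confirm the lanes swap.
  std::array<uint32_t, 4> V = {0xA0, 0xA1, 0xB0, 0xB1};      // lane0 = A0:A1, lane1 = B0:B1
  std::array<uint32_t, 4> R;
  for (int i = 0; i < 4; ++i)
    R[i] = V[Widened[i]];
  assert((R == std::array<uint32_t, 4>{0xB0, 0xB1, 0xA0, 0xA1}));
  return 0;
}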
15441
15442/// Lower a vector shuffle using the SHUFPS instruction.
15443///
15444/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
15445/// It makes no assumptions about whether this is the *best* lowering, it simply
15446/// uses it.
15447static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
15448 ArrayRef<int> Mask, SDValue V1,
15449 SDValue V2, SelectionDAG &DAG) {
15450 SDValue LowV = V1, HighV = V2;
15451 SmallVector<int, 4> NewMask(Mask);
15452 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
15453
15454 if (NumV2Elements == 1) {
15455 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
15456
15457 // Compute the index adjacent to V2Index and in the same half by toggling
15458 // the low bit.
15459 int V2AdjIndex = V2Index ^ 1;
15460
15461 if (Mask[V2AdjIndex] < 0) {
15462 // Handles all the cases where we have a single V2 element and an undef.
15463 // This will only ever happen in the high lanes because we commute the
15464 // vector otherwise.
15465 if (V2Index < 2)
15466 std::swap(LowV, HighV);
15467 NewMask[V2Index] -= 4;
15468 } else {
15469 // Handle the case where the V2 element ends up adjacent to a V1 element.
15470 // To make this work, blend them together as the first step.
15471 int V1Index = V2AdjIndex;
15472 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
15473 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
15474 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
15475
15476 // Now proceed to reconstruct the final blend as we have the necessary
15477 // high or low half formed.
15478 if (V2Index < 2) {
15479 LowV = V2;
15480 HighV = V1;
15481 } else {
15482 HighV = V2;
15483 }
15484 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
15485 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
15486 }
15487 } else if (NumV2Elements == 2) {
15488 if (Mask[0] < 4 && Mask[1] < 4) {
15489 // Handle the easy case where we have V1 in the low lanes and V2 in the
15490 // high lanes.
15491 NewMask[2] -= 4;
15492 NewMask[3] -= 4;
15493 } else if (Mask[2] < 4 && Mask[3] < 4) {
15494 // We also handle the reversed case because this utility may get called
15495 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
15496 // arrange things in the right direction.
15497 NewMask[0] -= 4;
15498 NewMask[1] -= 4;
15499 HighV = V1;
15500 LowV = V2;
15501 } else {
15502 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
15503 // trying to place elements directly, just blend them and set up the final
15504 // shuffle to place them.
15505
15506 // The first two blend mask elements are for V1, the second two are for
15507 // V2.
15508 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
15509 Mask[2] < 4 ? Mask[2] : Mask[3],
15510 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
15511 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
15512 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
15513 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
15514
15515 // Now we do a normal shuffle of V1 by giving V1 as both operands to
15516 // a blend.
15517 LowV = HighV = V1;
15518 NewMask[0] = Mask[0] < 4 ? 0 : 2;
15519 NewMask[1] = Mask[0] < 4 ? 2 : 0;
15520 NewMask[2] = Mask[2] < 4 ? 1 : 3;
15521 NewMask[3] = Mask[2] < 4 ? 3 : 1;
15522 }
15523 } else if (NumV2Elements == 3) {
15524 // Ideally canonicalizeShuffleMaskWithCommute should have caught this, but
15525 // we can get here due to other paths (e.g. repeated mask matching) that we
15526 // don't want to do another round of lowerVECTOR_SHUFFLE.
15527 ShuffleVectorSDNode::commuteMask(NewMask);
15528 return lowerShuffleWithSHUFPS(DL, VT, NewMask, V2, V1, DAG);
15529 }
15530 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
15531 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
15532}
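
A scalar model (illustrative only; imm8ForMask merely mirrors what getV4X86ShuffleImm8ForMask produces) of the SHUFPS lowering above: the imm8 packs four 2-bit indices low to high, and SHUFPS takes its first two result lanes from the first operand and its last two from the second operand.

#include <array>
#include <cassert>

static unsigned imm8ForMask(const std::array<int, 4> &M) {
  return (M[0] & 3) | ((M[1] & 3) << 2) | ((M[2] & 3) << 4) | ((M[3] & 3) << 6);
}

static std::array<float, 4> shufps(const std::array<float, 4> &A,
                                   const std::array<float, 4> &B, unsigned Imm) {
  return {A[Imm & 3], A[(Imm >> 2) & 3], B[(Imm >> 4) & 3], B[(Imm >> 6) & 3]};
}

int main() {
  std::array<float, 4> V1 = {0, 1, 2, 3}, V2 = {4, 5, 6, 7};
  // Mask {2, 0, 5, 7}: low half reads V1, high half reads V2 (a "single SHUFPS" mask),
  // so after subtracting 4 from the high-half indices the immediate is built directly.
  std::array<int, 4> NewMask = {2, 0, 5 - 4, 7 - 4};
  std::array<float, 4> R = shufps(V1, V2, imm8ForMask(NewMask));
  assert((R == std::array<float, 4>{2, 0, 5, 7}));
  return 0;
}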
15533
15534/// Lower 4-lane 32-bit floating point shuffles.
15535///
15536/// Uses instructions exclusively from the floating point unit to minimize
15537/// domain crossing penalties, as these are sufficient to implement all v4f32
15538/// shuffles.
15539static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15540 const APInt &Zeroable, SDValue V1, SDValue V2,
15541 const X86Subtarget &Subtarget,
15542 SelectionDAG &DAG) {
15543 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
15544 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
15545 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
15546
15547 if (Subtarget.hasSSE41())
15548 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
15549 Zeroable, Subtarget, DAG))
15550 return Blend;
15551
15552 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
15553
15554 if (NumV2Elements == 0) {
15555 // Check for being able to broadcast a single element.
15556 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2,
15557 Mask, Subtarget, DAG))
15558 return Broadcast;
15559
15560 // Use even/odd duplicate instructions for masks that match their pattern.
15561 if (Subtarget.hasSSE3()) {
15562 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
15563 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
15564 if (isShuffleEquivalent(Mask, {1, 1, 3, 3}, V1, V2))
15565 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
15566 }
15567
15568 if (Subtarget.hasAVX()) {
15569 // If we have AVX, we can use VPERMILPS which will allow folding a load
15570 // into the shuffle.
15571 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
15572 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
15573 }
15574
15575 // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
15576 // in SSE1 because otherwise they are widened to v2f64 and never get here.
15577 if (!Subtarget.hasSSE2()) {
15578 if (isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2))
15579 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
15580 if (isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1, V2))
15581 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
15582 }
15583
15584 // Otherwise, use a straight shuffle of a single input vector. We pass the
15585 // input vector to both operands to simulate this with a SHUFPS.
15586 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
15587 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
15588 }
15589
15590 if (Subtarget.hasSSE2())
15591 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
15592 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) {
15593 ZExt = DAG.getBitcast(MVT::v4f32, ZExt);
15594 return ZExt;
15595 }
15596
15597 if (Subtarget.hasAVX2())
15598 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
15599 return Extract;
15600
15601 // There are special ways we can lower some single-element blends. However, we
15602 // have custom ways we can lower more complex single-element blends below that
15603 // we defer to if both this and BLENDPS fail to match, so restrict this to
15604 // when the V2 input is targeting element 0 of the mask -- that is the fast
15605 // case here.
15606 if (NumV2Elements == 1 && Mask[0] >= 4)
15607 if (SDValue V = lowerShuffleAsElementInsertion(
15608 DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
15609 return V;
15610
15611 if (Subtarget.hasSSE41()) {
15612 // Use INSERTPS if we can complete the shuffle efficiently.
15613 if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
15614 return V;
15615
15616 if (!isSingleSHUFPSMask(Mask))
15617 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1,
15618 V2, Mask, DAG))
15619 return BlendPerm;
15620 }
15621
15622 // Use low/high mov instructions. These are only valid in SSE1 because
15623 // otherwise they are widened to v2f64 and never get here.
15624 if (!Subtarget.hasSSE2()) {
15625 if (isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2))
15626 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
15627 if (isShuffleEquivalent(Mask, {2, 3, 6, 7}, V1, V2))
15628 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
15629 }
15630
15631 // Use dedicated unpack instructions for masks that match their pattern.
15632 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
15633 return V;
15634
15635 // Otherwise fall back to a SHUFPS lowering strategy.
15636 return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
15637}
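
A scalar model (illustrative, not LLVM code) of the even/odd duplicate fast paths above: MOVSLDUP matches the single-input mask {0, 0, 2, 2} and MOVSHDUP matches {1, 1, 3, 3}.

#include <array>
#include <cassert>

static std::array<float, 4> movsldup(const std::array<float, 4> &V) {
  return {V[0], V[0], V[2], V[2]}; // duplicate the even elements
}
static std::array<float, 4> movshdup(const std::array<float, 4> &V) {
  return {V[1], V[1], V[3], V[3]}; // duplicate the odd elements
}

int main() {
  std::array<float, 4> V = {10, 11, 12, 13};
  assert((movsldup(V) == std::array<float, 4>{10, 10, 12, 12}));
  assert((movshdup(V) == std::array<float, 4>{11, 11, 13, 13}));
  return 0;
}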
15638
15639/// Lower 4-lane i32 vector shuffles.
15640///
15641/// We try to handle these with integer-domain shuffles where we can, but for
15642/// blends we use the floating point domain blend instructions.
15643static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15644 const APInt &Zeroable, SDValue V1, SDValue V2,
15645 const X86Subtarget &Subtarget,
15646 SelectionDAG &DAG) {
15647 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
15648 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
15649 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
15650
15651 // Whenever we can lower this as a zext, that instruction is strictly faster
15652 // than any alternative. It also allows us to fold memory operands into the
15653 // shuffle in many cases.
15654 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask,
15655 Zeroable, Subtarget, DAG))
15656 return ZExt;
15657
15658 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
15659
15660 // Try to use shift instructions if fast.
15661 if (Subtarget.preferLowerShuffleAsShift()) {
15662 if (SDValue Shift =
15663 lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable,
15664 Subtarget, DAG, /*BitwiseOnly*/ true))
15665 return Shift;
15666 if (NumV2Elements == 0)
15667 if (SDValue Rotate =
15668 lowerShuffleAsBitRotate(DL, MVT::v4i32, V1, Mask, Subtarget, DAG))
15669 return Rotate;
15670 }
15671
15672 if (NumV2Elements == 0) {
15673 // Try to use broadcast unless the mask only has one non-undef element.
15674 if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) {
15675 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
15676 Mask, Subtarget, DAG))
15677 return Broadcast;
15678 }
15679
15680 // Straight shuffle of a single input vector. For everything from SSE2
15681 // onward this has a single fast instruction with no scary immediates.
15682 // We coerce the shuffle pattern to be compatible with UNPCK instructions
15683 // but we aren't actually going to use the UNPCK instruction because doing
15684 // so prevents folding a load into this instruction or making a copy.
15685 const int UnpackLoMask[] = {0, 0, 1, 1};
15686 const int UnpackHiMask[] = {2, 2, 3, 3};
15687 if (isShuffleEquivalent(Mask, {0, 0, 1, 1}, V1, V2))
15688 Mask = UnpackLoMask;
15689 else if (isShuffleEquivalent(Mask, {2, 2, 3, 3}, V1, V2))
15690 Mask = UnpackHiMask;
15691
15692 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
15693 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
15694 }
15695
15696 if (Subtarget.hasAVX2())
15697 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
15698 return Extract;
15699
15700 // Try to use shift instructions.
15701 if (SDValue Shift =
15702 lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget,
15703 DAG, /*BitwiseOnly*/ false))
15704 return Shift;
15705
15706 // There are special ways we can lower some single-element blends.
15707 if (NumV2Elements == 1)
15708 if (SDValue V = lowerShuffleAsElementInsertion(
15709 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
15710 return V;
15711
15712 // We have different paths for blend lowering, but they all must use the
15713 // *exact* same predicate.
15714 bool IsBlendSupported = Subtarget.hasSSE41();
15715 if (IsBlendSupported)
15716 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
15717 Zeroable, Subtarget, DAG))
15718 return Blend;
15719
15720 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
15721 Zeroable, Subtarget, DAG))
15722 return Masked;
15723
15724 // Use dedicated unpack instructions for masks that match their pattern.
15725 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
15726 return V;
15727
15728 // Try to use byte rotation instructions.
15729 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
15730 if (Subtarget.hasSSSE3()) {
15731 if (Subtarget.hasVLX())
15732 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask,
15733 Subtarget, DAG))
15734 return Rotate;
15735
15736 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
15737 Subtarget, DAG))
15738 return Rotate;
15739 }
15740
15741 // Assume that a single SHUFPS is faster than an alternative sequence of
15742 // multiple instructions (even if the CPU has a domain penalty).
15743 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
15744 if (!isSingleSHUFPSMask(Mask)) {
15745 // If we have direct support for blends, we should lower by decomposing into
15746 // a permute. That will be faster than the domain cross.
15747 if (IsBlendSupported)
15748 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i32, V1, V2, Mask,
15749 Subtarget, DAG);
15750
15751 // Try to lower by permuting the inputs into an unpack instruction.
15752 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2,
15753 Mask, Subtarget, DAG))
15754 return Unpack;
15755 }
15756
15757 // We implement this with SHUFPS because it can blend from two vectors.
15758 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
15759 // up the inputs, bypassing domain shift penalties that we would incur if we
15760 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
15761 // relevant.
15762 SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
15763 SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
15764 SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
15765 return DAG.getBitcast(MVT::v4i32, ShufPS);
15766}
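
A small arithmetic check (illustrative only; imm8ForMask is a stand-in for getV4X86ShuffleImm8ForMask) of the single-input v4i32 path above: even the unpack-shaped masks are emitted as PSHUFD, so they reduce to plain 8-bit immediates.

#include <array>
#include <cassert>

static unsigned imm8ForMask(const std::array<int, 4> &M) {
  return (M[0] & 3) | ((M[1] & 3) << 2) | ((M[2] & 3) << 4) | ((M[3] & 3) << 6);
}

int main() {
  assert(imm8ForMask({0, 0, 1, 1}) == 0x50); // "unpacklo"-shaped mask
  assert(imm8ForMask({2, 2, 3, 3}) == 0xFA); // "unpackhi"-shaped mask
  return 0;
}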
15767
15768/// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
15769/// shuffle lowering, and the most complex part.
15770///
15771/// The lowering strategy is to try to form pairs of input lanes which are
15772/// targeted at the same half of the final vector, and then use a dword shuffle
15773/// to place them onto the right half, and finally unpack the paired lanes into
15774/// their final position.
15775///
15776/// The exact breakdown of how to form these dword pairs and align them on the
15777/// correct sides is really tricky. See the comments within the function for
15778/// more of the details.
15779///
15780/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
15781/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
15782/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
15783/// vector, form the analogous 128-bit 8-element Mask.
15784static SDValue lowerV8I16GeneralSingleInputShuffle(
15785 const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
15786 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
15787 assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
15788 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
15789
15790 assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
15791 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
15792 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
15793
15794 // Attempt to directly match PSHUFLW or PSHUFHW.
15795 if (isUndefOrInRange(LoMask, 0, 4) &&
15796 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
15797 return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
15798 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
15799 }
15800 if (isUndefOrInRange(HiMask, 4, 8) &&
15801 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
15802 for (int i = 0; i != 4; ++i)
15803 HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
15804 return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
15805 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
15806 }
15807
15808 SmallVector<int, 4> LoInputs;
15809 copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
15810 array_pod_sort(LoInputs.begin(), LoInputs.end());
15811 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
15812 SmallVector<int, 4> HiInputs;
15813 copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
15814 array_pod_sort(HiInputs.begin(), HiInputs.end());
15815 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
15816 int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();
15817 int NumHToL = LoInputs.size() - NumLToL;
15818 int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();
15819 int NumHToH = HiInputs.size() - NumLToH;
15820 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
15821 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
15822 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
15823 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
15824
15825 // If we are shuffling values from one half - check how many different DWORD
15826 // pairs we need to create. If only 1 or 2 then we can perform this as a
15827 // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
15828 auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
15829 ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
15830 V = DAG.getNode(ShufWOp, DL, VT, V,
15831 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
15832 V = DAG.getBitcast(PSHUFDVT, V);
15833 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
15834 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
15835 return DAG.getBitcast(VT, V);
15836 };
15837
15838 if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
15839 int PSHUFDMask[4] = { -1, -1, -1, -1 };
15840 SmallVector<std::pair<int, int>, 4> DWordPairs;
15841 int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
15842
15843 // Collect the different DWORD pairs.
15844 for (int DWord = 0; DWord != 4; ++DWord) {
15845 int M0 = Mask[2 * DWord + 0];
15846 int M1 = Mask[2 * DWord + 1];
15847 M0 = (M0 >= 0 ? M0 % 4 : M0);
15848 M1 = (M1 >= 0 ? M1 % 4 : M1);
15849 if (M0 < 0 && M1 < 0)
15850 continue;
15851
15852 bool Match = false;
15853 for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
15854 auto &DWordPair = DWordPairs[j];
15855 if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
15856 (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
15857 DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
15858 DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
15859 PSHUFDMask[DWord] = DOffset + j;
15860 Match = true;
15861 break;
15862 }
15863 }
15864 if (!Match) {
15865 PSHUFDMask[DWord] = DOffset + DWordPairs.size();
15866 DWordPairs.push_back(std::make_pair(M0, M1));
15867 }
15868 }
15869
15870 if (DWordPairs.size() <= 2) {
15871 DWordPairs.resize(2, std::make_pair(-1, -1));
15872 int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
15873 DWordPairs[1].first, DWordPairs[1].second};
15874 if ((NumHToL + NumHToH) == 0)
15875 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
15876 if ((NumLToL + NumLToH) == 0)
15877 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
15878 }
15879 }
15880
15881 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
15882 // such inputs we can swap two of the dwords across the half mark and end up
15883 // with <=2 inputs to each half in each half. Once there, we can fall through
15884 // to the generic code below. For example:
15885 //
15886 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
15887 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
15888 //
15889 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
15890 // and an existing 2-into-2 on the other half. In this case we may have to
15891 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
15892 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
15893 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
15894 // because any other situation (including a 3-into-1 or 1-into-3 in the other
15895 // half than the one we target for fixing) will be fixed when we re-enter this
15896 // path. We will also combine away any sequence of PSHUFD instructions that
15897 // result into a single instruction. Here is an example of the tricky case:
15898 //
15899 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
15900 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
15901 //
15902 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
15903 //
15904 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
15905 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
15906 //
15907 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
15908 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
15909 //
15910 // The result is fine to be handled by the generic logic.
15911 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
15912 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
15913 int AOffset, int BOffset) {
15914 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
15915        "Must call this with A having 3 or 1 inputs from the A half.");
15916 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
15917        "Must call this with B having 1 or 3 inputs from the B half.");
15918 assert(AToAInputs.size() + BToAInputs.size() == 4 &&
15919        "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
15920
15921 bool ThreeAInputs = AToAInputs.size() == 3;
15922
15923 // Compute the index of dword with only one word among the three inputs in
15924 // a half by taking the sum of the half with three inputs and subtracting
15925 // the sum of the actual three inputs. The difference is the remaining
15926 // slot.
15927 int ADWord = 0, BDWord = 0;
15928 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
15929 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
15930 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
15931 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
15932 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
15933 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
15934 int TripleNonInputIdx =
15935 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
15936 TripleDWord = TripleNonInputIdx / 2;
15937
15938 // We use xor with one to compute the adjacent DWord to whichever one the
15939 // OneInput is in.
15940 OneInputDWord = (OneInput / 2) ^ 1;
15941
15942 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
15943 // and BToA inputs. If there is also such a problem with the BToB and AToB
15944 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
15945 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
15946 // is essential that we don't *create* a 3<-1 as then we might oscillate.
15947 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
15948 // Compute how many inputs will be flipped by swapping these DWords. We
15949 // need
15950 // to balance this to ensure we don't form a 3-1 shuffle in the other
15951 // half.
15952 int NumFlippedAToBInputs = llvm::count(AToBInputs, 2 * ADWord) +
15953 llvm::count(AToBInputs, 2 * ADWord + 1);
15954 int NumFlippedBToBInputs = llvm::count(BToBInputs, 2 * BDWord) +
15955 llvm::count(BToBInputs, 2 * BDWord + 1);
15956 if ((NumFlippedAToBInputs == 1 &&
15957 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
15958 (NumFlippedBToBInputs == 1 &&
15959 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
15960 // We choose whether to fix the A half or B half based on whether that
15961 // half has zero flipped inputs. At zero, we may not be able to fix it
15962 // with that half. We also bias towards fixing the B half because that
15963 // will more commonly be the high half, and we have to bias one way.
15964 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
15965 ArrayRef<int> Inputs) {
15966 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
15967 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
15968 // Determine whether the free index is in the flipped dword or the
15969 // unflipped dword based on where the pinned index is. We use this bit
15970 // in an xor to conditionally select the adjacent dword.
15971 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
15972 bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
15973 if (IsFixIdxInput == IsFixFreeIdxInput)
15974 FixFreeIdx += 1;
15975 IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
15976 assert(IsFixIdxInput != IsFixFreeIdxInput &&
15977        "We need to be changing the number of flipped inputs!");
15978 int PSHUFHalfMask[] = {0, 1, 2, 3};
15979 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
15980 V = DAG.getNode(
15981 FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
15982 MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
15983 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
15984
15985 for (int &M : Mask)
15986 if (M >= 0 && M == FixIdx)
15987 M = FixFreeIdx;
15988 else if (M >= 0 && M == FixFreeIdx)
15989 M = FixIdx;
15990 };
15991 if (NumFlippedBToBInputs != 0) {
15992 int BPinnedIdx =
15993 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
15994 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
15995 } else {
15996 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
15997 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
15998 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
15999 }
16000 }
16001 }
16002
16003 int PSHUFDMask[] = {0, 1, 2, 3};
16004 PSHUFDMask[ADWord] = BDWord;
16005 PSHUFDMask[BDWord] = ADWord;
16006 V = DAG.getBitcast(
16007 VT,
16008 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
16009 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
16010
16011 // Adjust the mask to match the new locations of A and B.
16012 for (int &M : Mask)
16013 if (M >= 0 && M/2 == ADWord)
16014 M = 2 * BDWord + M % 2;
16015 else if (M >= 0 && M/2 == BDWord)
16016 M = 2 * ADWord + M % 2;
16017
16018 // Recurse back into this routine to re-compute state now that this isn't
16019 // a 3 and 1 problem.
16020 return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG);
16021 };
16022 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
16023 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
16024 if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
16025 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
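The dword swap that balanceSides relies on is a single PSHUFD. As a rough standalone sketch (not part of the file), the [0,2,1,3] dword permutation from the comment block above maps [a,b, c,d, e,f, g,h] to [a,b, e,f, c,d, g,h]; the letter values below are just labels taken from that comment, and the harness is invented for illustration.

    #include <emmintrin.h>
    #include <cstdio>

    int main() {
      __m128i V = _mm_setr_epi16('a', 'b', 'c', 'd', 'e', 'f', 'g', 'h');
      // PSHUFD immediate for the dword order [0,2,1,3]; _MM_SHUFFLE lists the
      // selectors from the highest dword down to the lowest.
      V = _mm_shuffle_epi32(V, _MM_SHUFFLE(3, 1, 2, 0));
      short Out[8];
      _mm_storeu_si128((__m128i *)Out, V);
      for (short W : Out)
        std::printf("%c ", (char)W);  // prints: a b e f c d g h
      std::printf("\n");
      return 0;
    }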
16026
16027 // At this point there are at most two inputs to the low and high halves from
16028 // each half. That means the inputs can always be grouped into dwords and
16029 // those dwords can then be moved to the correct half with a dword shuffle.
16030 // We use at most one low and one high word shuffle to collect these paired
16031 // inputs into dwords, and finally a dword shuffle to place them.
16032 int PSHUFLMask[4] = {-1, -1, -1, -1};
16033 int PSHUFHMask[4] = {-1, -1, -1, -1};
16034 int PSHUFDMask[4] = {-1, -1, -1, -1};
16035
16036 // First fix the masks for all the inputs that are staying in their
16037 // original halves. This will then dictate the targets of the cross-half
16038 // shuffles.
16039 auto fixInPlaceInputs =
16040 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
16041 MutableArrayRef<int> SourceHalfMask,
16042 MutableArrayRef<int> HalfMask, int HalfOffset) {
16043 if (InPlaceInputs.empty())
16044 return;
16045 if (InPlaceInputs.size() == 1) {
16046 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
16047 InPlaceInputs[0] - HalfOffset;
16048 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
16049 return;
16050 }
16051 if (IncomingInputs.empty()) {
16052 // Just fix all of the in place inputs.
16053 for (int Input : InPlaceInputs) {
16054 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
16055 PSHUFDMask[Input / 2] = Input / 2;
16056 }
16057 return;
16058 }
16059
16060 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
16061 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
16062 InPlaceInputs[0] - HalfOffset;
16063 // Put the second input next to the first so that they are packed into
16064 // a dword. We find the adjacent index by toggling the low bit.
16065 int AdjIndex = InPlaceInputs[0] ^ 1;
16066 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
16067 std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
16068 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
16069 };
16070 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
16071 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
16072
16073 // Now gather the cross-half inputs and place them into a free dword of
16074 // their target half.
16075 // FIXME: This operation could almost certainly be simplified dramatically to
16076 // look more like the 3-1 fixing operation.
16077 auto moveInputsToRightHalf = [&PSHUFDMask](
16078 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
16079 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
16080 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
16081 int DestOffset) {
16082 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
16083 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
16084 };
16085 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
16086 int Word) {
16087 int LowWord = Word & ~1;
16088 int HighWord = Word | 1;
16089 return isWordClobbered(SourceHalfMask, LowWord) ||
16090 isWordClobbered(SourceHalfMask, HighWord);
16091 };
16092
16093 if (IncomingInputs.empty())
16094 return;
16095
16096 if (ExistingInputs.empty()) {
16097 // Map any dwords with inputs from them into the right half.
16098 for (int Input : IncomingInputs) {
16099 // If the source half mask maps over the inputs, turn those into
16100 // swaps and use the swapped lane.
16101 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
16102 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
16103 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
16104 Input - SourceOffset;
16105 // We have to swap the uses in our half mask in one sweep.
16106 for (int &M : HalfMask)
16107 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
16108 M = Input;
16109 else if (M == Input)
16110 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
16111 } else {
16112 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
16113        Input - SourceOffset &&
16114        "Previous placement doesn't match!");
16115 }
16116 // Note that this correctly re-maps both when we do a swap and when
16117 // we observe the other side of the swap above. We rely on that to
16118 // avoid swapping the members of the input list directly.
16119 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
16120 }
16121
16122 // Map the input's dword into the correct half.
16123 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
16124 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
16125 else
16126 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
16127        Input / 2 &&
16128        "Previous placement doesn't match!");
16129 }
16130
16131 // And just directly shift any other-half mask elements to be same-half
16132 // as we will have mirrored the dword containing the element into the
16133 // same position within that half.
16134 for (int &M : HalfMask)
16135 if (M >= SourceOffset && M < SourceOffset + 4) {
16136 M = M - SourceOffset + DestOffset;
16137 assert(M >= 0 && "This should never wrap below zero!");
16138 }
16139 return;
16140 }
16141
16142 // Ensure we have the input in a viable dword of its current half. This
16143 // is particularly tricky because the original position may be clobbered
16144 // by inputs being moved and *staying* in that half.
16145 if (IncomingInputs.size() == 1) {
16146 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
16147 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
16148 SourceOffset;
16149 SourceHalfMask[InputFixed - SourceOffset] =
16150 IncomingInputs[0] - SourceOffset;
16151 std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
16152 InputFixed);
16153 IncomingInputs[0] = InputFixed;
16154 }
16155 } else if (IncomingInputs.size() == 2) {
16156 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
16157 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
16158 // We have two non-adjacent or clobbered inputs we need to extract from
16159 // the source half. To do this, we need to map them into some adjacent
16160 // dword slot in the source mask.
16161 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
16162 IncomingInputs[1] - SourceOffset};
16163
16164 // If there is a free slot in the source half mask adjacent to one of
16165 // the inputs, place the other input in it. We use (Index XOR 1) to
16166 // compute an adjacent index.
16167 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
16168 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
16169 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
16170 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
16171 InputsFixed[1] = InputsFixed[0] ^ 1;
16172 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
16173 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
16174 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
16175 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
16176 InputsFixed[0] = InputsFixed[1] ^ 1;
16177 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
16178 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
16179 // The two inputs are in the same DWord but it is clobbered and the
16180 // adjacent DWord isn't used at all. Move both inputs to the free
16181 // slot.
16182 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
16183 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
16184 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
16185 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
16186 } else {
16187 // The only way we hit this point is if there is no clobbering
16188 // (because there are no off-half inputs to this half) and there is no
16189 // free slot adjacent to one of the inputs. In this case, we have to
16190 // swap an input with a non-input.
16191 for (int i = 0; i < 4; ++i)
16192 assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
16193        "We can't handle any clobbers here!");
16194 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
16195        "Cannot have adjacent inputs here!");
16196
16197 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
16198 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
16199
16200 // We also have to update the final source mask in this case because
16201 // it may need to undo the above swap.
16202 for (int &M : FinalSourceHalfMask)
16203 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
16204 M = InputsFixed[1] + SourceOffset;
16205 else if (M == InputsFixed[1] + SourceOffset)
16206 M = (InputsFixed[0] ^ 1) + SourceOffset;
16207
16208 InputsFixed[1] = InputsFixed[0] ^ 1;
16209 }
16210
16211 // Point everything at the fixed inputs.
16212 for (int &M : HalfMask)
16213 if (M == IncomingInputs[0])
16214 M = InputsFixed[0] + SourceOffset;
16215 else if (M == IncomingInputs[1])
16216 M = InputsFixed[1] + SourceOffset;
16217
16218 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
16219 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
16220 }
16221 } else {
16222 llvm_unreachable("Unhandled input size!")::llvm::llvm_unreachable_internal("Unhandled input size!", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 16222)
;
16223 }
16224
16225 // Now hoist the DWord down to the right half.
16226 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
16227 assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
16228 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
16229 for (int &M : HalfMask)
16230 for (int Input : IncomingInputs)
16231 if (M == Input)
16232 M = FreeDWord * 2 + Input % 2;
16233 };
16234 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
16235 /*SourceOffset*/ 4, /*DestOffset*/ 0);
16236 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
16237 /*SourceOffset*/ 0, /*DestOffset*/ 4);
16238
16239 // Now enact all the shuffles we've computed to move the inputs into their
16240 // target half.
16241 if (!isNoopShuffleMask(PSHUFLMask))
16242 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
16243 getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
16244 if (!isNoopShuffleMask(PSHUFHMask))
16245 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
16246 getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
16247 if (!isNoopShuffleMask(PSHUFDMask))
16248 V = DAG.getBitcast(
16249 VT,
16250 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
16251 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
16252
16253 // At this point, each half should contain all its inputs, and we can then
16254 // just shuffle them into their final position.
16255 assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
16256        "Failed to lift all the high half inputs to the low mask!");
16257 assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
16258        "Failed to lift all the low half inputs to the high mask!");
16259
16260 // Do a half shuffle for the low mask.
16261 if (!isNoopShuffleMask(LoMask))
16262 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
16263 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
16264
16265 // Do a half shuffle with the high mask after shifting its values down.
16266 for (int &M : HiMask)
16267 if (M >= 0)
16268 M -= 4;
16269 if (!isNoopShuffleMask(HiMask))
16270 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
16271 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
16272
16273 return V;
16274}
16275
16276/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
16277/// blend if only one input is used.
16278static SDValue lowerShuffleAsBlendOfPSHUFBs(
16279 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
16280 const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
16281 assert(!is128BitLaneCrossingShuffleMask(VT, Mask) &&
16282        "Lane crossing shuffle masks not supported");
16283
16284 int NumBytes = VT.getSizeInBits() / 8;
16285 int Size = Mask.size();
16286 int Scale = NumBytes / Size;
16287
16288 SmallVector<SDValue, 64> V1Mask(NumBytes, DAG.getUNDEF(MVT::i8));
16289 SmallVector<SDValue, 64> V2Mask(NumBytes, DAG.getUNDEF(MVT::i8));
16290 V1InUse = false;
16291 V2InUse = false;
16292
16293 for (int i = 0; i < NumBytes; ++i) {
16294 int M = Mask[i / Scale];
16295 if (M < 0)
16296 continue;
16297
16298 const int ZeroMask = 0x80;
16299 int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask;
16300 int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
16301 if (Zeroable[i / Scale])
16302 V1Idx = V2Idx = ZeroMask;
16303
16304 V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
16305 V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
16306 V1InUse |= (ZeroMask != V1Idx);
16307 V2InUse |= (ZeroMask != V2Idx);
16308 }
16309
16310 MVT ShufVT = MVT::getVectorVT(MVT::i8, NumBytes);
16311 if (V1InUse)
16312 V1 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V1),
16313 DAG.getBuildVector(ShufVT, DL, V1Mask));
16314 if (V2InUse)
16315 V2 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V2),
16316 DAG.getBuildVector(ShufVT, DL, V2Mask));
16317
16318 // If we need shuffled inputs from both, blend the two.
16319 SDValue V;
16320 if (V1InUse && V2InUse)
16321 V = DAG.getNode(ISD::OR, DL, ShufVT, V1, V2);
16322 else
16323 V = V1InUse ? V1 : V2;
16324
16325 // Cast the result back to the correct type.
16326 return DAG.getBitcast(VT, V);
16327}
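To make the shuffle+blend built by lowerShuffleAsBlendOfPSHUFBs concrete, here is a minimal standalone sketch (not part of the file): two PSHUFBs whose byte masks either pick a source byte (0..15) or set bit 7 (the 0x80 ZeroMask) to force a zero lane, OR'd together. The interleaving byte masks and constant inputs are invented for the example; it requires SSSE3 (compile with -mssse3).

    #include <tmmintrin.h>  // _mm_shuffle_epi8 (PSHUFB)
    #include <cstdio>

    int main() {
      __m128i V1 = _mm_set1_epi8(0x11);
      __m128i V2 = _mm_set1_epi8(0x22);

      // Even bytes come from V1, odd bytes from V2; 0x80 zeroes the other lanes.
      const char Z = (char)0x80;
      __m128i M1 = _mm_setr_epi8(0, Z, 2, Z, 4, Z, 6, Z, 8, Z, 10, Z, 12, Z, 14, Z);
      __m128i M2 = _mm_setr_epi8(Z, 1, Z, 3, Z, 5, Z, 7, Z, 9, Z, 11, Z, 13, Z, 15);

      // Shuffle each input separately, then blend with a single OR.
      __m128i Blend = _mm_or_si128(_mm_shuffle_epi8(V1, M1),
                                   _mm_shuffle_epi8(V2, M2));

      unsigned char Out[16];
      _mm_storeu_si128((__m128i *)Out, Blend);
      for (unsigned char B : Out)
        std::printf("%02x ", B);  // prints alternating 11 22 11 22 ...
      std::printf("\n");
      return 0;
    }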
16328
16329/// Generic lowering of 8-lane i16 shuffles.
16330///
16331/// This handles both single-input shuffles and combined shuffle/blends with
16332/// two inputs. The single input shuffles are immediately delegated to
16333/// a dedicated lowering routine.
16334///
16335/// The blends are lowered in one of three fundamental ways. If there are few
16336/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
16337/// of the input is significantly cheaper when lowered as an interleaving of
16338/// the two inputs, try to interleave them. Otherwise, blend the low and high
16339/// halves of the inputs separately (making them have relatively few inputs)
16340/// and then concatenate them.
16341static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16342 const APInt &Zeroable, SDValue V1, SDValue V2,
16343 const X86Subtarget &Subtarget,
16344 SelectionDAG &DAG) {
16345 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
16346 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
16347 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16348
16349 // Whenever we can lower this as a zext, that instruction is strictly faster
16350 // than any alternative.
16351 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask,
16352 Zeroable, Subtarget, DAG))
16353 return ZExt;
16354
16355 // Try to lower using a truncation.
16356 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
16357 Subtarget, DAG))
16358 return V;
16359
16360 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
16361
16362 if (NumV2Inputs == 0) {
16363 // Try to use shift instructions.
16364 if (SDValue Shift =
16365 lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, Zeroable,
16366 Subtarget, DAG, /*BitwiseOnly*/ false))
16367 return Shift;
16368
16369 // Check for being able to broadcast a single element.
16370 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
16371 Mask, Subtarget, DAG))
16372 return Broadcast;
16373
16374 // Try to use bit rotation instructions.
16375 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask,
16376 Subtarget, DAG))
16377 return Rotate;
16378
16379 // Use dedicated unpack instructions for masks that match their pattern.
16380 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
16381 return V;
16382
16383 // Use dedicated pack instructions for masks that match their pattern.
16384 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
16385 Subtarget))
16386 return V;
16387
16388 // Try to use byte rotation instructions.
16389 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask,
16390 Subtarget, DAG))
16391 return Rotate;
16392
16393 // Make a copy of the mask so it can be modified.
16394 SmallVector<int, 8> MutableMask(Mask);
16395 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask,
16396 Subtarget, DAG);
16397 }
16398
16399 assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
16400        "All single-input shuffles should be canonicalized to be V1-input "
16401        "shuffles.");
16402
16403 // Try to use shift instructions.
16404 if (SDValue Shift =
16405 lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget,
16406 DAG, /*BitwiseOnly*/ false))
16407 return Shift;
16408
16409 // See if we can use SSE4A Extraction / Insertion.
16410 if (Subtarget.hasSSE4A())
16411 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
16412 Zeroable, DAG))
16413 return V;
16414
16415 // There are special ways we can lower some single-element blends.
16416 if (NumV2Inputs == 1)
16417 if (SDValue V = lowerShuffleAsElementInsertion(
16418 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
16419 return V;
16420
16421 // We have different paths for blend lowering, but they all must use the
16422 // *exact* same predicate.
16423 bool IsBlendSupported = Subtarget.hasSSE41();
16424 if (IsBlendSupported)
16425 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
16426 Zeroable, Subtarget, DAG))
16427 return Blend;
16428
16429 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
16430 Zeroable, Subtarget, DAG))
16431 return Masked;
16432
16433 // Use dedicated unpack instructions for masks that match their pattern.
16434 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
16435 return V;
16436
16437 // Use dedicated pack instructions for masks that match their pattern.
16438 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
16439 Subtarget))
16440 return V;
16441
16442 // Try to lower using a truncation.
16443 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
16444 Subtarget, DAG))
16445 return V;
16446
16447 // Try to use byte rotation instructions.
16448 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
16449 Subtarget, DAG))
16450 return Rotate;
16451
16452 if (SDValue BitBlend =
16453 lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
16454 return BitBlend;
16455
16456 // Try to use byte shift instructions to mask.
16457 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v8i16, V1, V2, Mask,
16458 Zeroable, Subtarget, DAG))
16459 return V;
16460
16461 // Attempt to lower using compaction, SSE41 is necessary for PACKUSDW.
16462 // We could use SIGN_EXTEND_INREG+PACKSSDW for older targets but this seems to
16463 // be slower than a PSHUFLW+PSHUFHW+PSHUFD chain.
16464 int NumEvenDrops = canLowerByDroppingElements(Mask, true, false);
16465 if ((NumEvenDrops == 1 || NumEvenDrops == 2) && Subtarget.hasSSE41() &&
16466 !Subtarget.hasVLX()) {
16467 // Check if this is part of a 256-bit vector truncation.
16468 if (NumEvenDrops == 2 && Subtarget.hasAVX2() &&
16469 peekThroughBitcasts(V1).getOpcode() == ISD::EXTRACT_SUBVECTOR &&
16470 peekThroughBitcasts(V2).getOpcode() == ISD::EXTRACT_SUBVECTOR) {
16471 SDValue V1V2 = concatSubVectors(V1, V2, DAG, DL);
16472 V1V2 = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1V2,
16473 getZeroVector(MVT::v16i16, Subtarget, DAG, DL),
16474 DAG.getTargetConstant(0xEE, DL, MVT::i8));
16475 V1V2 = DAG.getBitcast(MVT::v8i32, V1V2);
16476 V1 = extract128BitVector(V1V2, 0, DAG, DL);
16477 V2 = extract128BitVector(V1V2, 4, DAG, DL);
16478 } else {
16479 SmallVector<SDValue, 4> DWordClearOps(4,
16480 DAG.getConstant(0, DL, MVT::i32));
16481 for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
16482 DWordClearOps[i] = DAG.getConstant(0xFFFF, DL, MVT::i32);
16483 SDValue DWordClearMask =
16484 DAG.getBuildVector(MVT::v4i32, DL, DWordClearOps);
16485 V1 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V1),
16486 DWordClearMask);
16487 V2 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V2),
16488 DWordClearMask);
16489 }
16490 // Now pack things back together.
16491 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v8i16, V1, V2);
16492 if (NumEvenDrops == 2) {
16493 Result = DAG.getBitcast(MVT::v4i32, Result);
16494 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v8i16, Result, Result);
16495 }
16496 return Result;
16497 }
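The even-drop compaction branch above reduces, for NumEvenDrops == 1, to an AND that clears the upper word of every dword followed by a PACKUSDW. A minimal standalone sketch (not part of the file), assuming SSE4.1 (-msse4.1) and invented input values, keeps the even words of V1:V2, i.e. the mask [0,2,4,6,8,10,12,14]:

    #include <smmintrin.h>  // _mm_packus_epi32 (PACKUSDW)
    #include <cstdio>

    int main() {
      __m128i V1 = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
      __m128i V2 = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);

      // Keep only the low word of each dword (drop the odd words).
      __m128i DWordClear = _mm_set1_epi32(0x0000FFFF);
      __m128i A = _mm_and_si128(V1, DWordClear);
      __m128i B = _mm_and_si128(V2, DWordClear);

      // PACKUSDW saturates each dword to an unsigned word; after the AND the
      // values already fit, so this simply selects the even words of V1 and V2.
      __m128i Packed = _mm_packus_epi32(A, B);

      short Out[8];
      _mm_storeu_si128((__m128i *)Out, Packed);
      for (short W : Out)
        std::printf("%d ", W);  // prints: 0 2 4 6 8 10 12 14
      std::printf("\n");
      return 0;
    }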
16498
16499 // When compacting odd (upper) elements, use PACKSS pre-SSE41.
16500 int NumOddDrops = canLowerByDroppingElements(Mask, false, false);
16501 if (NumOddDrops == 1) {
16502 bool HasSSE41 = Subtarget.hasSSE41();
16503 V1 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
16504 DAG.getBitcast(MVT::v4i32, V1),
16505 DAG.getTargetConstant(16, DL, MVT::i8));
16506 V2 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
16507 DAG.getBitcast(MVT::v4i32, V2),
16508 DAG.getTargetConstant(16, DL, MVT::i8));
16509 return DAG.getNode(HasSSE41 ? X86ISD::PACKUS : X86ISD::PACKSS, DL,
16510 MVT::v8i16, V1, V2);
16511 }
16512
16513 // Try to lower by permuting the inputs into an unpack instruction.
16514 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
16515 Mask, Subtarget, DAG))
16516 return Unpack;
16517
16518 // If we can't directly blend but can use PSHUFB, that will be better as it
16519 // can both shuffle and set up the inefficient blend.
16520 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
16521 bool V1InUse, V2InUse;
16522 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
16523 Zeroable, DAG, V1InUse, V2InUse);
16524 }
16525
16526 // We can always bit-blend if we have to so the fallback strategy is to
16527 // decompose into single-input permutes and blends/unpacks.
16528 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i16, V1, V2,
16529 Mask, Subtarget, DAG);
16530}
16531
16532/// Lower 8-lane 16-bit floating point shuffles.
16533static SDValue lowerV8F16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16534 const APInt &Zeroable, SDValue V1, SDValue V2,
16535 const X86Subtarget &Subtarget,
16536 SelectionDAG &DAG) {
16537 assert(V1.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
16538 assert(V2.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
16539 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16540 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
16541
16542 if (Subtarget.hasFP16()) {
16543 if (NumV2Elements == 0) {
16544 // Check for being able to broadcast a single element.
16545 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f16, V1, V2,
16546 Mask, Subtarget, DAG))
16547 return Broadcast;
16548 }
16549 if (NumV2Elements == 1 && Mask[0] >= 8)
16550 if (SDValue V = lowerShuffleAsElementInsertion(
16551 DL, MVT::v8f16, V1, V2, Mask, Zeroable, Subtarget, DAG))
16552 return V;
16553 }
16554
16555 V1 = DAG.getBitcast(MVT::v8i16, V1);
16556 V2 = DAG.getBitcast(MVT::v8i16, V2);
16557 return DAG.getBitcast(MVT::v8f16,
16558 DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
16559}
16560
16561 // Lowers unary/binary shuffle as VPERMV/VPERMV3; for non-VLX targets,
16562// sub-512-bit shuffles are padded to 512-bits for the shuffle and then
16563// the active subvector is extracted.
16564static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
16565 ArrayRef<int> Mask, SDValue V1, SDValue V2,
16566 const X86Subtarget &Subtarget,
16567 SelectionDAG &DAG) {
16568 MVT MaskVT = VT.changeTypeToInteger();
16569 SDValue MaskNode;
16570 MVT ShuffleVT = VT;
16571 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
16572 V1 = widenSubVector(V1, false, Subtarget, DAG, DL, 512);
16573 V2 = widenSubVector(V2, false, Subtarget, DAG, DL, 512);
16574 ShuffleVT = V1.getSimpleValueType();
16575
16576 // Adjust mask to correct indices for the second input.
16577 int NumElts = VT.getVectorNumElements();
16578 unsigned Scale = 512 / VT.getSizeInBits();
16579 SmallVector<int, 32> AdjustedMask(Mask);
16580 for (int &M : AdjustedMask)
16581 if (NumElts <= M)
16582 M += (Scale - 1) * NumElts;
16583 MaskNode = getConstVector(AdjustedMask, MaskVT, DAG, DL, true);
16584 MaskNode = widenSubVector(MaskNode, false, Subtarget, DAG, DL, 512);
16585 } else {
16586 MaskNode = getConstVector(Mask, MaskVT, DAG, DL, true);
16587 }
16588
16589 SDValue Result;
16590 if (V2.isUndef())
16591 Result = DAG.getNode(X86ISD::VPERMV, DL, ShuffleVT, MaskNode, V1);
16592 else
16593 Result = DAG.getNode(X86ISD::VPERMV3, DL, ShuffleVT, V1, MaskNode, V2);
16594
16595 if (VT != ShuffleVT)
16596 Result = extractSubVector(Result, 0, DAG, DL, VT.getSizeInBits());
16597
16598 return Result;
16599}
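The mask adjustment in lowerShuffleWithPERMV is easy to see with concrete numbers. A minimal plain C++ sketch (not part of the file, mask values invented): a v8i16 two-input shuffle padded to 512 bits gives Scale = 512/128 = 4, so any reference to V2 (mask entry >= 8) is bumped by (Scale - 1) * NumElts = 24, moving element 8+k to index 32+k, which is where V2's data sits in the second operand of the widened VPERMV3.

    #include <cstdio>
    #include <vector>

    int main() {
      const int NumElts = 8;        // v8i16
      const int Scale = 512 / 128;  // widened 4x -> v32i16
      // Two-input mask: entries < 8 read V1, entries >= 8 read V2.
      std::vector<int> Mask = {0, 9, 2, 11, 4, 13, 6, 15};

      // After widening, V1 occupies elements 0..7 of the first v32i16 operand
      // and V2 occupies elements 0..7 of the second, which VPERMV3 addresses
      // as indices 32..63, so V2 references shift by (Scale - 1) * NumElts.
      for (int &M : Mask)
        if (M >= NumElts)
          M += (Scale - 1) * NumElts;

      for (int M : Mask)
        std::printf("%d ", M);  // prints: 0 33 2 35 4 37 6 39
      std::printf("\n");
      return 0;
    }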
16600
16601/// Generic lowering of v16i8 shuffles.
16602///
16603/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
16604/// detect any complexity reducing interleaving. If that doesn't help, it uses
16605/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
16606/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
16607/// back together.
16608static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16609 const APInt &Zeroable, SDValue V1, SDValue V2,
16610 const X86Subtarget &Subtarget,
16611 SelectionDAG &DAG) {
16612 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
16613 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
16614 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
16615
16616 // Try to use shift instructions.
16617 if (SDValue Shift =
16618 lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget,
16619 DAG, /*BitwiseOnly*/ false))
16620 return Shift;
16621
16622 // Try to use byte rotation instructions.
16623 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask,
16624 Subtarget, DAG))
16625 return Rotate;
16626
16627 // Use dedicated pack instructions for masks that match their pattern.
16628 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG,
16629 Subtarget))
16630 return V;
16631
16632 // Try to use a zext lowering.
16633 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask,
16634 Zeroable, Subtarget, DAG))
16635 return ZExt;
16636
16637 // Try to lower using a truncation.
16638 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
16639 Subtarget, DAG))
16640 return V;
16641
16642 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
16643 Subtarget, DAG))
16644 return V;
16645
16646 // See if we can use SSE4A Extraction / Insertion.
16647 if (Subtarget.hasSSE4A())
16648 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
16649 Zeroable, DAG))
16650 return V;
16651
16652 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
16653
16654 // For single-input shuffles, there are some nicer lowering tricks we can use.
16655 if (NumV2Elements == 0) {
16656 // Check for being able to broadcast a single element.
16657 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2,
16658 Mask, Subtarget, DAG))
16659 return Broadcast;
16660
16661 // Try to use bit rotation instructions.
16662 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask,
16663 Subtarget, DAG))
16664 return Rotate;
16665
16666 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
16667 return V;
16668
16669 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
16670 // Notably, this handles splat and partial-splat shuffles more efficiently.
16671 // However, it only makes sense if the pre-duplication shuffle simplifies
16672 // things significantly. Currently, this means we need to be able to
16673 // express the pre-duplication shuffle as an i16 shuffle.
16674 //
16675 // FIXME: We should check for other patterns which can be widened into an
16676 // i16 shuffle as well.
16677 auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
16678 for (int i = 0; i < 16; i += 2)
16679 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
16680 return false;
16681
16682 return true;
16683 };
16684 auto tryToWidenViaDuplication = [&]() -> SDValue {
16685 if (!canWidenViaDuplication(Mask))
16686 return SDValue();
16687 SmallVector<int, 4> LoInputs;
16688 copy_if(Mask, std::back_inserter(LoInputs),
16689 [](int M) { return M >= 0 && M < 8; });
16690 array_pod_sort(LoInputs.begin(), LoInputs.end());
16691 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
16692 LoInputs.end());
16693 SmallVector<int, 4> HiInputs;
16694 copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
16695 array_pod_sort(HiInputs.begin(), HiInputs.end());
16696 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
16697 HiInputs.end());
16698
16699 bool TargetLo = LoInputs.size() >= HiInputs.size();
16700 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
16701 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
16702
16703 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
16704 SmallDenseMap<int, int, 8> LaneMap;
16705 for (int I : InPlaceInputs) {
16706 PreDupI16Shuffle[I/2] = I/2;
16707 LaneMap[I] = I;
16708 }
16709 int j = TargetLo ? 0 : 4, je = j + 4;
16710 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
16711 // Check if j is already a shuffle of this input. This happens when
16712 // there are two adjacent bytes after we move the low one.
16713 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
16714 // If we haven't yet mapped the input, search for a slot into which
16715 // we can map it.
16716 while (j < je && PreDupI16Shuffle[j] >= 0)
16717 ++j;
16718
16719 if (j == je)
16720 // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
16721 return SDValue();
16722
16723 // Map this input with the i16 shuffle.
16724 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
16725 }
16726
16727 // Update the lane map based on the mapping we ended up with.
16728 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
16729 }
16730 V1 = DAG.getBitcast(
16731 MVT::v16i8,
16732 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
16733 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
16734
16735 // Unpack the bytes to form the i16s that will be shuffled into place.
16736 bool EvenInUse = false, OddInUse = false;
16737 for (int i = 0; i < 16; i += 2) {
16738 EvenInUse |= (Mask[i + 0] >= 0);
16739 OddInUse |= (Mask[i + 1] >= 0);
16740 if (EvenInUse && OddInUse)
16741 break;
16742 }
16743 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
16744 MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8),
16745 OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8));
16746
16747 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
16748 for (int i = 0; i < 16; ++i)
16749 if (Mask[i] >= 0) {
16750 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
16751 assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
16752 if (PostDupI16Shuffle[i / 2] < 0)
16753 PostDupI16Shuffle[i / 2] = MappedMask;
16754 else
16755 assert(PostDupI16Shuffle[i / 2] == MappedMask &&
16756        "Conflicting entries in the original shuffle!");
16757 }
16758 return DAG.getBitcast(
16759 MVT::v16i8,
16760 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
16761 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
16762 };
16763 if (SDValue V = tryToWidenViaDuplication())
16764 return V;
16765 }
16766
16767 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
16768 Zeroable, Subtarget, DAG))
16769 return Masked;
16770
16771 // Use dedicated unpack instructions for masks that match their pattern.
16772 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
16773 return V;
16774
16775 // Try to use byte shift instructions to mask.
16776 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v16i8, V1, V2, Mask,
16777 Zeroable, Subtarget, DAG))
16778 return V;
16779
16780 // Check for compaction patterns.
16781 bool IsSingleInput = V2.isUndef();
16782 int NumEvenDrops = canLowerByDroppingElements(Mask, true, IsSingleInput);
16783
16784 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
16785 // with PSHUFB. It is important to do this before we attempt to generate any
16786 // blends but after all of the single-input lowerings. If the single input
16787 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
16788 // want to preserve that and we can DAG combine any longer sequences into
16789 // a PSHUFB in the end. But once we start blending from multiple inputs,
16790 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
16791 // and there are *very* few patterns that would actually be faster than the
16792 // PSHUFB approach because of its ability to zero lanes.
16793 //
16794 // If the mask is a binary compaction, we can more efficiently perform this
16795 // as a PACKUS(AND(),AND()) - which is quicker than UNPACK(PSHUFB(),PSHUFB()).
16796 //
16797 // FIXME: The only exceptions to the above are blends which are exact
16798 // interleavings with direct instructions supporting them. We currently don't
16799 // handle those well here.
16800 if (Subtarget.hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) {
16801 bool V1InUse = false;
16802 bool V2InUse = false;
16803
16804 SDValue PSHUFB = lowerShuffleAsBlendOfPSHUFBs(
16805 DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
16806
16807 // If both V1 and V2 are in use and we can use a direct blend or an unpack,
16808 // do so. This avoids using them to handle blends-with-zero which is
16809 // important as a single pshufb is significantly faster for that.
16810 if (V1InUse && V2InUse) {
16811 if (Subtarget.hasSSE41())
16812 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,
16813 Zeroable, Subtarget, DAG))
16814 return Blend;
16815
16816 // We can use an unpack to do the blending rather than an or in some
16817 // cases. Even though the or may be (very minorly) more efficient, we
16818 // prefer this lowering because there are common cases where part of
16819 // the complexity of the shuffles goes away when we do the final blend as
16820 // an unpack.
16821 // FIXME: It might be worth trying to detect if the unpack-feeding
16822 // shuffles will both be pshufb, in which case we shouldn't bother with
16823 // this.
16824 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(
16825 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
16826 return Unpack;
16827
16828 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
16829 if (Subtarget.hasVBMI())
16830 return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, Subtarget,
16831 DAG);
16832
16833 // If we have XOP we can use one VPPERM instead of multiple PSHUFBs.
16834 if (Subtarget.hasXOP()) {
16835 SDValue MaskNode = getConstVector(Mask, MVT::v16i8, DAG, DL, true);
16836 return DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, V1, V2, MaskNode);
16837 }
16838
16839 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
16840 // PALIGNR will be cheaper than the second PSHUFB+OR.
16841 if (SDValue V = lowerShuffleAsByteRotateAndPermute(
16842 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
16843 return V;
16844 }
16845
16846 return PSHUFB;
16847 }
16848
16849 // There are special ways we can lower some single-element blends.
16850 if (NumV2Elements == 1)
16851 if (SDValue V = lowerShuffleAsElementInsertion(
16852 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
16853 return V;
16854
16855 if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
16856 return Blend;
16857
16858 // Check whether a compaction lowering can be done. This handles shuffles
16859 // which take every Nth element for some even N. See the helper function for
16860 // details.
16861 //
16862 // We special case these as they can be particularly efficiently handled with
16863 // the PACKUSWB instruction on x86 and they show up in common patterns of
16864 // rearranging bytes to truncate wide elements.
16865 if (NumEvenDrops) {
16866 // NumEvenDrops is the power of two stride of the elements. Another way of
16867 // thinking about it is that we need to drop the even elements this many
16868 // times to get the original input.
16869
16870 // First we need to zero all the dropped bytes.
16871 assert(NumEvenDrops <= 3 &&
16872        "No support for dropping even elements more than 3 times.");
16873 SmallVector<SDValue, 8> WordClearOps(8, DAG.getConstant(0, DL, MVT::i16));
16874 for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
16875 WordClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i16);
16876 SDValue WordClearMask = DAG.getBuildVector(MVT::v8i16, DL, WordClearOps);
16877 V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V1),
16878 WordClearMask);
16879 if (!IsSingleInput)
16880 V2 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V2),
16881 WordClearMask);
16882
16883 // Now pack things back together.
16884 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
16885 IsSingleInput ? V1 : V2);
16886 for (int i = 1; i < NumEvenDrops; ++i) {
16887 Result = DAG.getBitcast(MVT::v8i16, Result);
16888 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
16889 }
16890 return Result;
16891 }
16892
16893 int NumOddDrops = canLowerByDroppingElements(Mask, false, IsSingleInput);
16894 if (NumOddDrops == 1) {
16895 V1 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
16896 DAG.getBitcast(MVT::v8i16, V1),
16897 DAG.getTargetConstant(8, DL, MVT::i8));
16898 if (!IsSingleInput)
16899 V2 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
16900 DAG.getBitcast(MVT::v8i16, V2),
16901 DAG.getTargetConstant(8, DL, MVT::i8));
16902 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
16903 IsSingleInput ? V1 : V2);
16904 }
16905
16906 // Handle multi-input cases by blending/unpacking single-input shuffles.
16907 if (NumV2Elements > 0)
16908 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v16i8, V1, V2, Mask,
16909 Subtarget, DAG);
16910
16911 // The fallback path for single-input shuffles widens this into two v8i16
16912 // vectors with unpacks, shuffles those, and then pulls them back together
16913 // with a pack.
16914 SDValue V = V1;
16915
16916 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
16917 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
16918 for (int i = 0; i < 16; ++i)
16919 if (Mask[i] >= 0)
16920 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
16921
16922 SDValue VLoHalf, VHiHalf;
16923 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
16924 // them out and avoid using UNPCK{L,H} to extract the elements of V as
16925 // i16s.
16926 if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
16927 none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
16928 // Use a mask to drop the high bytes.
16929 VLoHalf = DAG.getBitcast(MVT::v8i16, V);
16930 VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
16931 DAG.getConstant(0x00FF, DL, MVT::v8i16));
16932
16933 // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
16934 VHiHalf = DAG.getUNDEF(MVT::v8i16);
16935
16936 // Squash the masks to point directly into VLoHalf.
16937 for (int &M : LoBlendMask)
16938 if (M >= 0)
16939 M /= 2;
16940 for (int &M : HiBlendMask)
16941 if (M >= 0)
16942 M /= 2;
16943 } else {
16944 // Otherwise just unpack the low half of V into VLoHalf and the high half into
16945 // VHiHalf so that we can blend them as i16s.
16946 SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
16947
16948 VLoHalf = DAG.getBitcast(
16949 MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
16950 VHiHalf = DAG.getBitcast(
16951 MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
16952 }
16953
16954 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
16955 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
16956
16957 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
16958}
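The AND+PACKUS compaction used in the even-drop path above can be modelled outside of SelectionDAG. The following standalone sketch (plain C++, hypothetical helper name packus, assuming the NumEvenDrops == 1 single-input case) shows how clearing the high byte of each word and then packing with unsigned saturation extracts every even-positioned byte:

// Standalone model (not LLVM code) of one AND+PACKUSWB compaction step:
// clear the bytes that will be dropped, then pack words to bytes with
// unsigned saturation.
#include <array>
#include <cstdint>
#include <cstdio>

// Pack two v8i16 inputs into one v16i8 with unsigned saturation (PACKUSWB).
static std::array<uint8_t, 16> packus(const std::array<uint16_t, 8> &A,
                                      const std::array<uint16_t, 8> &B) {
  auto Sat = [](uint16_t W) -> uint8_t { return W > 0xFF ? 0xFF : uint8_t(W); };
  std::array<uint8_t, 16> R{};
  for (int i = 0; i < 8; ++i) {
    R[i] = Sat(A[i]);
    R[i + 8] = Sat(B[i]);
  }
  return R;
}

int main() {
  // 16 source bytes viewed as 8 little-endian words; we want every 2nd byte,
  // i.e. the low byte of each word (NumEvenDrops == 1).
  std::array<uint16_t, 8> V1 = {0x1100, 0x3322, 0x5544, 0x7766,
                                0x9988, 0xBBAA, 0xDDCC, 0xFFEE};
  // The word-clear mask keeps only the low byte of every word.
  for (auto &W : V1)
    W &= 0x00FF;
  // Single-input case: PACKUS(V1, V1) yields the 8 kept bytes twice.
  std::array<uint8_t, 16> Out = packus(V1, V1);
  for (uint8_t B : Out)
    std::printf("%02X ", B); // 00 22 44 66 88 AA CC EE 00 22 44 66 88 AA CC EE
  std::printf("\n");
  return 0;
}

For larger NumEvenDrops the lowering above simply repeats the pack step, which the sketch does not attempt to show.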
16959
16960/// Dispatching routine to lower various 128-bit x86 vector shuffles.
16961///
16962/// This routine breaks down the specific type of 128-bit shuffle and
16963/// dispatches to the lowering routines accordingly.
16964static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
16965 MVT VT, SDValue V1, SDValue V2,
16966 const APInt &Zeroable,
16967 const X86Subtarget &Subtarget,
16968 SelectionDAG &DAG) {
16969 switch (VT.SimpleTy) {
16970 case MVT::v2i64:
16971 return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16972 case MVT::v2f64:
16973 return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16974 case MVT::v4i32:
16975 return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16976 case MVT::v4f32:
16977 return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16978 case MVT::v8i16:
16979 return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16980 case MVT::v8f16:
16981 return lowerV8F16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16982 case MVT::v16i8:
16983 return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16984
16985 default:
16986 llvm_unreachable("Unimplemented!");
16987 }
16988}
16989
16990/// Generic routine to split vector shuffle into half-sized shuffles.
16991///
16992/// This routine just extracts two subvectors, shuffles them independently, and
16993/// then concatenates them back together. This should work effectively with all
16994/// AVX vector shuffle types.
16995static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
16996 SDValue V2, ArrayRef<int> Mask,
16997 SelectionDAG &DAG, bool SimpleOnly) {
16998 assert(VT.getSizeInBits() >= 256 &&
16999        "Only for 256-bit or wider vector shuffles!");
17000 assert(V1.getSimpleValueType() == VT && "Bad operand type!");
17001 assert(V2.getSimpleValueType() == VT && "Bad operand type!");
17002
17003 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
17004 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
17005
17006 int NumElements = VT.getVectorNumElements();
17007 int SplitNumElements = NumElements / 2;
17008 MVT ScalarVT = VT.getVectorElementType();
17009 MVT SplitVT = MVT::getVectorVT(ScalarVT, SplitNumElements);
17010
17011 // Use splitVector/extractSubVector so that split build-vectors just build two
17012 // narrower build vectors. This helps shuffling with splats and zeros.
17013 auto SplitVector = [&](SDValue V) {
17014 SDValue LoV, HiV;
17015 std::tie(LoV, HiV) = splitVector(peekThroughBitcasts(V), DAG, DL);
17016 return std::make_pair(DAG.getBitcast(SplitVT, LoV),
17017 DAG.getBitcast(SplitVT, HiV));
17018 };
17019
17020 SDValue LoV1, HiV1, LoV2, HiV2;
17021 std::tie(LoV1, HiV1) = SplitVector(V1);
17022 std::tie(LoV2, HiV2) = SplitVector(V2);
17023
17024 // Now create two 4-way blends of these half-width vectors.
17025 auto GetHalfBlendPiecesReq = [&](const ArrayRef<int> &HalfMask, bool &UseLoV1,
17026 bool &UseHiV1, bool &UseLoV2,
17027 bool &UseHiV2) {
17028 UseLoV1 = UseHiV1 = UseLoV2 = UseHiV2 = false;
17029 for (int i = 0; i < SplitNumElements; ++i) {
17030 int M = HalfMask[i];
17031 if (M >= NumElements) {
17032 if (M >= NumElements + SplitNumElements)
17033 UseHiV2 = true;
17034 else
17035 UseLoV2 = true;
17036 } else if (M >= 0) {
17037 if (M >= SplitNumElements)
17038 UseHiV1 = true;
17039 else
17040 UseLoV1 = true;
17041 }
17042 }
17043 };
17044
17045 auto CheckHalfBlendUsable = [&](const ArrayRef<int> &HalfMask) -> bool {
17046 if (!SimpleOnly)
17047 return true;
17048
17049 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
17050 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
17051
17052 return !(UseHiV1 || UseHiV2);
17053 };
17054
17055 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
17056 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
17057 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
17058 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
17059 for (int i = 0; i < SplitNumElements; ++i) {
17060 int M = HalfMask[i];
17061 if (M >= NumElements) {
17062 V2BlendMask[i] = M - NumElements;
17063 BlendMask[i] = SplitNumElements + i;
17064 } else if (M >= 0) {
17065 V1BlendMask[i] = M;
17066 BlendMask[i] = i;
17067 }
17068 }
17069
17070 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
17071 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
17072
17073 // Because the lowering happens after all combining takes place, we need to
17074 // manually combine these blend masks as much as possible so that we create
17075 // a minimal number of high-level vector shuffle nodes.
17076 assert((!SimpleOnly || (!UseHiV1 && !UseHiV2)) && "Shuffle isn't simple");
17077
17078 // First try just blending the halves of V1 or V2.
17079 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
17080 return DAG.getUNDEF(SplitVT);
17081 if (!UseLoV2 && !UseHiV2)
17082 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
17083 if (!UseLoV1 && !UseHiV1)
17084 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
17085
17086 SDValue V1Blend, V2Blend;
17087 if (UseLoV1 && UseHiV1) {
17088 V1Blend = DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
17089 } else {
17090 // We only use half of V1 so map the usage down into the final blend mask.
17091 V1Blend = UseLoV1 ? LoV1 : HiV1;
17092 for (int i = 0; i < SplitNumElements; ++i)
17093 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
17094 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
17095 }
17096 if (UseLoV2 && UseHiV2) {
17097 V2Blend = DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
17098 } else {
17099 // We only use half of V2 so map the usage down into the final blend mask.
17100 V2Blend = UseLoV2 ? LoV2 : HiV2;
17101 for (int i = 0; i < SplitNumElements; ++i)
17102 if (BlendMask[i] >= SplitNumElements)
17103 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
17104 }
17105 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
17106 };
17107
17108 if (!CheckHalfBlendUsable(LoMask) || !CheckHalfBlendUsable(HiMask))
17109 return SDValue();
17110
17111 SDValue Lo = HalfBlend(LoMask);
17112 SDValue Hi = HalfBlend(HiMask);
17113 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
17114}
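The index arithmetic inside HalfBlend above is easier to see in isolation. This standalone sketch (illustrative names only, not LLVM APIs) reproduces the split of one half-mask into the two per-input masks plus the final blend mask, assuming a v8i32-sized shuffle:

// Standalone sketch of the HalfBlend index math: for one half of a wide
// shuffle mask, build the two single-input masks and the final blend mask
// over the split-width vectors.
#include <cstdio>
#include <vector>

int main() {
  const int NumElements = 8;                 // e.g. v8i32
  const int SplitNumElements = NumElements / 2;
  // One half of a v8i32 mask; values >= 8 select from V2.
  std::vector<int> HalfMask = {2, 10, 3, 11};

  std::vector<int> V1BlendMask(SplitNumElements, -1);
  std::vector<int> V2BlendMask(SplitNumElements, -1);
  std::vector<int> BlendMask(SplitNumElements, -1);
  for (int i = 0; i < SplitNumElements; ++i) {
    int M = HalfMask[i];
    if (M >= NumElements) {
      V2BlendMask[i] = M - NumElements;    // index into (LoV2, HiV2)
      BlendMask[i] = SplitNumElements + i; // take lane i of the V2 blend
    } else if (M >= 0) {
      V1BlendMask[i] = M;                  // index into (LoV1, HiV1)
      BlendMask[i] = i;                    // take lane i of the V1 blend
    }
  }

  auto Dump = [](const char *Name, const std::vector<int> &Mask) {
    std::printf("%s:", Name);
    for (int M : Mask)
      std::printf(" %2d", M);
    std::printf("\n");
  };
  Dump("V1BlendMask", V1BlendMask); //  2 -1  3 -1
  Dump("V2BlendMask", V2BlendMask); // -1  2 -1  3
  Dump("BlendMask  ", BlendMask);   //  0  5  2  7 -> interleave of the two blends
  return 0;
}

The later "map the usage down" steps in the real routine then collapse BlendMask further when only one half of an input is used; the sketch stops before that simplification.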
17115
17116/// Either split a vector in halves or decompose the shuffles and the
17117/// blend/unpack.
17118///
17119/// This is provided as a good fallback for many lowerings of non-single-input
17120/// shuffles with more than one 128-bit lane. In those cases, we want to select
17121/// between splitting the shuffle into 128-bit components and stitching those
17122/// back together vs. extracting the single-input shuffles and blending those
17123/// results.
17124static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
17125 SDValue V2, ArrayRef<int> Mask,
17126 const X86Subtarget &Subtarget,
17127 SelectionDAG &DAG) {
17128 assert(!V2.isUndef() && "This routine must not be used to lower single-input "
17129        "shuffles as it could then recurse on itself.");
17130 int Size = Mask.size();
17131
17132 // If this can be modeled as a broadcast of two elements followed by a blend,
17133 // prefer that lowering. This is especially important because broadcasts can
17134 // often fold with memory operands.
17135 auto DoBothBroadcast = [&] {
17136 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
17137 for (int M : Mask)
17138 if (M >= Size) {
17139 if (V2BroadcastIdx < 0)
17140 V2BroadcastIdx = M - Size;
17141 else if (M - Size != V2BroadcastIdx)
17142 return false;
17143 } else if (M >= 0) {
17144 if (V1BroadcastIdx < 0)
17145 V1BroadcastIdx = M;
17146 else if (M != V1BroadcastIdx)
17147 return false;
17148 }
17149 return true;
17150 };
17151 if (DoBothBroadcast())
17152 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
17153 DAG);
17154
17155 // If the inputs all stem from a single 128-bit lane of each input, then we
17156 // split them rather than blending because the split will decompose to
17157 // unusually few instructions.
17158 int LaneCount = VT.getSizeInBits() / 128;
17159 int LaneSize = Size / LaneCount;
17160 SmallBitVector LaneInputs[2];
17161 LaneInputs[0].resize(LaneCount, false);
17162 LaneInputs[1].resize(LaneCount, false);
17163 for (int i = 0; i < Size; ++i)
17164 if (Mask[i] >= 0)
17165 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
17166 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
17167 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
17168 /*SimpleOnly*/ false);
17169
17170 // Otherwise, just fall back to decomposed shuffles and a blend/unpack. This
17171 // requires that the decomposed single-input shuffles don't end up here.
17172 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
17173 DAG);
17174}
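The lane-usage check above reduces to counting which 128-bit lanes each input contributes. A minimal standalone sketch of that heuristic, under the assumption of a v8i32 mask, might look like this:

// Standalone sketch of the split-vs-blend heuristic: if each input feeds the
// result from at most one 128-bit lane, splitting is preferred.
#include <cstdio>
#include <vector>

int main() {
  const int Size = 8, LaneSize = 4;                   // v8i32, two 128-bit lanes
  std::vector<int> Mask = {0, 1, 2, 3, 8, 9, 10, 11}; // V1 lane 0 then V2 lane 0
  int LaneInputs[2][2] = {{0, 0}, {0, 0}};            // [input][lane] usage flags

  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0)
      LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = 1;

  int V1Lanes = LaneInputs[0][0] + LaneInputs[0][1];
  int V2Lanes = LaneInputs[1][0] + LaneInputs[1][1];
  bool PreferSplit = V1Lanes <= 1 && V2Lanes <= 1;
  std::printf("V1 lanes=%d V2 lanes=%d -> %s\n", V1Lanes, V2Lanes,
              PreferSplit ? "split" : "decompose+blend"); // split
  return 0;
}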
17175
17176// Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
17177// TODO: Extend to support v8f32 (+ 512-bit shuffles).
17178static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT,
17179 SDValue V1, SDValue V2,
17180 ArrayRef<int> Mask,
17181 SelectionDAG &DAG) {
17182 assert(VT == MVT::v4f64 && "Only for v4f64 shuffles");
17183
17184 int LHSMask[4] = {-1, -1, -1, -1};
17185 int RHSMask[4] = {-1, -1, -1, -1};
17186 unsigned SHUFPMask = 0;
17187
17188 // As SHUFPD uses a single LHS/RHS element per lane, we can always
17189 // perform the shuffle once the lanes have been shuffled in place.
17190 for (int i = 0; i != 4; ++i) {
17191 int M = Mask[i];
17192 if (M < 0)
17193 continue;
17194 int LaneBase = i & ~1;
17195 auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
17196 LaneMask[LaneBase + (M & 1)] = M;
17197 SHUFPMask |= (M & 1) << i;
17198 }
17199
17200 SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask);
17201 SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask);
17202 return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS,
17203 DAG.getTargetConstant(SHUFPMask, DL, MVT::i8));
17204}
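The loop above packs one bit of the SHUFPD immediate per destination element while routing each element into the matching lane of the LHS or RHS pre-shuffle. This standalone sketch (illustrative only) reproduces that packing for an example v4f64 mask:

// Standalone sketch of the SHUFPD lowering masks: even result elements come
// from the LHS lane shuffle, odd ones from the RHS lane shuffle, and the
// immediate selects low/high within each source lane.
#include <cstdio>

int main() {
  int Mask[4] = {2, 4, 1, 6}; // 0..3 pick from V1, 4..7 from V2
  int LHSMask[4] = {-1, -1, -1, -1};
  int RHSMask[4] = {-1, -1, -1, -1};
  unsigned SHUFPMask = 0;

  for (int i = 0; i != 4; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;
    int LaneBase = i & ~1;                       // 128-bit lane of the result element
    int *LaneMask = (i & 1) ? RHSMask : LHSMask; // even -> LHS, odd -> RHS
    LaneMask[LaneBase + (M & 1)] = M;            // keep the element in its low/high slot
    SHUFPMask |= (M & 1) << i;                   // SHUFPD picks low (0) or high (1) per element
  }

  std::printf("LHSMask: %d %d %d %d\n", LHSMask[0], LHSMask[1], LHSMask[2], LHSMask[3]);
  std::printf("RHSMask: %d %d %d %d\n", RHSMask[0], RHSMask[1], RHSMask[2], RHSMask[3]);
  std::printf("SHUFPD imm: 0x%X\n", SHUFPMask); // 0x4 for this mask
  return 0;
}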
17205
17206/// Lower a vector shuffle crossing multiple 128-bit lanes as
17207/// a lane permutation followed by a per-lane permutation.
17208///
17209/// This is mainly for cases where we can have non-repeating permutes
17210/// in each lane.
17211///
17212/// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask,
17213/// we should investigate merging them.
17214static SDValue lowerShuffleAsLanePermuteAndPermute(
17215 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
17216 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
17217 int NumElts = VT.getVectorNumElements();
17218 int NumLanes = VT.getSizeInBits() / 128;
17219 int NumEltsPerLane = NumElts / NumLanes;
17220 bool CanUseSublanes = Subtarget.hasAVX2() && V2.isUndef();
17221
17222 /// Attempts to find a sublane permute with the given size
17223 /// that gets all elements into their target lanes.
17224 ///
17225 /// If successful, returns the lowered shuffle as a cross-lane permute
17226 /// followed by an in-lane permute; otherwise returns an empty SDValue.
17227 auto getSublanePermute = [&](int NumSublanes) -> SDValue {
17228 int NumSublanesPerLane = NumSublanes / NumLanes;
17229 int NumEltsPerSublane = NumElts / NumSublanes;
17230
17231 SmallVector<int, 16> CrossLaneMask;
17232 SmallVector<int, 16> InLaneMask(NumElts, SM_SentinelUndef);
17233 // CrossLaneMask but one entry == one sublane.
17234 SmallVector<int, 16> CrossLaneMaskLarge(NumSublanes, SM_SentinelUndef);
17235
17236 for (int i = 0; i != NumElts; ++i) {
17237 int M = Mask[i];
17238 if (M < 0)
17239 continue;
17240
17241 int SrcSublane = M / NumEltsPerSublane;
17242 int DstLane = i / NumEltsPerLane;
17243
17244 // We only need to get the elements into the right lane, not sublane.
17245 // So search all sublanes that make up the destination lane.
17246 bool Found = false;
17247 int DstSubStart = DstLane * NumSublanesPerLane;
17248 int DstSubEnd = DstSubStart + NumSublanesPerLane;
17249 for (int DstSublane = DstSubStart; DstSublane < DstSubEnd; ++DstSublane) {
17250 if (!isUndefOrEqual(CrossLaneMaskLarge[DstSublane], SrcSublane))
17251 continue;
17252
17253 Found = true;
17254 CrossLaneMaskLarge[DstSublane] = SrcSublane;
17255 int DstSublaneOffset = DstSublane * NumEltsPerSublane;
17256 InLaneMask[i] = DstSublaneOffset + M % NumEltsPerSublane;
17257 break;
17258 }
17259 if (!Found)
17260 return SDValue();
17261 }
17262
17263 // Fill CrossLaneMask using CrossLaneMaskLarge.
17264 narrowShuffleMaskElts(NumEltsPerSublane, CrossLaneMaskLarge, CrossLaneMask);
17265
17266 if (!CanUseSublanes) {
17267 // If we're only shuffling a single lowest lane and the rest are identity
17268 // then don't bother.
17269 // TODO - isShuffleMaskInputInPlace could be extended to something like
17270 // this.
17271 int NumIdentityLanes = 0;
17272 bool OnlyShuffleLowestLane = true;
17273 for (int i = 0; i != NumLanes; ++i) {
17274 int LaneOffset = i * NumEltsPerLane;
17275 if (isSequentialOrUndefInRange(InLaneMask, LaneOffset, NumEltsPerLane,
17276 i * NumEltsPerLane))
17277 NumIdentityLanes++;
17278 else if (CrossLaneMask[LaneOffset] != 0)
17279 OnlyShuffleLowestLane = false;
17280 }
17281 if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
17282 return SDValue();
17283 }
17284
17285 // Avoid returning the same shuffle operation. For example,
17286 // t7: v16i16 = vector_shuffle<8,9,10,11,4,5,6,7,0,1,2,3,12,13,14,15> t5,
17287 // undef:v16i16
17288 if (CrossLaneMask == Mask || InLaneMask == Mask)
17289 return SDValue();
17290
17291 SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask);
17292 return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT),
17293 InLaneMask);
17294 };
17295
17296 // First attempt a solution with full lanes.
17297 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes))
17298 return V;
17299
17300 // The rest of the solutions use sublanes.
17301 if (!CanUseSublanes)
17302 return SDValue();
17303
17304 // Then attempt a solution with 64-bit sublanes (vpermq).
17305 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes * 2))
17306 return V;
17307
17308 // If that doesn't work and we have fast variable cross-lane shuffle,
17309 // attempt 32-bit sublanes (vpermd).
17310 if (!Subtarget.hasFastVariableCrossLaneShuffle())
17311 return SDValue();
17312
17313 return getSublanePermute(/*NumSublanes=*/NumLanes * 4);
17314}
17315
17316 /// Helper to compute the in-lane shuffle mask for a complete shuffle mask.
17317static void computeInLaneShuffleMask(const ArrayRef<int> &Mask, int LaneSize,
17318 SmallVector<int> &InLaneMask) {
17319 int Size = Mask.size();
17320 InLaneMask.assign(Mask.begin(), Mask.end());
17321 for (int i = 0; i < Size; ++i) {
17322 int &M = InLaneMask[i];
17323 if (M < 0)
17324 continue;
17325 if (((M % Size) / LaneSize) != (i / LaneSize))
17326 M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
17327 }
17328}
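A standalone model of the routine above (plain C++, no LLVM types) makes the redirection visible: any element that would cross a 128-bit lane is rewritten to the same in-lane slot of a second operand, signalled by adding Size to the index, and the caller supplies the lane-flipped vector as that second operand:

// Standalone sketch of computeInLaneShuffleMask. Illustrative only; mirrors
// the index math, not the LLVM interfaces.
#include <cstdio>
#include <vector>

static void computeInLaneShuffleMask(const std::vector<int> &Mask, int LaneSize,
                                     std::vector<int> &InLaneMask) {
  int Size = (int)Mask.size();
  InLaneMask = Mask;
  for (int i = 0; i < Size; ++i) {
    int &M = InLaneMask[i];
    if (M < 0)
      continue;
    if (((M % Size) / LaneSize) != (i / LaneSize))
      M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
  }
}

int main() {
  // v8i32 mask that swaps the two 128-bit lanes of V1.
  std::vector<int> Mask = {4, 5, 6, 7, 0, 1, 2, 3};
  std::vector<int> InLaneMask;
  computeInLaneShuffleMask(Mask, /*LaneSize=*/4, InLaneMask);
  for (int M : InLaneMask)
    std::printf("%d ", M); // 8 9 10 11 12 13 14 15 -> all from the flipped operand
  std::printf("\n");
  return 0;
}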
17329
17330/// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
17331/// source with a lane permutation.
17332///
17333/// This lowering strategy results in four instructions in the worst case for a
17334/// single-input cross lane shuffle which is lower than any other fully general
17335/// cross-lane shuffle strategy I'm aware of. Special cases for each particular
17336/// shuffle pattern should be handled prior to trying this lowering.
17337static SDValue lowerShuffleAsLanePermuteAndShuffle(
17338 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
17339 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
17340 // FIXME: This should probably be generalized for 512-bit vectors as well.
17341 assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
17342 int Size = Mask.size();
17343 int LaneSize = Size / 2;
17344
17345 // Fold to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
17346 // Only do this if the elements aren't all from the lower lane,
17347 // otherwise we're (probably) better off doing a split.
17348 if (VT == MVT::v4f64 &&
17349 !all_of(Mask, [LaneSize](int M) { return M < LaneSize; }))
17350 return lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG);
17351
17352 // If there are only inputs from one 128-bit lane, splitting will in fact be
17353 // less expensive. The flags track whether the given lane contains an element
17354 // that crosses to another lane.
17355 bool AllLanes;
17356 if (!Subtarget.hasAVX2()) {
17357 bool LaneCrossing[2] = {false, false};
17358 for (int i = 0; i < Size; ++i)
17359 if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
17360 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
17361 AllLanes = LaneCrossing[0] && LaneCrossing[1];
17362 } else {
17363 bool LaneUsed[2] = {false, false};
17364 for (int i = 0; i < Size; ++i)
17365 if (Mask[i] >= 0)
17366 LaneUsed[(Mask[i] % Size) / LaneSize] = true;
17367 AllLanes = LaneUsed[0] && LaneUsed[1];
17368 }
17369
17370 // TODO - we could support shuffling V2 in the Flipped input.
17371 assert(V2.isUndef() &&
17372        "This last part of this routine only works on single input shuffles");
17373
17374 SmallVector<int> InLaneMask;
17375 computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
17376
17377 assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&
17378        "In-lane shuffle mask expected");
17379
17380 // If the shuffle doesn't use both 128-bit lanes and the in-lane mask is not
17381 // repeating, then we're better off splitting.
17382 if (!AllLanes && !is128BitLaneRepeatedShuffleMask(VT, InLaneMask))
17383 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
17384 /*SimpleOnly*/ false);
17385
17386 // Flip the lanes, and shuffle the results which should now be in-lane.
17387 MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
17388 SDValue Flipped = DAG.getBitcast(PVT, V1);
17389 Flipped =
17390 DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), {2, 3, 0, 1});
17391 Flipped = DAG.getBitcast(VT, Flipped);
17392 return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask);
17393}
17394
17395/// Handle lowering 2-lane 128-bit shuffles.
17396static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
17397 SDValue V2, ArrayRef<int> Mask,
17398 const APInt &Zeroable,
17399 const X86Subtarget &Subtarget,
17400 SelectionDAG &DAG) {
17401 if (V2.isUndef()) {
17402 // Attempt to match VBROADCAST*128 subvector broadcast load.
17403 bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1);
17404 bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1);
17405 if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() && V1.hasOneUse() &&
17406 X86::mayFoldLoad(peekThroughOneUseBitcasts(V1), Subtarget)) {
17407 MVT MemVT = VT.getHalfNumVectorElementsVT();
17408 unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
17409 auto *Ld = cast<LoadSDNode>(peekThroughOneUseBitcasts(V1));
17410 if (SDValue BcstLd = getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, DL,
17411 VT, MemVT, Ld, Ofs, DAG))
17412 return BcstLd;
17413 }
17414
17415 // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
17416 if (Subtarget.hasAVX2())
17417 return SDValue();
17418 }
17419
17420 bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());
17421
17422 SmallVector<int, 4> WidenedMask;
17423 if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask))
17424 return SDValue();
17425
17426 bool IsLowZero = (Zeroable & 0x3) == 0x3;
17427 bool IsHighZero = (Zeroable & 0xc) == 0xc;
17428
17429 // Try to use an insert into a zero vector.
17430 if (WidenedMask[0] == 0 && IsHighZero) {
17431 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
17432 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
17433 DAG.getIntPtrConstant(0, DL));
17434 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
17435 getZeroVector(VT, Subtarget, DAG, DL), LoV,
17436 DAG.getIntPtrConstant(0, DL));
17437 }
17438
17439 // TODO: If minimizing size and one of the inputs is a zero vector and the
17440 // zero vector has only one use, we could use a VPERM2X128 to save the
17441 // instruction bytes needed to explicitly generate the zero vector.
17442
17443 // Blends are faster and handle all the non-lane-crossing cases.
17444 if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
17445 Subtarget, DAG))
17446 return Blend;
17447
17448 // If either input operand is a zero vector, use VPERM2X128 because its mask
17449 // allows us to replace the zero input with an implicit zero.
17450 if (!IsLowZero && !IsHighZero) {
17451 // Check for patterns which can be matched with a single insert of a 128-bit
17452 // subvector.
17453 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2);
17454 if (OnlyUsesV1 || isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2)) {
17455
17456 // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
17457 // this will likely become vinsertf128 which can't fold a 256-bit memop.
17458 if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
17459 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
17460 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
17461 OnlyUsesV1 ? V1 : V2,
17462 DAG.getIntPtrConstant(0, DL));
17463 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
17464 DAG.getIntPtrConstant(2, DL));
17465 }
17466 }
17467
17468 // Try to use SHUF128 if possible.
17469 if (Subtarget.hasVLX()) {
17470 if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
17471 unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
17472 ((WidenedMask[1] % 2) << 1);
17473 return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
17474 DAG.getTargetConstant(PermMask, DL, MVT::i8));
17475 }
17476 }
17477 }
17478
17479 // Otherwise form a 128-bit permutation. After accounting for undefs,
17480 // convert the 64-bit shuffle mask selection values into 128-bit
17481 // selection bits by dividing the indexes by 2 and shifting into positions
17482 // defined by a vperm2*128 instruction's immediate control byte.
17483
17484 // The immediate permute control byte looks like this:
17485 // [1:0] - select 128 bits from sources for low half of destination
17486 // [2] - ignore
17487 // [3] - zero low half of destination
17488 // [5:4] - select 128 bits from sources for high half of destination
17489 // [6] - ignore
17490 // [7] - zero high half of destination
17491
17492 assert((WidenedMask[0] >= 0 || IsLowZero) &&
17493        (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");
17494
17495 unsigned PermMask = 0;
17496 PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
17497 PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
17498
17499 // Check the immediate mask and replace unused sources with undef.
17500 if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
17501 V1 = DAG.getUNDEF(VT);
17502 if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
17503 V2 = DAG.getUNDEF(VT);
17504
17505 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
17506 DAG.getTargetConstant(PermMask, DL, MVT::i8));
17507}
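The immediate byte assembled at the end of the routine above follows the layout documented in its comment ([1:0]/[5:4] select a 128-bit source half, [3]/[7] zero the corresponding half of the result). A small standalone sketch with a hypothetical helper reproduces that packing:

// Standalone sketch of the VPERM2X128/VPERM2F128 immediate packing.
// WidenedMask entries select a 128-bit half:
// 0 = V1.lo, 1 = V1.hi, 2 = V2.lo, 3 = V2.hi.
#include <cstdio>

static unsigned buildVPerm2X128Imm(int WidenedMask0, int WidenedMask1,
                                   bool IsLowZero, bool IsHighZero) {
  unsigned PermMask = 0;
  PermMask |= IsLowZero ? 0x08 : (unsigned)WidenedMask0;         // [1:0] select, [3] zero low
  PermMask |= IsHighZero ? 0x80 : ((unsigned)WidenedMask1 << 4); // [5:4] select, [7] zero high
  return PermMask;
}

int main() {
  // Result = { V1.hi, V2.lo }  ->  imm 0x21
  std::printf("0x%02X\n", buildVPerm2X128Imm(1, 2, false, false));
  // Result = { V2.hi, zeroes } ->  imm 0x83
  std::printf("0x%02X\n", buildVPerm2X128Imm(3, 0, false, true));
  return 0;
}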
17508
17509/// Lower a vector shuffle by first fixing the 128-bit lanes and then
17510/// shuffling each lane.
17511///
17512/// This attempts to create a repeated lane shuffle where each lane uses one
17513/// or two of the lanes of the inputs. The lanes of the input vectors are
17514/// shuffled in one or two independent shuffles to get the lanes into the
17515/// position needed by the final shuffle.
17516static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
17517 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
17518 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
17519 assert(!V2.isUndef() && "This is only useful with multiple inputs.");
17520
17521 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
17522 return SDValue();
17523
17524 int NumElts = Mask.size();
17525 int NumLanes = VT.getSizeInBits() / 128;
17526 int NumLaneElts = 128 / VT.getScalarSizeInBits();
17527 SmallVector<int, 16> RepeatMask(NumLaneElts, -1);
17528 SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});
17529
17530 // First pass will try to fill in the RepeatMask from lanes that need two
17531 // sources.
17532 for (int Lane = 0; Lane != NumLanes; ++Lane) {
17533 int Srcs[2] = {-1, -1};
17534 SmallVector<int, 16> InLaneMask(NumLaneElts, -1);
17535 for (int i = 0; i != NumLaneElts; ++i) {
17536 int M = Mask[(Lane * NumLaneElts) + i];
17537 if (M < 0)
17538 continue;
17539 // Determine which of the possible input lanes (NumLanes from each source)
17540 // this element comes from. Assign that as one of the sources for this
17541 // lane. We can assign up to 2 sources for this lane. If we run out of
17542 // sources we can't do anything.
17543 int LaneSrc = M / NumLaneElts;
17544 int Src;
17545 if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
17546 Src = 0;
17547 else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
17548 Src = 1;
17549 else
17550 return SDValue();
17551
17552 Srcs[Src] = LaneSrc;
17553 InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;
17554 }
17555
17556 // If this lane has two sources, see if it fits with the repeat mask so far.
17557 if (Srcs[1] < 0)
17558 continue;
17559
17560 LaneSrcs[Lane][0] = Srcs[0];
17561 LaneSrcs[Lane][1] = Srcs[1];
17562
17563 auto MatchMasks = [](ArrayRef<int> M1, ArrayRef<int> M2) {
17564 assert(M1.size() == M2.size() && "Unexpected mask size");
17565 for (int i = 0, e = M1.size(); i != e; ++i)
17566 if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
17567 return false;
17568 return true;
17569 };
17570
17571 auto MergeMasks = [](ArrayRef<int> Mask, MutableArrayRef<int> MergedMask) {
17572 assert(Mask.size() == MergedMask.size() && "Unexpected mask size");
17573 for (int i = 0, e = MergedMask.size(); i != e; ++i) {
17574 int M = Mask[i];
17575 if (M < 0)
17576 continue;
17577 assert((MergedMask[i] < 0 || MergedMask[i] == M) &&
17578        "Unexpected mask element");
17579 MergedMask[i] = M;
17580 }
17581 };
17582
17583 if (MatchMasks(InLaneMask, RepeatMask)) {
17584 // Merge this lane mask into the final repeat mask.
17585 MergeMasks(InLaneMask, RepeatMask);
17586 continue;
17587 }
17588
17589 // Didn't find a match. Swap the operands and try again.
17590 std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);
17591 ShuffleVectorSDNode::commuteMask(InLaneMask);
17592
17593 if (MatchMasks(InLaneMask, RepeatMask)) {
17594 // Merge this lane mask into the final repeat mask.
17595 MergeMasks(InLaneMask, RepeatMask);
17596 continue;
17597 }
17598
17599 // Couldn't find a match with the operands in either order.
17600 return SDValue();
17601 }
17602
17603 // Now handle any lanes with only one source.
17604 for (int Lane = 0; Lane != NumLanes; ++Lane) {
17605 // If this lane has already been processed, skip it.
17606 if (LaneSrcs[Lane][0] >= 0)
17607 continue;
17608
17609 for (int i = 0; i != NumLaneElts; ++i) {
17610 int M = Mask[(Lane * NumLaneElts) + i];
17611 if (M < 0)
17612 continue;
17613
17614 // If RepeatMask isn't defined yet we can define it ourselves.
17615 if (RepeatMask[i] < 0)
17616 RepeatMask[i] = M % NumLaneElts;
17617
17618 if (RepeatMask[i] < NumElts) {
17619 if (RepeatMask[i] != M % NumLaneElts)
17620 return SDValue();
17621 LaneSrcs[Lane][0] = M / NumLaneElts;
17622 } else {
17623 if (RepeatMask[i] != ((M % NumLaneElts) + NumElts))
17624 return SDValue();
17625 LaneSrcs[Lane][1] = M / NumLaneElts;
17626 }
17627 }
17628
17629 if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
17630 return SDValue();
17631 }
17632
17633 SmallVector<int, 16> NewMask(NumElts, -1);
17634 for (int Lane = 0; Lane != NumLanes; ++Lane) {
17635 int Src = LaneSrcs[Lane][0];
17636 for (int i = 0; i != NumLaneElts; ++i) {
17637 int M = -1;
17638 if (Src >= 0)
17639 M = Src * NumLaneElts + i;
17640 NewMask[Lane * NumLaneElts + i] = M;
17641 }
17642 }
17643 SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
17644 // Ensure we didn't get back the shuffle we started with.
17645 // FIXME: This is a hack to make up for some splat handling code in
17646 // getVectorShuffle.
17647 if (isa<ShuffleVectorSDNode>(NewV1) &&
17648 cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)
17649 return SDValue();
17650
17651 for (int Lane = 0; Lane != NumLanes; ++Lane) {
17652 int Src = LaneSrcs[Lane][1];
17653 for (int i = 0; i != NumLaneElts; ++i) {
17654 int M = -1;
17655 if (Src >= 0)
17656 M = Src * NumLaneElts + i;
17657 NewMask[Lane * NumLaneElts + i] = M;
17658 }
17659 }
17660 SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
17661 // Ensure we didn't get back the shuffle we started with.
17662 // FIXME: This is a hack to make up for some splat handling code in
17663 // getVectorShuffle.
17664 if (isa<ShuffleVectorSDNode>(NewV2) &&
17665 cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
17666 return SDValue();
17667
17668 for (int i = 0; i != NumElts; ++i) {
17669 if (Mask[i] < 0) {
17670 NewMask[i] = -1;
17671 continue;
17672 }
17673 NewMask[i] = RepeatMask[i % NumLaneElts];
17674 if (NewMask[i] < 0)
17675 continue;
17676
17677 NewMask[i] += (i / NumLaneElts) * NumLaneElts;
17678 }
17679 return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
17680}
17681
17682/// If the input shuffle mask results in a vector that is undefined in all upper
17683/// or lower half elements and that mask accesses only 2 halves of the
17684/// shuffle's operands, return true. A mask of half the width with mask indexes
17685/// adjusted to access the extracted halves of the original shuffle operands is
17686/// returned in HalfMask. HalfIdx1 and HalfIdx2 return whether the upper or
17687/// lower half of each input operand is accessed.
17688static bool
17689getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask,
17690 int &HalfIdx1, int &HalfIdx2) {
17691 assert((Mask.size() == HalfMask.size() * 2) &&
17692        "Expected input mask to be twice as long as output");
17693
17694 // Exactly one half of the result must be undef to allow narrowing.
17695 bool UndefLower = isUndefLowerHalf(Mask);
17696 bool UndefUpper = isUndefUpperHalf(Mask);
17697 if (UndefLower == UndefUpper)
17698 return false;
17699
17700 unsigned HalfNumElts = HalfMask.size();
17701 unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0;
17702 HalfIdx1 = -1;
17703 HalfIdx2 = -1;
17704 for (unsigned i = 0; i != HalfNumElts; ++i) {
17705 int M = Mask[i + MaskIndexOffset];
17706 if (M < 0) {
17707 HalfMask[i] = M;
17708 continue;
17709 }
17710
17711 // Determine which of the 4 half vectors this element is from.
17712 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
17713 int HalfIdx = M / HalfNumElts;
17714
17715 // Determine the element index into its half vector source.
17716 int HalfElt = M % HalfNumElts;
17717
17718 // We can shuffle with up to 2 half vectors, set the new 'half'
17719 // shuffle mask accordingly.
17720 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
17721 HalfMask[i] = HalfElt;
17722 HalfIdx1 = HalfIdx;
17723 continue;
17724 }
17725 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
17726 HalfMask[i] = HalfElt + HalfNumElts;
17727 HalfIdx2 = HalfIdx;
17728 continue;
17729 }
17730
17731 // Too many half vectors referenced.
17732 return false;
17733 }
17734
17735 return true;
17736}
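The half-mask extraction above boils down to dividing each mask index by the half width. This standalone sketch (illustrative only, and omitting the bail-out when more than two halves are referenced) reproduces the index math for a v8i32 mask with an undef upper half:

// Standalone sketch of getHalfShuffleMask: when one half of the result is
// undef, the live half can be re-expressed as a half-width shuffle of at most
// two extracted halves (0 = V1.lo, 1 = V1.hi, 2 = V2.lo, 3 = V2.hi).
#include <cstdio>
#include <vector>

int main() {
  // v8i32 mask with an undef upper half, interleaving V1.lo and V2.hi.
  std::vector<int> Mask = {0, 12, 1, 13, -1, -1, -1, -1};
  int HalfNumElts = (int)Mask.size() / 2;
  std::vector<int> HalfMask(HalfNumElts, -1);
  int HalfIdx1 = -1, HalfIdx2 = -1;

  for (int i = 0; i != HalfNumElts; ++i) {
    int M = Mask[i]; // upper half is undef, so read the lower half of Mask
    if (M < 0)
      continue;
    int HalfIdx = M / HalfNumElts; // which of the 4 half vectors
    int HalfElt = M % HalfNumElts; // element index inside that half
    if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
      HalfMask[i] = HalfElt;
      HalfIdx1 = HalfIdx;
    } else if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
      HalfMask[i] = HalfElt + HalfNumElts;
      HalfIdx2 = HalfIdx;
    }
  }

  std::printf("HalfIdx1=%d HalfIdx2=%d HalfMask:", HalfIdx1, HalfIdx2);
  for (int M : HalfMask)
    std::printf(" %d", M); // HalfIdx1=0 HalfIdx2=3 HalfMask: 0 4 1 5
  std::printf("\n");
  return 0;
}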
17737
17738/// Given the output values from getHalfShuffleMask(), create a half width
17739/// shuffle of extracted vectors followed by an insert back to full width.
17740static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2,
17741 ArrayRef<int> HalfMask, int HalfIdx1,
17742 int HalfIdx2, bool UndefLower,
17743 SelectionDAG &DAG, bool UseConcat = false) {
17744 assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?");
17745 assert(V1.getValueType().isSimple() && "Expecting only simple types");
17746
17747 MVT VT = V1.getSimpleValueType();
17748 MVT HalfVT = VT.getHalfNumVectorElementsVT();
17749 unsigned HalfNumElts = HalfVT.getVectorNumElements();
17750
17751 auto getHalfVector = [&](int HalfIdx) {
17752 if (HalfIdx < 0)
17753 return DAG.getUNDEF(HalfVT);
17754 SDValue V = (HalfIdx < 2 ? V1 : V2);
17755 HalfIdx = (HalfIdx % 2) * HalfNumElts;
17756 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
17757 DAG.getIntPtrConstant(HalfIdx, DL));
17758 };
17759
17760 // ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset
17761 SDValue Half1 = getHalfVector(HalfIdx1);
17762 SDValue Half2 = getHalfVector(HalfIdx2);
17763 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
17764 if (UseConcat) {
17765 SDValue Op0 = V;
17766 SDValue Op1 = DAG.getUNDEF(HalfVT);
17767 if (UndefLower)
17768 std::swap(Op0, Op1);
17769 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Op0, Op1);
17770 }
17771
17772 unsigned Offset = UndefLower ? HalfNumElts : 0;
17773 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
17774 DAG.getIntPtrConstant(Offset, DL));
17775}
17776
17777/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
17778/// This allows for fast cases such as subvector extraction/insertion
17779/// or shuffling smaller vector types which can lower more efficiently.
17780static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
17781 SDValue V2, ArrayRef<int> Mask,
17782 const X86Subtarget &Subtarget,
17783 SelectionDAG &DAG) {
17784 assert((VT.is256BitVector() || VT.is512BitVector()) &&
17785        "Expected 256-bit or 512-bit vector");
17786
17787 bool UndefLower = isUndefLowerHalf(Mask);
17788 if (!UndefLower && !isUndefUpperHalf(Mask))
17789 return SDValue();
17790
17791 assert((!UndefLower || !isUndefUpperHalf(Mask)) &&
17792        "Completely undef shuffle mask should have been simplified already");
17793
17794 // Upper half is undef and lower half is whole upper subvector.
17795 // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
17796 MVT HalfVT = VT.getHalfNumVectorElementsVT();
17797 unsigned HalfNumElts = HalfVT.getVectorNumElements();
17798 if (!UndefLower &&
17799 isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
17800 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
17801 DAG.getIntPtrConstant(HalfNumElts, DL));
17802 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
17803 DAG.getIntPtrConstant(0, DL));
17804 }
17805
17806 // Lower half is undef and upper half is whole lower subvector.
17807 // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
17808 if (UndefLower &&
17809 isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
17810 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
17811 DAG.getIntPtrConstant(0, DL));
17812 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
17813 DAG.getIntPtrConstant(HalfNumElts, DL));
17814 }
17815
17816 int HalfIdx1, HalfIdx2;
17817 SmallVector<int, 8> HalfMask(HalfNumElts);
17818 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2))
17819 return SDValue();
17820
17821 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
17822
17823 // Only shuffle the halves of the inputs when useful.
17824 unsigned NumLowerHalves =
17825 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
17826 unsigned NumUpperHalves =
17827 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
17828 assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed");
17829
17830 // Determine the larger pattern of undef/halves, then decide if it's worth
17831 // splitting the shuffle based on subtarget capabilities and types.
17832 unsigned EltWidth = VT.getVectorElementType().getSizeInBits();
17833 if (!UndefLower) {
17834 // XXXXuuuu: no insert is needed.
17835 // Always extract lowers when setting lower - these are all free subreg ops.
17836 if (NumUpperHalves == 0)
17837 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
17838 UndefLower, DAG);
17839
17840 if (NumUpperHalves == 1) {
17841 // AVX2 has efficient 32/64-bit element cross-lane shuffles.
17842 if (Subtarget.hasAVX2()) {
17843 // extract128 + vunpckhps/vshufps is better than vblend + vpermps.
17844 if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
17845 !is128BitUnpackShuffleMask(HalfMask, DAG) &&
17846 (!isSingleSHUFPSMask(HalfMask) ||
17847 Subtarget.hasFastVariableCrossLaneShuffle()))
17848 return SDValue();
17849 // If this is a unary shuffle (assume that the 2nd operand is
17850 // canonicalized to undef), then we can use vpermpd. Otherwise, we
17851 // are better off extracting the upper half of 1 operand and using a
17852 // narrow shuffle.
17853 if (EltWidth == 64 && V2.isUndef())
17854 return SDValue();
17855 }
17856 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
17857 if (Subtarget.hasAVX512() && VT.is512BitVector())
17858 return SDValue();
17859 // Extract + narrow shuffle is better than the wide alternative.
17860 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
17861 UndefLower, DAG);
17862 }
17863
17864 // Don't extract both uppers, instead shuffle and then extract.
17865 assert(NumUpperHalves == 2 && "Half vector count went wrong");
17866 return SDValue();
17867 }
17868
17869 // UndefLower - uuuuXXXX: an insert to high half is required if we split this.
17870 if (NumUpperHalves == 0) {
17871 // AVX2 has efficient 64-bit element cross-lane shuffles.
17872 // TODO: Refine to account for unary shuffle, splat, and other masks?
17873 if (Subtarget.hasAVX2() && EltWidth == 64)
17874 return SDValue();
17875 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
17876 if (Subtarget.hasAVX512() && VT.is512BitVector())
17877 return SDValue();
17878 // Narrow shuffle + insert is better than the wide alternative.
17879 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
17880 UndefLower, DAG);
17881 }
17882
17883 // NumUpperHalves != 0: don't bother with extract, shuffle, and then insert.
17884 return SDValue();
17885}
17886
17887/// Handle case where shuffle sources are coming from the same 128-bit lane and
17888/// every lane can be represented as the same repeating mask - allowing us to
17889/// shuffle the sources with the repeating shuffle and then permute the result
17890/// to the destination lanes.
17891static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
17892 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
17893 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
17894 int NumElts = VT.getVectorNumElements();
17895 int NumLanes = VT.getSizeInBits() / 128;
17896 int NumLaneElts = NumElts / NumLanes;
17897
17898 // On AVX2 we may be able to just shuffle the lowest elements and then
17899 // broadcast the result.
17900 if (Subtarget.hasAVX2()) {
17901 for (unsigned BroadcastSize : {16, 32, 64}) {
17902 if (BroadcastSize <= VT.getScalarSizeInBits())
17903 continue;
17904 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
17905
17906 // Attempt to match a repeating pattern every NumBroadcastElts,
17907 // accounting for UNDEFs, but only referencing the lowest 128-bit
17908 // lane of the inputs.
17909 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
17910 for (int i = 0; i != NumElts; i += NumBroadcastElts)
17911 for (int j = 0; j != NumBroadcastElts; ++j) {
17912 int M = Mask[i + j];
17913 if (M < 0)
17914 continue;
17915 int &R = RepeatMask[j];
17916 if (0 != ((M % NumElts) / NumLaneElts))
17917 return false;
17918 if (0 <= R && R != M)
17919 return false;
17920 R = M;
17921 }
17922 return true;
17923 };
17924
17925 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
17926 if (!FindRepeatingBroadcastMask(RepeatMask))
17927 continue;
17928
17929 // Shuffle the (lowest) repeated elements in place for broadcast.
17930 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
17931
17932 // Shuffle the actual broadcast.
17933 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
17934 for (int i = 0; i != NumElts; i += NumBroadcastElts)
17935 for (int j = 0; j != NumBroadcastElts; ++j)
17936 BroadcastMask[i + j] = j;
17937 return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
17938 BroadcastMask);
17939 }
17940 }
17941
17942 // Bail if the shuffle mask doesn't cross 128-bit lanes.
17943 if (!is128BitLaneCrossingShuffleMask(VT, Mask))
17944 return SDValue();
17945
17946 // Bail if we already have a repeated lane shuffle mask.
17947 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
17948 return SDValue();
17949
17950 // Helper to look for a repeated mask in each split sublane, and check that
17951 // those sublanes can then be permuted into place.
17952 auto ShuffleSubLanes = [&](int SubLaneScale) {
17953 int NumSubLanes = NumLanes * SubLaneScale;
17954 int NumSubLaneElts = NumLaneElts / SubLaneScale;
17955
17956 // Check that all the sources are coming from the same lane and see if we
17957 // can form a repeating shuffle mask (local to each sub-lane). At the same
17958 // time, determine the source sub-lane for each destination sub-lane.
17959 int TopSrcSubLane = -1;
17960 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
17961 SmallVector<SmallVector<int, 8>> RepeatedSubLaneMasks(
17962 SubLaneScale,
17963 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef));
17964
17965 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
17966 // Extract the sub-lane mask, check that it all comes from the same lane
17967 // and normalize the mask entries to come from the first lane.
17968 int SrcLane = -1;
17969 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
17970 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
17971 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
17972 if (M < 0)
17973 continue;
17974 int Lane = (M % NumElts) / NumLaneElts;
17975 if ((0 <= SrcLane) && (SrcLane != Lane))
17976 return SDValue();
17977 SrcLane = Lane;
17978 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
17979 SubLaneMask[Elt] = LocalM;
17980 }
17981
17982 // Whole sub-lane is UNDEF.
17983 if (SrcLane < 0)
17984 continue;
17985
17986 // Attempt to match against the candidate repeated sub-lane masks.
17987 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
17988 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
17989 for (int i = 0; i != NumSubLaneElts; ++i) {
17990 if (M1[i] < 0 || M2[i] < 0)
17991 continue;
17992 if (M1[i] != M2[i])
17993 return false;
17994 }
17995 return true;
17996 };
17997
17998 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
17999 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
18000 continue;
18001
18002 // Merge the sub-lane mask into the matching repeated sub-lane mask.
18003 for (int i = 0; i != NumSubLaneElts; ++i) {
18004 int M = SubLaneMask[i];
18005 if (M < 0)
18006 continue;
18007 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
18008        "Unexpected mask element");
18009 RepeatedSubLaneMask[i] = M;
18010 }
18011
18012 // Track the topmost source sub-lane - by setting the remaining to
18013 // UNDEF we can greatly simplify shuffle matching.
18014 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
18015 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
18016 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
18017 break;
18018 }
18019
18020 // Bail if we failed to find a matching repeated sub-lane mask.
18021 if (Dst2SrcSubLanes[DstSubLane] < 0)
18022 return SDValue();
18023 }
18024 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
18025 "Unexpected source lane");
18026
18027 // Create a repeating shuffle mask for the entire vector.
18028 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
18029 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
18030 int Lane = SubLane / SubLaneScale;
18031 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
18032 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
18033 int M = RepeatedSubLaneMask[Elt];
18034 if (M < 0)
18035 continue;
18036 int Idx = (SubLane * NumSubLaneElts) + Elt;
18037 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
18038 }
18039 }
18040
18041 // Shuffle each source sub-lane to its destination.
18042 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
18043 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
18044 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
18045 if (SrcSubLane < 0)
18046 continue;
18047 for (int j = 0; j != NumSubLaneElts; ++j)
18048 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
18049 }
18050
18051 // Avoid returning the same shuffle operation.
18052 // v8i32 = vector_shuffle<0,1,4,5,2,3,6,7> t5, undef:v8i32
18053 if (RepeatedMask == Mask || SubLaneMask == Mask)
18054 return SDValue();
18055
18056 SDValue RepeatedShuffle =
18057 DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
18058
18059 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
18060 SubLaneMask);
18061 };
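// Worked example (assumed values, for illustration only): for MVT::v8f32 with
// SubLaneScale = 2 the lambda above uses NumSubLanes = 4 and NumSubLaneElts = 2.
// A mask element M = 13 (element 5 of V2) normalizes to
//   SrcLane = (13 % 8) / 4 = 1
//   LocalM  = (13 % 4) + 8 = 9
// so every repeated sub-lane mask refers only to the first lane of each
// operand, while Dst2SrcSubLanes records which source sub-lane feeds each
// destination sub-lane.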
18062
18063 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
18064 // (with PERMQ/PERMPD). On AVX2/AVX512BW targets, permuting 32-bit sub-lanes,
18065 // even with a variable shuffle, can be worth it for v32i8/v64i8 vectors.
18066 // Otherwise we can only permute whole 128-bit lanes.
18067 int MinSubLaneScale = 1, MaxSubLaneScale = 1;
18068 if (Subtarget.hasAVX2() && VT.is256BitVector()) {
18069 bool OnlyLowestElts = isUndefOrInRange(Mask, 0, NumLaneElts);
18070 MinSubLaneScale = 2;
18071 MaxSubLaneScale =
18072 (!OnlyLowestElts && V2.isUndef() && VT == MVT::v32i8) ? 4 : 2;
18073 }
18074 if (Subtarget.hasBWI() && VT == MVT::v64i8)
18075 MinSubLaneScale = MaxSubLaneScale = 4;
18076
18077 for (int Scale = MinSubLaneScale; Scale <= MaxSubLaneScale; Scale *= 2)
18078 if (SDValue Shuffle = ShuffleSubLanes(Scale))
18079 return Shuffle;
18080
18081 return SDValue();
18082}
18083
18084static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
18085 bool &ForceV1Zero, bool &ForceV2Zero,
18086 unsigned &ShuffleImm, ArrayRef<int> Mask,
18087 const APInt &Zeroable) {
18088 int NumElts = VT.getVectorNumElements();
18089 assert(VT.getScalarSizeInBits() == 64 &&
18090 (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
18091 "Unexpected data type for VSHUFPD");
18092 assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) &&
18093 "Illegal shuffle mask");
18094
18095 bool ZeroLane[2] = { true, true };
18096 for (int i = 0; i < NumElts; ++i)
18097 ZeroLane[i & 1] &= Zeroable[i];
18098
18099 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
18100 // Mask for V4F64: 0/1, 4/5, 2/3, 6/7..
18101 ShuffleImm = 0;
18102 bool ShufpdMask = true;
18103 bool CommutableMask = true;
18104 for (int i = 0; i < NumElts; ++i) {
18105 if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1])
18106 continue;
18107 if (Mask[i] < 0)
18108 return false;
18109 int Val = (i & 6) + NumElts * (i & 1);
18110 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
18111 if (Mask[i] < Val || Mask[i] > Val + 1)
18112 ShufpdMask = false;
18113 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
18114 CommutableMask = false;
18115 ShuffleImm |= (Mask[i] % 2) << i;
18116 }
18117
18118 if (!ShufpdMask && !CommutableMask)
18119 return false;
18120
18121 if (!ShufpdMask && CommutableMask)
18122 std::swap(V1, V2);
18123
18124 ForceV1Zero = ZeroLane[0];
18125 ForceV2Zero = ZeroLane[1];
18126 return true;
18127}
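For reference, a minimal standalone sketch (not LLVM code) of how the SHUFPD immediate falls out of the loop above, assuming NumElts = 4 (v4f64), a hypothetical mask {1, 4, 3, 7}, and no zeroable lanes:

#include <cassert>

int main() {
  const int NumElts = 4;
  const int Mask[4] = {1, 4, 3, 7};
  unsigned ShuffleImm = 0;
  bool ShufpdMask = true;
  for (int i = 0; i < NumElts; ++i) {
    // Expected base index per destination slot: 0, 4, 2, 6 (each slot may take
    // the base index or the one above it).
    int Val = (i & 6) + NumElts * (i & 1);
    if (Mask[i] < Val || Mask[i] > Val + 1)
      ShufpdMask = false;
    // Bit i is set when slot i takes the odd element of its source pair.
    ShuffleImm |= (Mask[i] % 2) << i;
  }
  assert(ShufpdMask && ShuffleImm == 0xD); // 0b1101
  return 0;
}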
18128
18129static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1,
18130 SDValue V2, ArrayRef<int> Mask,
18131 const APInt &Zeroable,
18132 const X86Subtarget &Subtarget,
18133 SelectionDAG &DAG) {
18134 assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
18135 "Unexpected data type for VSHUFPD");
18136
18137 unsigned Immediate = 0;
18138 bool ForceV1Zero = false, ForceV2Zero = false;
18139 if (!matchShuffleWithSHUFPD(VT, V1, V2, ForceV1Zero, ForceV2Zero, Immediate,
18140 Mask, Zeroable))
18141 return SDValue();
18142
18143 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
18144 if (ForceV1Zero)
18145 V1 = getZeroVector(VT, Subtarget, DAG, DL);
18146 if (ForceV2Zero)
18147 V2 = getZeroVector(VT, Subtarget, DAG, DL);
18148
18149 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
18150 DAG.getTargetConstant(Immediate, DL, MVT::i8));
18151}
18152
18153// Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
18154// by zeroable elements in the remaining 24 elements. Turn this into two
18155// vmovqb instructions shuffled together.
18156static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT,
18157 SDValue V1, SDValue V2,
18158 ArrayRef<int> Mask,
18159 const APInt &Zeroable,
18160 SelectionDAG &DAG) {
18161 assert(VT == MVT::v32i8 && "Unexpected type!");
18162
18163 // The first 8 indices should be every 8th element.
18164 if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8))
18165 return SDValue();
18166
18167 // Remaining elements need to be zeroable.
18168 if (Zeroable.countl_one() < (Mask.size() - 8))
18169 return SDValue();
18170
18171 V1 = DAG.getBitcast(MVT::v4i64, V1);
18172 V2 = DAG.getBitcast(MVT::v4i64, V2);
18173
18174 V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1);
18175 V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2);
18176
18177 // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in
18178 // the upper bits of the result using an unpckldq.
18179 SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2,
18180 { 0, 1, 2, 3, 16, 17, 18, 19,
18181 4, 5, 6, 7, 20, 21, 22, 23 });
18182 // Insert the unpckldq into a zero vector to widen to v32i8.
18183 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8,
18184 DAG.getConstant(0, DL, MVT::v32i8), Unpack,
18185 DAG.getIntPtrConstant(0, DL));
18186}
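A comment-form walkthrough of the lowering above, under an assumed input that matches the pattern (values chosen purely for illustration):

// Input: a v32i8 shuffle whose first 8 mask entries are
//   {0, 8, 16, 24, 32, 40, 48, 56} and whose remaining 24 lanes are zeroable.
//  - VTRUNC of V1 (viewed as v4i64) yields {V1[0], V1[8], V1[16], V1[24],
//    0, ..., 0} as a v16i8; the same applies to V2.
//  - The {0,1,2,3, 16,17,18,19, 4,5,6,7, 20,21,22,23} shuffle interleaves the
//    two truncations in 4-byte chunks (an unpckldq), so the low 8 bytes are
//    the requested elements and the rest are the zeros produced by VTRUNC.
//  - Inserting into a zero v32i8 widens the result, matching the zeroable
//    upper 24 lanes.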
18187
18188// a = shuffle v1, v2, mask1 ; interleaving lower lanes of v1 and v2
18189// b = shuffle v1, v2, mask2 ; interleaving higher lanes of v1 and v2
18190// =>
18191// ul = unpckl v1, v2
18192// uh = unpckh v1, v2
18193// a = vperm ul, uh
18194// b = vperm ul, uh
18195//
18196// Pattern-match interleave(256b v1, 256b v2) -> 512b v3 and lower it into unpck
18197// and permute. We cannot directly match v3 because it is split into two
18198// 256-bit vectors in earlier isel stages. Therefore, this function matches a
18199// pair of 256-bit shuffles and makes sure the masks are consecutive.
18200//
18201// Once unpck and permute nodes are created, the permute corresponding to this
18202// shuffle is returned, while the other permute replaces the other half of the
18203// shuffle in the selection dag.
18204static SDValue lowerShufflePairAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
18205 SDValue V1, SDValue V2,
18206 ArrayRef<int> Mask,
18207 SelectionDAG &DAG) {
18208 if (VT != MVT::v8f32 && VT != MVT::v8i32 && VT != MVT::v16i16 &&
18209 VT != MVT::v32i8)
18210 return SDValue();
18211 // <B0, B1, B0+1, B1+1, ..., >
18212 auto IsInterleavingPattern = [&](ArrayRef<int> Mask, unsigned Begin0,
18213 unsigned Begin1) {
18214 size_t Size = Mask.size();
18215 assert(Size % 2 == 0 && "Expected even mask size");
18216 for (unsigned I = 0; I < Size; I += 2) {
18217 if (Mask[I] != (int)(Begin0 + I / 2) ||
18218 Mask[I + 1] != (int)(Begin1 + I / 2))
18219 return false;
18220 }
18221 return true;
18222 };
18223 // Check which half of the interleave this shuffle node produces.
18224 int NumElts = VT.getVectorNumElements();
18225 size_t FirstQtr = NumElts / 2;
18226 size_t ThirdQtr = NumElts + NumElts / 2;
18227 bool IsFirstHalf = IsInterleavingPattern(Mask, 0, NumElts);
18228 bool IsSecondHalf = IsInterleavingPattern(Mask, FirstQtr, ThirdQtr);
18229 if (!IsFirstHalf && !IsSecondHalf)
18230 return SDValue();
18231
18232 // Find the intersection between shuffle users of V1 and V2.
18233 SmallVector<SDNode *, 2> Shuffles;
18234 for (SDNode *User : V1->uses())
18235 if (User->getOpcode() == ISD::VECTOR_SHUFFLE && User->getOperand(0) == V1 &&
18236 User->getOperand(1) == V2)
18237 Shuffles.push_back(User);
18238 // Limit user size to two for now.
18239 if (Shuffles.size() != 2)
18240 return SDValue();
18241 // Find out which half of the 512-bit shuffle each smaller shuffle corresponds to.
18242 auto *SVN1 = cast<ShuffleVectorSDNode>(Shuffles[0]);
18243 auto *SVN2 = cast<ShuffleVectorSDNode>(Shuffles[1]);
18244 SDNode *FirstHalf;
18245 SDNode *SecondHalf;
18246 if (IsInterleavingPattern(SVN1->getMask(), 0, NumElts) &&
18247 IsInterleavingPattern(SVN2->getMask(), FirstQtr, ThirdQtr)) {
18248 FirstHalf = Shuffles[0];
18249 SecondHalf = Shuffles[1];
18250 } else if (IsInterleavingPattern(SVN1->getMask(), FirstQtr, ThirdQtr) &&
18251 IsInterleavingPattern(SVN2->getMask(), 0, NumElts)) {
18252 FirstHalf = Shuffles[1];
18253 SecondHalf = Shuffles[0];
18254 } else {
18255 return SDValue();
18256 }
18257 // Lower into unpck and perm. Return the perm of this shuffle and replace
18258 // the other.
18259 SDValue Unpckl = DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
18260 SDValue Unpckh = DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
18261 SDValue Perm1 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
18262 DAG.getTargetConstant(0x20, DL, MVT::i8));
18263 SDValue Perm2 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
18264 DAG.getTargetConstant(0x31, DL, MVT::i8));
18265 if (IsFirstHalf) {
18266 DAG.ReplaceAllUsesWith(SecondHalf, &Perm2);
18267 return Perm1;
18268 }
18269 DAG.ReplaceAllUsesWith(FirstHalf, &Perm1);
18270 return Perm2;
18271}
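A minimal standalone sketch (not LLVM code) of the pair of half-masks this routine matches, assuming VT = MVT::v8i32; the helper mirrors IsInterleavingPattern above:

#include <cassert>
#include <vector>

static bool isInterleaving(const std::vector<int> &Mask, int Begin0, int Begin1) {
  for (size_t I = 0; I < Mask.size(); I += 2)
    if (Mask[I] != (int)(Begin0 + I / 2) || Mask[I + 1] != (int)(Begin1 + I / 2))
      return false;
  return true;
}

int main() {
  // First/second half of interleave(v1, v2) for NumElts = 8.
  std::vector<int> FirstHalf = {0, 8, 1, 9, 2, 10, 3, 11};
  std::vector<int> SecondHalf = {4, 12, 5, 13, 6, 14, 7, 15};
  assert(isInterleaving(FirstHalf, 0, 8));   // Begin0 = 0,           Begin1 = NumElts
  assert(isInterleaving(SecondHalf, 4, 12)); // Begin0 = NumElts / 2, Begin1 = NumElts + NumElts / 2
  return 0;
}

The VPERM2X128 immediates 0x20 and 0x31 used above then stitch the low and high 128-bit halves of the two unpack results back into these two ymm outputs.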
18272
18273/// Handle lowering of 4-lane 64-bit floating point shuffles.
18274///
18275/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
18276/// isn't available.
18277static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18278 const APInt &Zeroable, SDValue V1, SDValue V2,
18279 const X86Subtarget &Subtarget,
18280 SelectionDAG &DAG) {
18281 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
18282 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
18283 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
18284
18285 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
18286 Subtarget, DAG))
18287 return V;
18288
18289 if (V2.isUndef()) {
18290 // Check for being able to broadcast a single element.
18291 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2,
18292 Mask, Subtarget, DAG))
18293 return Broadcast;
18294
18295 // Use low duplicate instructions for masks that match their pattern.
18296 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
18297 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
18298
18299 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
18300 // Non-half-crossing single input shuffles can be lowered with an
18301 // interleaved permutation.
18302 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
18303 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
18304 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
18305 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
18306 }
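// Worked example for the VPERMILPI immediate just above (assumed mask, for
// illustration): Mask = {1, 0, 3, 2} sets
//   bit 0 = (Mask[0] == 1) = 1, bit 1 = (Mask[1] == 1) = 0,
//   bit 2 = (Mask[2] == 3) = 1, bit 3 = (Mask[3] == 3) = 0,
// giving VPERMILPMask = 0b0101, which swaps the two doubles in each 128-bit
// lane.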
18307
18308 // With AVX2 we have direct support for this permutation.
18309 if (Subtarget.hasAVX2())
18310 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
18311 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
18312
18313 // Try to create an in-lane repeating shuffle mask and then shuffle the
18314 // results into the target lanes.
18315 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18316 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
18317 return V;
18318
18319 // Try to permute the lanes and then use a per-lane permute.
18320 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
18321 Mask, DAG, Subtarget))
18322 return V;
18323
18324 // Otherwise, fall back.
18325 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask,
18326 DAG, Subtarget);
18327 }
18328
18329 // Use dedicated unpack instructions for masks that match their pattern.
18330 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
18331 return V;
18332
18333 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
18334 Zeroable, Subtarget, DAG))
18335 return Blend;
18336
18337 // Check if the blend happens to exactly fit that of SHUFPD.
18338 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask,
18339 Zeroable, Subtarget, DAG))
18340 return Op;
18341
18342 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
18343 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
18344
18345 // If we have lane crossing shuffles AND they don't all come from the lower
18346 // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
18347 // TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently
18348 // canonicalizes to a blend of splats, which isn't necessary for this combine.
18349 if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
18350 !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
18351 (V1.getOpcode() != ISD::BUILD_VECTOR) &&
18352 (V2.getOpcode() != ISD::BUILD_VECTOR))
18353 return lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2, Mask, DAG);
18354
18355 // If we have one input in place, then we can permute the other input and
18356 // blend the result.
18357 if (V1IsInPlace || V2IsInPlace)
18358 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
18359 Subtarget, DAG);
18360
18361 // Try to create an in-lane repeating shuffle mask and then shuffle the
18362 // results into the target lanes.
18363 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18364 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
18365 return V;
18366
18367 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18368 // shuffle. However, if we have AVX2 and either input is already in place,
18369 // we will be able to shuffle the other input even across lanes in a single
18370 // instruction, so skip this pattern.
18371 if (!(Subtarget.hasAVX2() && (V1IsInPlace || V2IsInPlace)))
18372 if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask(
18373 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
18374 return V;
18375
18376 // If we have VLX support, we can use VEXPAND.
18377 if (Subtarget.hasVLX())
18378 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask, V1, V2,
18379 DAG, Subtarget))
18380 return V;
18381
18382 // If we have AVX2 then we always want to lower with a blend because at v4 we
18383 // can fully permute the elements.
18384 if (Subtarget.hasAVX2())
18385 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
18386 Subtarget, DAG);
18387
18388 // Otherwise fall back on generic lowering.
18389 return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask,
18390 Subtarget, DAG);
18391}
18392
18393/// Handle lowering of 4-lane 64-bit integer shuffles.
18394///
18395/// This routine is only called when we have AVX2 and thus a reasonable
18396/// instruction set for v4i64 shuffling.
18397static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18398 const APInt &Zeroable, SDValue V1, SDValue V2,
18399 const X86Subtarget &Subtarget,
18400 SelectionDAG &DAG) {
18401 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
18402 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
18403 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
18404 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
18405
18406 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
18407 Subtarget, DAG))
18408 return V;
18409
18410 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
18411 Zeroable, Subtarget, DAG))
18412 return Blend;
18413
18414 // Check for being able to broadcast a single element.
18415 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,
18416 Subtarget, DAG))
18417 return Broadcast;
18418
18419 // Try to use shift instructions if fast.
18420 if (Subtarget.preferLowerShuffleAsShift())
18421 if (SDValue Shift =
18422 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
18423 Subtarget, DAG, /*BitwiseOnly*/ true))
18424 return Shift;
18425
18426 if (V2.isUndef()) {
18427 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
18428 // can use lower latency instructions that will operate on both lanes.
18429 SmallVector<int, 2> RepeatedMask;
18430 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
18431 SmallVector<int, 4> PSHUFDMask;
18432 narrowShuffleMaskElts(2, RepeatedMask, PSHUFDMask);
18433 return DAG.getBitcast(
18434 MVT::v4i64,
18435 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
18436 DAG.getBitcast(MVT::v8i32, V1),
18437 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
18438 }
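// Worked example for the PSHUFD path just above (assumed mask, for
// illustration): Mask = {1, 0, 3, 2} repeats {1, 0} in both 128-bit lanes;
// narrowing to 32-bit elements turns each i64 index M into {2*M, 2*M+1},
// giving PSHUFDMask = {2, 3, 0, 1} (immediate 0x4E), which swaps the two
// halves of each lane without needing a cross-lane VPERMQ.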
18439
18440 // AVX2 provides a direct instruction for permuting a single input across
18441 // lanes.
18442 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
18443 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
18444 }
18445
18446 // Try to use shift instructions.
18447 if (SDValue Shift =
18448 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable, Subtarget,
18449 DAG, /*BitwiseOnly*/ false))
18450 return Shift;
18451
18452 // If we have VLX support, we can use VALIGN or VEXPAND.
18453 if (Subtarget.hasVLX()) {
18454 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i64, V1, V2, Mask,
18455 Subtarget, DAG))
18456 return Rotate;
18457
18458 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask, V1, V2,
18459 DAG, Subtarget))
18460 return V;
18461 }
18462
18463 // Try to use PALIGNR.
18464 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask,
18465 Subtarget, DAG))
18466 return Rotate;
18467
18468 // Use dedicated unpack instructions for masks that match their pattern.
18469 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
18470 return V;
18471
18472 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
18473 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
18474
18475 // If we have one input in place, then we can permute the other input and
18476 // blend the result.
18477 if (V1IsInPlace || V2IsInPlace)
18478 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
18479 Subtarget, DAG);
18480
18481 // Try to create an in-lane repeating shuffle mask and then shuffle the
18482 // results into the target lanes.
18483 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18484 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
18485 return V;
18486
18487 // Try to lower to PERMQ(BLENDD(V1,V2)).
18488 if (SDValue V =
18489 lowerShuffleAsBlendAndPermute(DL, MVT::v4i64, V1, V2, Mask, DAG))
18490 return V;
18491
18492 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18493 // shuffle. However, if we have AVX2 and either input is already in place,
18494 // we will be able to shuffle the other input even across lanes in a single
18495 // instruction, so skip this pattern.
18496 if (!V1IsInPlace && !V2IsInPlace)
18497 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
18498 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
18499 return Result;
18500
18501 // Otherwise fall back on generic blend lowering.
18502 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
18503 Subtarget, DAG);
18504}
18505
18506/// Handle lowering of 8-lane 32-bit floating point shuffles.
18507///
18508/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
18509/// isn't available.
18510static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18511 const APInt &Zeroable, SDValue V1, SDValue V2,
18512 const X86Subtarget &Subtarget,
18513 SelectionDAG &DAG) {
18514 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
18515 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
18516 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
18517
18518 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
18519 Zeroable, Subtarget, DAG))
18520 return Blend;
18521
18522 // Check for being able to broadcast a single element.
18523 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,
18524 Subtarget, DAG))
18525 return Broadcast;
18526
18527 if (!Subtarget.hasAVX2()) {
18528 SmallVector<int> InLaneMask;
18529 computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
18530
18531 if (!is128BitLaneRepeatedShuffleMask(MVT::v8f32, InLaneMask))
18532 if (SDValue R = splitAndLowerShuffle(DL, MVT::v8f32, V1, V2, Mask, DAG,
18533 /*SimpleOnly*/ true))
18534 return R;
18535 }
18536 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
18537 Zeroable, Subtarget, DAG))
18538 return DAG.getBitcast(MVT::v8f32, ZExt);
18539
18540 // If the shuffle mask is repeated in each 128-bit lane, we have many more
18541 // options to efficiently lower the shuffle.
18542 SmallVector<int, 4> RepeatedMask;
18543 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
18544 assert(RepeatedMask.size() == 4 &&
18545 "Repeated masks must be half the mask width!");
18546
18547 // Use even/odd duplicate instructions for masks that match their pattern.
18548 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
18549 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
18550 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
18551 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
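// Worked example (assumed masks, for illustration): RepeatedMask {0, 0, 2, 2}
// corresponds to the full v8f32 mask {0, 0, 2, 2, 4, 4, 6, 6}, which MOVSLDUP
// implements by duplicating the even-index floats; {1, 1, 3, 3} likewise maps
// to MOVSHDUP for the odd-index floats. Both read only V1.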
18552
18553 if (V2.isUndef())
18554 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
18555 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
18556
18557 // Use dedicated unpack instructions for masks that match their pattern.
18558 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
18559 return V;
18560
18561 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
18562 // have already handled any direct blends.
18563 return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
18564 }
18565
18566 // Try to create an in-lane repeating shuffle mask and then shuffle the
18567 // results into the target lanes.
18568 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18569 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
18570 return V;
18571
18572 // If we have a single input shuffle with different shuffle patterns in the
18573 // two 128-bit lanes use the variable mask to VPERMILPS.
18574 if (V2.isUndef()) {
18575 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) {
18576 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
18577 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
18578 }
18579 if (Subtarget.hasAVX2()) {
18580 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
18581 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
18582 }
18583 // Otherwise, fall back.
18584 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask,
18585 DAG, Subtarget);
18586 }
18587
18588 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18589 // shuffle.
18590 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
18591 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
18592 return Result;
18593
18594 // If we have VLX support, we can use VEXPAND.
18595 if (Subtarget.hasVLX())
18596 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask, V1, V2,
18597 DAG, Subtarget))
18598 return V;
18599
18600 // Try to match an interleave of two v8f32s and lower them as unpck and
18601 // permutes using ymms. This needs to go before we try to split the vectors.
18602 //
18603 // TODO: Expand this to AVX1. Currently v8i32 is casted to v8f32 and hits
18604 // this path inadvertently.
18605 if (Subtarget.hasAVX2() && !Subtarget.hasAVX512())
18606 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8f32, V1, V2,
18607 Mask, DAG))
18608 return V;
18609
18610 // For non-AVX512, if the mask is of 16-bit elements within each lane then try
18611 // to split, since after the split we get more efficient code using vpunpcklwd
18612 // and vpunpckhwd instructions than with vblend.
18613 if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32, DAG))
18614 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Subtarget,
18615 DAG);
18616
18617 // If we have AVX2 then we always want to lower with a blend because at v8 we
18618 // can fully permute the elements.
18619 if (Subtarget.hasAVX2())
18620 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8f32, V1, V2, Mask,
18621 Subtarget, DAG);
18622
18623 // Otherwise fall back on generic lowering.
18624 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask,
18625 Subtarget, DAG);
18626}
18627
18628/// Handle lowering of 8-lane 32-bit integer shuffles.
18629///
18630/// This routine is only called when we have AVX2 and thus a reasonable
18631/// instruction set for v8i32 shuffling.
18632static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18633 const APInt &Zeroable, SDValue V1, SDValue V2,
18634 const X86Subtarget &Subtarget,
18635 SelectionDAG &DAG) {
18636 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
18637 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
18638 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
18639 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
18640
18641 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
18642
18643 // Whenever we can lower this as a zext, that instruction is strictly faster
18644 // than any alternative. It also allows us to fold memory operands into the
18645 // shuffle in many cases.
18646 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
18647 Zeroable, Subtarget, DAG))
18648 return ZExt;
18649
18650 // Try to match an interleave of two v8i32s and lower them as unpck and
18651 // permutes using ymms. This needs to go before we try to split the vectors.
18652 if (!Subtarget.hasAVX512())
18653 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8i32, V1, V2,
18654 Mask, DAG))
18655 return V;
18656
18657 // For non-AVX512, if the mask is of 16-bit elements within each lane then try
18658 // to split, since after the split we get more efficient code than with vblend
18659 // by using the vpunpcklwd and vpunpckhwd instructions.
18660 if (isUnpackWdShuffleMask(Mask, MVT::v8i32, DAG) && !V2.isUndef() &&
18661 !Subtarget.hasAVX512())
18662 return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Subtarget,
18663 DAG);
18664
18665 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
18666 Zeroable, Subtarget, DAG))
18667 return Blend;
18668
18669 // Check for being able to broadcast a single element.
18670 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask,
18671 Subtarget, DAG))
18672 return Broadcast;
18673
18674 // Try to use shift instructions if fast.
18675 if (Subtarget.preferLowerShuffleAsShift()) {
18676 if (SDValue Shift =
18677 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable,
18678 Subtarget, DAG, /*BitwiseOnly*/ true))
18679 return Shift;
18680 if (NumV2Elements == 0)
18681 if (SDValue Rotate =
18682 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
18683 return Rotate;
18684 }
18685
18686 // If the shuffle mask is repeated in each 128-bit lane we can use more
18687 // efficient instructions that mirror the shuffles across the two 128-bit
18688 // lanes.
18689 SmallVector<int, 4> RepeatedMask;
18690 bool Is128BitLaneRepeatedShuffle =
18691 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
18692 if (Is128BitLaneRepeatedShuffle) {
18693 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
18694 if (V2.isUndef())
18695 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
18696 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
18697
18698 // Use dedicated unpack instructions for masks that match their pattern.
18699 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
18700 return V;
18701 }
18702
18703 // Try to use shift instructions.
18704 if (SDValue Shift =
18705 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget,
18706 DAG, /*BitwiseOnly*/ false))
18707 return Shift;
18708
18709 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements == 0)
18710 if (SDValue Rotate =
18711 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
18712 return Rotate;
18713
18714 // If we have VLX support, we can use VALIGN or EXPAND.
18715 if (Subtarget.hasVLX()) {
18716 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i32, V1, V2, Mask,
18717 Subtarget, DAG))
18718 return Rotate;
18719
18720 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask, V1, V2,
18721 DAG, Subtarget))
18722 return V;
18723 }
18724
18725 // Try to use byte rotation instructions.
18726 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask,
18727 Subtarget, DAG))
18728 return Rotate;
18729
18730 // Try to create an in-lane repeating shuffle mask and then shuffle the
18731 // results into the target lanes.
18732 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18733 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
18734 return V;
18735
18736 if (V2.isUndef()) {
18737 // Try to produce a fixed cross-128-bit lane permute followed by unpack
18738 // because that should be faster than the variable permute alternatives.
18739 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v8i32, Mask, V1, V2, DAG))
18740 return V;
18741
18742 // If the shuffle patterns aren't repeated but it's a single input, directly
18743 // generate a cross-lane VPERMD instruction.
18744 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
18745 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
18746 }
18747
18748 // Assume that a single SHUFPS is faster than an alternative sequence of
18749 // multiple instructions (even if the CPU has a domain penalty).
18750 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
18751 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
18752 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
18753 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
18754 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
18755 CastV1, CastV2, DAG);
18756 return DAG.getBitcast(MVT::v8i32, ShufPS);
18757 }
18758
18759 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18760 // shuffle.
18761 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
18762 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
18763 return Result;
18764
18765 // Otherwise fall back on generic blend lowering.
18766 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i32, V1, V2, Mask,
18767 Subtarget, DAG);
18768}
18769
18770/// Handle lowering of 16-lane 16-bit integer shuffles.
18771///
18772/// This routine is only called when we have AVX2 and thus a reasonable
18773/// instruction set for v16i16 shuffling.
18774static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18775 const APInt &Zeroable, SDValue V1, SDValue V2,
18776 const X86Subtarget &Subtarget,
18777 SelectionDAG &DAG) {
18778 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
18779 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
18780 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
18781 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
18782
18783 // Whenever we can lower this as a zext, that instruction is strictly faster
18784 // than any alternative. It also allows us to fold memory operands into the
18785 // shuffle in many cases.
18786 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
18787 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
18788 return ZExt;
18789
18790 // Check for being able to broadcast a single element.
18791 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask,
18792 Subtarget, DAG))
18793 return Broadcast;
18794
18795 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
18796 Zeroable, Subtarget, DAG))
18797 return Blend;
18798
18799 // Use dedicated unpack instructions for masks that match their pattern.
18800 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
18801 return V;
18802
18803 // Use dedicated pack instructions for masks that match their pattern.
18804 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG,
18805 Subtarget))
18806 return V;
18807
18808 // Try to lower using a truncation.
18809 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
18810 Subtarget, DAG))
18811 return V;
18812
18813 // Try to use shift instructions.
18814 if (SDValue Shift =
18815 lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
18816 Subtarget, DAG, /*BitwiseOnly*/ false))
18817 return Shift;
18818
18819 // Try to use byte rotation instructions.
18820 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask,
18821 Subtarget, DAG))
18822 return Rotate;
18823
18824 // Try to create an in-lane repeating shuffle mask and then shuffle the
18825 // results into the target lanes.
18826 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18827 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
18828 return V;
18829
18830 if (V2.isUndef()) {
18831 // Try to use bit rotation instructions.
18832 if (SDValue Rotate =
18833 lowerShuffleAsBitRotate(DL, MVT::v16i16, V1, Mask, Subtarget, DAG))
18834 return Rotate;
18835
18836 // Try to produce a fixed cross-128-bit lane permute followed by unpack
18837 // because that should be faster than the variable permute alternatives.
18838 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, Mask, V1, V2, DAG))
18839 return V;
18840
18841 // There are no generalized cross-lane shuffle operations available on i16
18842 // element types.
18843 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
18844 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
18845 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
18846 return V;
18847
18848 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v16i16, V1, V2, Mask,
18849 DAG, Subtarget);
18850 }
18851
18852 SmallVector<int, 8> RepeatedMask;
18853 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
18854 // As this is a single-input shuffle, the repeated mask should be
18855 // a strictly valid v8i16 mask that we can pass through to the v8i16
18856 // lowering to handle even the v16 case.
18857 return lowerV8I16GeneralSingleInputShuffle(
18858 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
18859 }
18860 }
18861
18862 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2,
18863 Zeroable, Subtarget, DAG))
18864 return PSHUFB;
18865
18866 // AVX512BW can lower to VPERMW (non-VLX will pad to v32i16).
18867 if (Subtarget.hasBWI())
18868 return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, Subtarget, DAG);
18869
18870 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18871 // shuffle.
18872 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
18873 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
18874 return Result;
18875
18876 // Try to permute the lanes and then use a per-lane permute.
18877 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
18878 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
18879 return V;
18880
18881 // Try to match an interleave of two v16i16s and lower them as unpck and
18882 // permutes using ymms.
18883 if (!Subtarget.hasAVX512())
18884 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v16i16, V1, V2,
18885 Mask, DAG))
18886 return V;
18887
18888 // Otherwise fall back on generic lowering.
18889 return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask,
18890 Subtarget, DAG);
18891}
18892
18893/// Handle lowering of 32-lane 8-bit integer shuffles.
18894///
18895/// This routine is only called when we have AVX2 and thus a reasonable
18896/// instruction set for v32i8 shuffling.
18897static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18898 const APInt &Zeroable, SDValue V1, SDValue V2,
18899 const X86Subtarget &Subtarget,
18900 SelectionDAG &DAG) {
18901 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
18902 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
18903 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
18904 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
18905
18906 // Whenever we can lower this as a zext, that instruction is strictly faster
18907 // than any alternative. It also allows us to fold memory operands into the
18908 // shuffle in many cases.
18909 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask,
18910 Zeroable, Subtarget, DAG))
18911 return ZExt;
18912
18913 // Check for being able to broadcast a single element.
18914 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask,
18915 Subtarget, DAG))
18916 return Broadcast;
18917
18918 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
18919 Zeroable, Subtarget, DAG))
18920 return Blend;
18921
18922 // Use dedicated unpack instructions for masks that match their pattern.
18923 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
18924 return V;
18925
18926 // Use dedicated pack instructions for masks that match their pattern.
18927 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG,
18928 Subtarget))
18929 return V;
18930
18931 // Try to lower using a truncation.
18932 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
18933 Subtarget, DAG))
18934 return V;
18935
18936 // Try to use shift instructions.
18937 if (SDValue Shift =
18938 lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget,
18939 DAG, /*BitwiseOnly*/ false))
18940 return Shift;
18941
18942 // Try to use byte rotation instructions.
18943 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask,
18944 Subtarget, DAG))
18945 return Rotate;
18946
18947 // Try to use bit rotation instructions.
18948 if (V2.isUndef())
18949 if (SDValue Rotate =
18950 lowerShuffleAsBitRotate(DL, MVT::v32i8, V1, Mask, Subtarget, DAG))
18951 return Rotate;
18952
18953 // Try to create an in-lane repeating shuffle mask and then shuffle the
18954 // results into the target lanes.
18955 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18956 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
18957 return V;
18958
18959 // There are no generalized cross-lane shuffle operations available on i8
18960 // element types.
18961 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
18962 // Try to produce a fixed cross-128-bit lane permute followed by unpack
18963 // because that should be faster than the variable permute alternatives.
18964 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, Mask, V1, V2, DAG))
18965 return V;
18966
18967 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
18968 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
18969 return V;
18970
18971 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask,
18972 DAG, Subtarget);
18973 }
18974
18975 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2,
18976 Zeroable, Subtarget, DAG))
18977 return PSHUFB;
18978
18979 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
18980 if (Subtarget.hasVBMI())
18981 return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, Subtarget, DAG);
18982
18983 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18984 // shuffle.
18985 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
18986 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
18987 return Result;
18988
18989 // Try to permute the lanes and then use a per-lane permute.
18990 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
18991 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
18992 return V;
18993
18994 // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
18995 // by zeroable elements in the remaining 24 elements. Turn this into two
18996 // vmovqb instructions shuffled together.
18997 if (Subtarget.hasVLX())
18998 if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2,
18999 Mask, Zeroable, DAG))
19000 return V;
19001
19002 // Try to match an interleave of two v32i8s and lower them as unpck and
19003 // permutes using ymms.
19004 if (!Subtarget.hasAVX512())
19005 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v32i8, V1, V2,
19006 Mask, DAG))
19007 return V;
19008
19009 // Otherwise fall back on generic lowering.
19010 return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask,
19011 Subtarget, DAG);
19012}
19013
19014/// High-level routine to lower various 256-bit x86 vector shuffles.
19015///
19016/// This routine either breaks down the specific type of a 256-bit x86 vector
19017/// shuffle or splits it into two 128-bit shuffles and fuses the results back
19018/// together based on the available instructions.
19019static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
19020 SDValue V1, SDValue V2, const APInt &Zeroable,
19021 const X86Subtarget &Subtarget,
19022 SelectionDAG &DAG) {
19023 // If we have a single input to the zero element, insert that into V1 if we
19024 // can do so cheaply.
19025 int NumElts = VT.getVectorNumElements();
19026 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
19027
19028 if (NumV2Elements == 1 && Mask[0] >= NumElts)
19029 if (SDValue Insertion = lowerShuffleAsElementInsertion(
19030 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
19031 return Insertion;
19032
19033 // Handle special cases where the lower or upper half is UNDEF.
19034 if (SDValue V =
19035 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
19036 return V;
19037
19038 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
19039 // can check for those subtargets here and avoid much of the subtarget
19040 // querying in the per-vector-type lowering routines. With AVX1 we have
19041 // essentially *zero* ability to manipulate a 256-bit vector with integer
19042 // types. Since we'll use floating point types there eventually, just
19043 // immediately cast everything to a float and operate entirely in that domain.
19044 if (VT.isInteger() && !Subtarget.hasAVX2()) {
19045 int ElementBits = VT.getScalarSizeInBits();
19046 if (ElementBits < 32) {
19047 // No floating point type available, if we can't use the bit operations
19048 // for masking/blending then decompose into 128-bit vectors.
19049 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
19050 Subtarget, DAG))
19051 return V;
19052 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
19053 return V;
19054 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
19055 }
19056
19057 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
19058 VT.getVectorNumElements());
19059 V1 = DAG.getBitcast(FpVT, V1);
19060 V2 = DAG.getBitcast(FpVT, V2);
19061 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
19062 }
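// Worked example (assumed types, for illustration): without AVX2, a v16i16 or
// v32i8 shuffle (ElementBits < 32) is handled as a bit mask/blend or split
// into two 128-bit shuffles, while v8i32 is bitcast to v8f32 and v4i64 to
// v4f64 so the whole shuffle stays in the AVX1 floating-point domain.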
19063
19064 if (VT == MVT::v16f16) {
19065 V1 = DAG.getBitcast(MVT::v16i16, V1);
19066 V2 = DAG.getBitcast(MVT::v16i16, V2);
19067 return DAG.getBitcast(MVT::v16f16,
19068 DAG.getVectorShuffle(MVT::v16i16, DL, V1, V2, Mask));
19069 }
19070
19071 switch (VT.SimpleTy) {
19072 case MVT::v4f64:
19073 return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19074 case MVT::v4i64:
19075 return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19076 case MVT::v8f32:
19077 return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19078 case MVT::v8i32:
19079 return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19080 case MVT::v16i16:
19081 return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19082 case MVT::v32i8:
19083 return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19084
19085 default:
19086 llvm_unreachable("Not a valid 256-bit x86 vector type!");
19087 }
19088}
19089
19090/// Try to lower a vector shuffle as a 128-bit shuffles.
19091static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
19092 const APInt &Zeroable, SDValue V1, SDValue V2,
19093 const X86Subtarget &Subtarget,
19094 SelectionDAG &DAG) {
19095 assert(VT.getScalarSizeInBits() == 64 &&
19096 "Unexpected element type size for 128bit shuffle.");
19097
19098 // Handling a 256-bit vector requires VLX, and lowerV2X128VectorShuffle()
19099 // is most probably the better solution for that case.
19100   assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
19101
19102 // TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
19103 SmallVector<int, 4> Widened128Mask;
19104 if (!canWidenShuffleElements(Mask, Widened128Mask))
19105 return SDValue();
19106   assert(Widened128Mask.size() == 4 && "Shuffle widening mismatch");
19107
19108 // Try to use an insert into a zero vector.
19109 if (Widened128Mask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
19110 (Widened128Mask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
19111 unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
19112 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
19113 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
19114 DAG.getIntPtrConstant(0, DL));
19115 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
19116 getZeroVector(VT, Subtarget, DAG, DL), LoV,
19117 DAG.getIntPtrConstant(0, DL));
19118 }
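The Zeroable tests above read as follows for a v8i64/v8f64 shuffle: bit i of Zeroable means result element i is allowed to be zero, so 0xf0 covers elements 4-7 (the upper 256 bits) and 0x0c covers elements 2-3. A standalone model of that decision (hypothetical helper name, a plain bitmask in place of APInt) that returns how many low elements are kept, or 0 when the pattern does not apply:

#include <cstdint>

int zeroInsertSubvectorElts(uint8_t Zeroable, int WidenedMask0, int WidenedMask1) {
  if (WidenedMask0 != 0 || (Zeroable & 0xf0) != 0xf0)
    return 0;                        // Low lane must be in place and the top half zeroable.
  if ((Zeroable & 0x0c) == 0x0c)
    return 2;                        // Keep elements 0-1; elements 2-7 become zero.
  return WidenedMask1 == 1 ? 4 : 0;  // Keep elements 0-3; elements 4-7 become zero.
}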
19119
19120 // Check for patterns which can be matched with a single insert of a 256-bit
19121 // subvector.
19122 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}, V1, V2);
19123 if (OnlyUsesV1 ||
19124 isShuffleEquivalent(Mask, {0, 1, 2, 3, 8, 9, 10, 11}, V1, V2)) {
19125 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
19126 SDValue SubVec =
19127 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
19128 DAG.getIntPtrConstant(0, DL));
19129 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
19130 DAG.getIntPtrConstant(4, DL));
19131 }
19132
19133 // See if this is an insertion of the lower 128-bits of V2 into V1.
19134 bool IsInsert = true;
19135 int V2Index = -1;
19136 for (int i = 0; i < 4; ++i) {
19137     assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
19138 if (Widened128Mask[i] < 0)
19139 continue;
19140
19141 // Make sure all V1 subvectors are in place.
19142 if (Widened128Mask[i] < 4) {
19143 if (Widened128Mask[i] != i) {
19144 IsInsert = false;
19145 break;
19146 }
19147 } else {
19148       // Make sure we only have a single V2 index and it's the lowest 128-bits.
19149 if (V2Index >= 0 || Widened128Mask[i] != 4) {
19150 IsInsert = false;
19151 break;
19152 }
19153 V2Index = i;
19154 }
19155 }
19156 if (IsInsert && V2Index >= 0) {
19157 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
19158 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
19159 DAG.getIntPtrConstant(0, DL));
19160 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
19161 }
19162
19163   // See if we can widen to a 256-bit lane shuffle; we're going to lose 128-lane
19164 // UNDEF info by lowering to X86ISD::SHUF128 anyway, so by widening where
19165 // possible we at least ensure the lanes stay sequential to help later
19166 // combines.
19167 SmallVector<int, 2> Widened256Mask;
19168 if (canWidenShuffleElements(Widened128Mask, Widened256Mask)) {
19169 Widened128Mask.clear();
19170 narrowShuffleMaskElts(2, Widened256Mask, Widened128Mask);
19171 }
19172
19173 // Try to lower to vshuf64x2/vshuf32x4.
19174 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
19175 unsigned PermMask = 0;
19176   // Ensure elements came from the same Op.
19177 for (int i = 0; i < 4; ++i) {
19178     assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
19179 if (Widened128Mask[i] < 0)
19180 continue;
19181
19182 SDValue Op = Widened128Mask[i] >= 4 ? V2 : V1;
19183 unsigned OpIndex = i / 2;
19184 if (Ops[OpIndex].isUndef())
19185 Ops[OpIndex] = Op;
19186 else if (Ops[OpIndex] != Op)
19187 return SDValue();
19188
19189 // Convert the 128-bit shuffle mask selection values into 128-bit selection
19190 // bits defined by a vshuf64x2 instruction's immediate control byte.
19191 PermMask |= (Widened128Mask[i] % 4) << (i * 2);
19192 }
19193
19194 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
19195 DAG.getTargetConstant(PermMask, DL, MVT::i8));
19196}
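As an illustration of how the final SHUF128 immediate is assembled, here is a standalone sketch (plain C++, hypothetical helper name, std::optional in place of an empty SDValue): each of the four 128-bit destination lanes contributes a 2-bit selector, and destination lanes 0-1 must all come from Ops[0] while lanes 2-3 must all come from Ops[1].

#include <array>
#include <cstdio>
#include <optional>

std::optional<unsigned> buildShuf128Imm(const std::array<int, 4> &Widened128Mask) {
  int Ops[2] = {-1, -1}; // -1 plays the role of an undef operand slot.
  unsigned PermMask = 0;
  for (int i = 0; i < 4; ++i) {
    int M = Widened128Mask[i];
    if (M < 0)
      continue;               // Undef lane: any selector works, leave it as 0.
    int Op = M >= 4 ? 1 : 0;  // Lane taken from V1 (0) or V2 (1).
    int OpIndex = i / 2;      // Each destination half is tied to one operand.
    if (Ops[OpIndex] < 0)
      Ops[OpIndex] = Op;
    else if (Ops[OpIndex] != Op)
      return std::nullopt;    // Mixed sources in one half: no single SHUF128.
    PermMask |= (M % 4) << (i * 2);
  }
  return PermMask;
}

int main() {
  // Mask {4, 5, 2, 3, 8, 9, 14, 15} widens to 128-bit lanes {2, 1, 4, 7}:
  // lanes 2 and 1 of V1, then lanes 0 and 3 of V2.
  if (auto Imm = buildShuf128Imm({2, 1, 4, 7}))
    std::printf("imm = 0x%x\n", *Imm); // 0xc6: pairs (3, 0, 1, 2) from high to low.
  return 0;
}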
19197
19198/// Handle lowering of 8-lane 64-bit floating point shuffles.
19199static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
19200 const APInt &Zeroable, SDValue V1, SDValue V2,
19201 const X86Subtarget &Subtarget,
19202 SelectionDAG &DAG) {
19203   assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
19204   assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
19205   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
19206
19207 if (V2.isUndef()) {
19208 // Use low duplicate instructions for masks that match their pattern.
19209 if (isShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1, V2))
19210 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
19211
19212 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
19213 // Non-half-crossing single input shuffles can be lowered with an
19214 // interleaved permutation.
19215 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
19216 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
19217 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
19218 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
19219 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
19220 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
19221 }
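The VPERMILPD immediate built above packs one bit per v8f64 element: bit i is set exactly when element i takes the odd double of its own 128-bit lane (mask value i|1) rather than the even one. A minimal standalone sketch of the same computation (hypothetical helper name, unsigned shifts to keep the packing well defined):

#include <array>

unsigned buildVPermilPDImm(const std::array<int, 8> &Mask) {
  unsigned Imm = 0;
  for (unsigned i = 0; i != 8; ++i)
    if (Mask[i] == int(i | 1)) // Odd element of the 128-bit lane slot i lives in.
      Imm |= 1u << i;
  return Imm;
}
// buildVPermilPDImm({1, 0, 2, 3, 5, 5, 6, 7}) == 0xb9 (bits 0, 3, 4, 5, 7 set).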
19222
19223 SmallVector<int, 4> RepeatedMask;
19224 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
19225 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
19226 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
19227 }
19228
19229 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1,
19230 V2, Subtarget, DAG))
19231 return Shuf128;
19232
19233 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
19234 return Unpck;
19235
19236 // Check if the blend happens to exactly fit that of SHUFPD.
19237 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask,
19238 Zeroable, Subtarget, DAG))
19239 return Op;
19240
19241 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1, V2,
19242 DAG, Subtarget))
19243 return V;
19244
19245 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
19246 Zeroable, Subtarget, DAG))
19247 return Blend;
19248
19249 return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, Subtarget, DAG);
19250}
19251
19252/// Handle lowering of 16-lane 32-bit floating point shuffles.
19253static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
19254 const APInt &Zeroable, SDValue V1, SDValue V2,
19255 const X86Subtarget &Subtarget,
19256 SelectionDAG &DAG) {
19257   assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
19258   assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
19259   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
19260
19261 // If the shuffle mask is repeated in each 128-bit lane, we have many more
19262 // options to efficiently lower the shuffle.
19263 SmallVector<int, 4> RepeatedMask;
19264 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
19265     assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
19266
19267 // Use even/odd duplicate instructions for masks that match their pattern.
19268 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
19269 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
19270 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
19271 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
19272
19273 if (V2.isUndef())
19274 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
19275 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
19276
19277 // Use dedicated unpack instructions for masks that match their pattern.
19278 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
19279 return V;
19280
19281 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
19282 Zeroable, Subtarget, DAG))
19283 return Blend;
19284
19285 // Otherwise, fall back to a SHUFPS sequence.
19286 return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
19287 }
19288
19289 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
19290 Zeroable, Subtarget, DAG))
19291 return Blend;
19292
19293 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
19294 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
19295 return DAG.getBitcast(MVT::v16f32, ZExt);
19296
19297 // Try to create an in-lane repeating shuffle mask and then shuffle the
19298 // results into the target lanes.
19299 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
19300 DL, MVT::v16f32, V1, V2, Mask, Subtarget, DAG))
19301 return V;
19302
19303 // If we have a single input shuffle with different shuffle patterns in the
19304 // 128-bit lanes and don't lane cross, use variable mask VPERMILPS.
19305 if (V2.isUndef() &&
19306 !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
19307 SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
19308 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
19309 }
19310
19311 // If we have AVX512F support, we can use VEXPAND.
19312 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
19313 V1, V2, DAG, Subtarget))
19314 return V;
19315
19316 return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, Subtarget, DAG);
19317}
19318
19319/// Handle lowering of 8-lane 64-bit integer shuffles.
19320static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
19321 const APInt &Zeroable, SDValue V1, SDValue V2,
19322 const X86Subtarget &Subtarget,
19323 SelectionDAG &DAG) {
19324   assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
19325   assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
19326   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
19327
19328 // Try to use shift instructions if fast.
19329 if (Subtarget.preferLowerShuffleAsShift())
19330 if (SDValue Shift =
19331 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable,
19332 Subtarget, DAG, /*BitwiseOnly*/ true))
19333 return Shift;
19334
19335 if (V2.isUndef()) {
19336 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
19337 // can use lower latency instructions that will operate on all four
19338 // 128-bit lanes.
19339 SmallVector<int, 2> Repeated128Mask;
19340 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
19341 SmallVector<int, 4> PSHUFDMask;
19342 narrowShuffleMaskElts(2, Repeated128Mask, PSHUFDMask);
19343 return DAG.getBitcast(
19344 MVT::v8i64,
19345 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
19346 DAG.getBitcast(MVT::v16i32, V1),
19347 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
19348 }
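The narrowShuffleMaskElts call above splits each 64-bit selection into two consecutive 32-bit selections, so a lane-repeated v2i64 mask such as {1, 0} becomes the v4i32 PSHUFD mask {2, 3, 0, 1}. A standalone sketch of that narrowing (hypothetical helper name, negative sentinels preserved):

#include <vector>

std::vector<int> narrowMaskElts(int Scale, const std::vector<int> &Mask) {
  std::vector<int> Narrowed;
  for (int M : Mask)
    for (int s = 0; s < Scale; ++s)
      Narrowed.push_back(M < 0 ? M : M * Scale + s); // Keep sentinels as-is.
  return Narrowed;
}
// narrowMaskElts(2, {1, 0}) == {2, 3, 0, 1}.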
19349
19350 SmallVector<int, 4> Repeated256Mask;
19351 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
19352 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
19353 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
19354 }
19355
19356 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1,
19357 V2, Subtarget, DAG))
19358 return Shuf128;
19359
19360 // Try to use shift instructions.
19361 if (SDValue Shift =
19362 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable, Subtarget,
19363 DAG, /*BitwiseOnly*/ false))
19364 return Shift;
19365
19366 // Try to use VALIGN.
19367 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i64, V1, V2, Mask,
19368 Subtarget, DAG))
19369 return Rotate;
19370
19371 // Try to use PALIGNR.
19372 if (Subtarget.hasBWI())
19373 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask,
19374 Subtarget, DAG))
19375 return Rotate;
19376
19377 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
19378 return Unpck;
19379
19380 // If we have AVX512F support, we can use VEXPAND.
19381 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1, V2,
19382 DAG, Subtarget))
19383 return V;
19384
19385 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
19386 Zeroable, Subtarget, DAG))
19387 return Blend;
19388
19389 return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, Subtarget, DAG);
19390}
19391
19392/// Handle lowering of 16-lane 32-bit integer shuffles.
19393static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
19394 const APInt &Zeroable, SDValue V1, SDValue V2,
19395 const X86Subtarget &Subtarget,
19396 SelectionDAG &DAG) {
19397   assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
19398   assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
19399   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
19400
19401 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
19402
19403 // Whenever we can lower this as a zext, that instruction is strictly faster
19404 // than any alternative. It also allows us to fold memory operands into the
19405 // shuffle in many cases.
19406 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
19407 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
19408 return ZExt;
19409
19410 // Try to use shift instructions if fast.
19411 if (Subtarget.preferLowerShuffleAsShift()) {
19412 if (SDValue Shift =
19413 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
19414 Subtarget, DAG, /*BitwiseOnly*/ true))
19415 return Shift;
19416 if (NumV2Elements == 0)
19417 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask,
19418 Subtarget, DAG))
19419 return Rotate;
19420 }
19421
19422   // If the shuffle mask is repeated in each 128-bit lane, we can use more
19423 // efficient instructions that mirror the shuffles across the four 128-bit
19424 // lanes.
19425 SmallVector<int, 4> RepeatedMask;
19426 bool Is128BitLaneRepeatedShuffle =
19427 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
19428 if (Is128BitLaneRepeatedShuffle) {
19429     assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
19430 if (V2.isUndef())
19431 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
19432 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
19433
19434 // Use dedicated unpack instructions for masks that match their pattern.
19435 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
19436 return V;
19437 }
19438
19439 // Try to use shift instructions.
19440 if (SDValue Shift =
19441 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
19442 Subtarget, DAG, /*BitwiseOnly*/ false))
19443 return Shift;
19444
19445 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements != 0)
19446 if (SDValue Rotate =
19447 lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask, Subtarget, DAG))
19448 return Rotate;
19449
19450 // Try to use VALIGN.
19451 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v16i32, V1, V2, Mask,
19452 Subtarget, DAG))
19453 return Rotate;
19454
19455 // Try to use byte rotation instructions.
19456 if (Subtarget.hasBWI())
19457 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask,
19458 Subtarget, DAG))
19459 return Rotate;
19460
19461 // Assume that a single SHUFPS is faster than using a permv shuffle.
19462 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
19463 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
19464 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
19465 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
19466 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
19467 CastV1, CastV2, DAG);
19468 return DAG.getBitcast(MVT::v16i32, ShufPS);
19469 }
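The isSingleSHUFPSMask test used above encodes the SHUFPS constraint that the low two result elements must all come from one source and the high two must all come from one source (undefs are free). A rough standalone sketch of that check under those assumed semantics:

#include <array>

bool isSingleShufpsMask(const std::array<int, 4> &RepeatedMask) {
  // Indices 0-3 name the first source's lane, 4-7 the second's, -1 is undef.
  auto MixesSources = [](int A, int B) {
    return A >= 0 && B >= 0 && (A < 4) != (B < 4);
  };
  return !MixesSources(RepeatedMask[0], RepeatedMask[1]) &&
         !MixesSources(RepeatedMask[2], RepeatedMask[3]);
}
// {0, 5, 2, 6} fits one SHUFPS only if neither half mixes sources; here the low
// half mixes (0 from V1, 5 from V2), so the routine falls back to other lowerings.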
19470
19471 // Try to create an in-lane repeating shuffle mask and then shuffle the
19472 // results into the target lanes.
19473 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
19474 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
19475 return V;
19476
19477 // If we have AVX512F support, we can use VEXPAND.
19478 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask, V1, V2,
19479 DAG, Subtarget))
19480 return V;
19481
19482 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
19483 Zeroable, Subtarget, DAG))
19484 return Blend;
19485
19486 return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, Subtarget, DAG);
19487}
19488
19489/// Handle lowering of 32-lane 16-bit integer shuffles.
19490static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
19491 const APInt &Zeroable, SDValue V1, SDValue V2,
19492 const X86Subtarget &Subtarget,
19493 SelectionDAG &DAG) {
19494   assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
19495   assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
19496   assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
19497   assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
19498
19499 // Whenever we can lower this as a zext, that instruction is strictly faster
19500 // than any alternative. It also allows us to fold memory operands into the
19501 // shuffle in many cases.
19502 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
19503 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
19504 return ZExt;
19505
19506 // Use dedicated unpack instructions for masks that match their pattern.
19507 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
19508 return V;
19509
19510 // Use dedicated pack instructions for masks that match their pattern.
19511 if (SDValue V =
19512 lowerShuffleWithPACK(DL, MVT::v32i16, Mask, V1, V2, DAG, Subtarget))
19513 return V;
19514
19515 // Try to use shift instructions.
19516 if (SDValue Shift =
19517 lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask, Zeroable,
19518 Subtarget, DAG, /*BitwiseOnly*/ false))
19519 return Shift;
19520
19521 // Try to use byte rotation instructions.
19522 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask,
19523 Subtarget, DAG))
19524 return Rotate;
19525
19526 if (V2.isUndef()) {
19527 // Try to use bit rotation instructions.
19528 if (SDValue Rotate =
19529 lowerShuffleAsBitRotate(DL, MVT::v32i16, V1, Mask, Subtarget, DAG))
19530 return Rotate;
19531
19532 SmallVector<int, 8> RepeatedMask;
19533 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
19534 // As this is a single-input shuffle, the repeated mask should be
19535 // a strictly valid v8i16 mask that we can pass through to the v8i16
19536 // lowering to handle even the v32 case.
19537 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v32i16, V1,
19538 RepeatedMask, Subtarget, DAG);
19539 }
19540 }
19541
19542 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
19543 Zeroable, Subtarget, DAG))
19544 return Blend;
19545
19546 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2,
19547 Zeroable, Subtarget, DAG))
19548 return PSHUFB;
19549
19550 return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, Subtarget, DAG);
19551}
19552
19553/// Handle lowering of 64-lane 8-bit integer shuffles.
19554static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
19555 const APInt &Zeroable, SDValue V1, SDValue V2,
19556 const X86Subtarget &Subtarget,
19557 SelectionDAG &DAG) {
19558   assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
19559   assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
19560   assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
19561   assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
19562
19563 // Whenever we can lower this as a zext, that instruction is strictly faster
19564 // than any alternative. It also allows us to fold memory operands into the
19565 // shuffle in many cases.
19566 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
19567 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
19568 return ZExt;
19569
19570 // Use dedicated unpack instructions for masks that match their pattern.
19571 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
19572 return V;
19573
19574 // Use dedicated pack instructions for masks that match their pattern.
19575 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v64i8, Mask, V1, V2, DAG,
19576 Subtarget))
19577 return V;
19578
19579 // Try to use shift instructions.
19580 if (SDValue Shift =
19581 lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget,
19582 DAG, /*BitwiseOnly*/ false))
19583 return Shift;
19584
19585 // Try to use byte rotation instructions.
19586 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask,
19587 Subtarget, DAG))
19588 return Rotate;
19589
19590 // Try to use bit rotation instructions.
19591 if (V2.isUndef())
19592 if (SDValue Rotate =
19593 lowerShuffleAsBitRotate(DL, MVT::v64i8, V1, Mask, Subtarget, DAG))
19594 return Rotate;
19595
19596 // Lower as AND if possible.
19597 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v64i8, V1, V2, Mask,
19598 Zeroable, Subtarget, DAG))
19599 return Masked;
19600
19601 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2,
19602 Zeroable, Subtarget, DAG))
19603 return PSHUFB;
19604
19605 // Try to create an in-lane repeating shuffle mask and then shuffle the
19606 // results into the target lanes.
19607 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
19608 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
19609 return V;
19610
19611 if (SDValue Result = lowerShuffleAsLanePermuteAndPermute(
19612 DL, MVT::v64i8, V1, V2, Mask, DAG, Subtarget))
19613 return Result;
19614
19615 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
19616 Zeroable, Subtarget, DAG))
19617 return Blend;
19618
19619 if (!is128BitLaneCrossingShuffleMask(MVT::v64i8, Mask)) {
19620 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
19621 // PALIGNR will be cheaper than the second PSHUFB+OR.
19622 if (SDValue V = lowerShuffleAsByteRotateAndPermute(DL, MVT::v64i8, V1, V2,
19623 Mask, Subtarget, DAG))
19624 return V;
19625
19626 // If we can't directly blend but can use PSHUFB, that will be better as it
19627 // can both shuffle and set up the inefficient blend.
19628 bool V1InUse, V2InUse;
19629 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v64i8, V1, V2, Mask, Zeroable,
19630 DAG, V1InUse, V2InUse);
19631 }
19632
19633 // Try to simplify this by merging 128-bit lanes to enable a lane-based
19634 // shuffle.
19635 if (!V2.isUndef())
19636 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
19637 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
19638 return Result;
19639
19640 // VBMI can use VPERMV/VPERMV3 byte shuffles.
19641 if (Subtarget.hasVBMI())
19642 return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);
19643
19644 return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
19645}
19646
19647/// High-level routine to lower various 512-bit x86 vector shuffles.
19648///
19649/// This routine either breaks down the specific type of a 512-bit x86 vector
19650/// shuffle or splits it into two 256-bit shuffles and fuses the results back
19651/// together based on the available instructions.
19652static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
19653 MVT VT, SDValue V1, SDValue V2,
19654 const APInt &Zeroable,
19655 const X86Subtarget &Subtarget,
19656 SelectionDAG &DAG) {
19657   assert(Subtarget.hasAVX512() &&
19658          "Cannot lower 512-bit vectors w/ basic ISA!");
19659
19660 // If we have a single input to the zero element, insert that into V1 if we
19661 // can do so cheaply.
19662 int NumElts = Mask.size();
19663 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
19664
19665 if (NumV2Elements == 1 && Mask[0] >= NumElts)
19666 if (SDValue Insertion = lowerShuffleAsElementInsertion(
19667 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
19668 return Insertion;
19669
19670 // Handle special cases where the lower or upper half is UNDEF.
19671 if (SDValue V =
19672 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
19673 return V;
19674
19675 // Check for being able to broadcast a single element.
19676 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
19677 Subtarget, DAG))
19678 return Broadcast;
19679
19680 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) {
19681 // Try using bit ops for masking and blending before falling back to
19682 // splitting.
19683 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
19684 Subtarget, DAG))
19685 return V;
19686 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
19687 return V;
19688
19689 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
19690 }
19691
19692 if (VT == MVT::v32f16) {
19693 V1 = DAG.getBitcast(MVT::v32i16, V1);
19694 V2 = DAG.getBitcast(MVT::v32i16, V2);
19695 return DAG.getBitcast(MVT::v32f16,
19696 DAG.getVectorShuffle(MVT::v32i16, DL, V1, V2, Mask));
19697 }
19698
19699 // Dispatch to each element type for lowering. If we don't have support for
19700 // specific element type shuffles at 512 bits, immediately split them and
19701 // lower them. Each lowering routine of a given type is allowed to assume that
19702 // the requisite ISA extensions for that element type are available.
19703 switch (VT.SimpleTy) {
19704 case MVT::v8f64:
19705 return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19706 case MVT::v16f32:
19707 return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19708 case MVT::v8i64:
19709 return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19710 case MVT::v16i32:
19711 return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19712 case MVT::v32i16:
19713 return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19714 case MVT::v64i8:
19715 return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19716
19717 default:
19718     llvm_unreachable("Not a valid 512-bit x86 vector type!");
19719 }
19720}
19721
19722static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef<int> Mask,
19723 MVT VT, SDValue V1, SDValue V2,
19724 const X86Subtarget &Subtarget,
19725 SelectionDAG &DAG) {
19726 // Shuffle should be unary.
19727 if (!V2.isUndef())
19728 return SDValue();
19729
19730 int ShiftAmt = -1;
19731 int NumElts = Mask.size();
19732 for (int i = 0; i != NumElts; ++i) {
19733 int M = Mask[i];
19734     assert((M == SM_SentinelUndef || (0 <= M && M < NumElts)) &&
19735            "Unexpected mask index.");
19736 if (M < 0)
19737 continue;
19738
19739 // The first non-undef element determines our shift amount.
19740 if (ShiftAmt < 0) {
19741 ShiftAmt = M - i;
19742 // Need to be shifting right.
19743 if (ShiftAmt <= 0)
19744 return SDValue();
19745 }
19746 // All non-undef elements must shift by the same amount.
19747 if (ShiftAmt != M - i)
19748 return SDValue();
19749 }
19750   assert(ShiftAmt >= 0 && "All undef?");
19751
19752   // Great, we found a shift right.
19753 MVT WideVT = VT;
19754 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
19755 WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
19756 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT,
19757 DAG.getUNDEF(WideVT), V1,
19758 DAG.getIntPtrConstant(0, DL));
19759 Res = DAG.getNode(X86ISD::KSHIFTR, DL, WideVT, Res,
19760 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
19761 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
19762 DAG.getIntPtrConstant(0, DL));
19763}
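The scan at the top of this routine can be modelled compactly: every defined mask element i must read element i + ShiftAmt for a single positive ShiftAmt, otherwise no lone KSHIFTR works. A standalone sketch (hypothetical helper name, plain std::vector mask):

#include <vector>

int matchUnaryShiftRightAmount(const std::vector<int> &Mask) {
  int ShiftAmt = -1;
  for (int i = 0, e = (int)Mask.size(); i != e; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;            // Undef elements match any shift.
    if (ShiftAmt < 0) {
      ShiftAmt = M - i;    // First defined element fixes the amount.
      if (ShiftAmt <= 0)
        return -1;         // Must be a right shift.
    } else if (ShiftAmt != M - i) {
      return -1;           // All defined elements must agree.
    }
  }
  return ShiftAmt;         // -1 if the mask was all-undef.
}
// matchUnaryShiftRightAmount({2, 3, -1, -1}) == 2 for a 4-element mask.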
19764
19765// Determine if this shuffle can be implemented with a KSHIFT instruction.
19766// Returns the shift amount if possible or -1 if not. This is a simplified
19767// version of matchShuffleAsShift.
19768static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
19769 int MaskOffset, const APInt &Zeroable) {
19770 int Size = Mask.size();
19771
19772 auto CheckZeros = [&](int Shift, bool Left) {
19773 for (int j = 0; j < Shift; ++j)
19774 if (!Zeroable[j + (Left ? 0 : (Size - Shift))])
19775 return false;
19776
19777 return true;
19778 };
19779
19780 auto MatchShift = [&](int Shift, bool Left) {
19781 unsigned Pos = Left ? Shift : 0;
19782 unsigned Low = Left ? 0 : Shift;
19783 unsigned Len = Size - Shift;
19784 return isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset);
19785 };
19786
19787 for (int Shift = 1; Shift != Size; ++Shift)
19788 for (bool Left : {true, false})
19789 if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) {
19790 Opcode = Left ? X86ISD::KSHIFTL : X86ISD::KSHIFTR;
19791 return Shift;
19792 }
19793
19794 return -1;
19795}
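For the two-sided matcher just above: a left shift by S moves element i to position i + S and zero-fills the bottom S lanes, while a right shift by S moves element i + S to position i and zero-fills the top S lanes; CheckZeros verifies the zero-filled region is Zeroable and MatchShift verifies the surviving region is sequential. A standalone sketch under those assumptions (hypothetical names, std::vector<bool> in place of APInt, unary mask with MaskOffset folded out):

#include <vector>

int matchMaskShift(const std::vector<int> &Mask, const std::vector<bool> &Zeroable,
                   bool &IsLeft) {
  int Size = (int)Mask.size(); // Zeroable is assumed to have the same size.
  auto Matches = [&](int Shift, bool Left) {
    for (int i = 0; i != Size; ++i) {
      bool InZeroFill = Left ? (i < Shift) : (i >= Size - Shift);
      if (InZeroFill) {
        if (!Zeroable[i])
          return false; // Shifted-in lanes must be allowed to become zero.
      } else {
        int Want = Left ? (i - Shift) : (i + Shift);
        if (Mask[i] >= 0 && Mask[i] != Want)
          return false; // Surviving lanes must stay sequential (undef is fine).
      }
    }
    return true;
  };
  for (int Shift = 1; Shift != Size; ++Shift)
    for (bool Left : {true, false})
      if (Matches(Shift, Left)) {
        IsLeft = Left;
        return Shift;
      }
  return -1;
}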
19796
19797
19798// Lower vXi1 vector shuffles.
19799 // There is no dedicated instruction on AVX-512 that shuffles the masks.
19800 // The only way to shuffle bits is to sign-extend the mask vector to a SIMD
19801 // vector, shuffle, and then truncate it back.
19802static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
19803 MVT VT, SDValue V1, SDValue V2,
19804 const APInt &Zeroable,
19805 const X86Subtarget &Subtarget,
19806 SelectionDAG &DAG) {
19807   assert(Subtarget.hasAVX512() &&
19808          "Cannot lower 512-bit vectors w/o basic ISA!");
19809
19810 int NumElts = Mask.size();
19811
19812 // Try to recognize shuffles that are just padding a subvector with zeros.
19813 int SubvecElts = 0;
19814 int Src = -1;
19815 for (int i = 0; i != NumElts; ++i) {
19816 if (Mask[i] >= 0) {
19817       // Grab the source from the first valid mask element. All subsequent
19818       // elements need to use this same source.
19819 if (Src < 0)
19820 Src = Mask[i] / NumElts;
19821 if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i)
19822 break;
19823 }
19824
19825 ++SubvecElts;
19826 }
19827   assert(SubvecElts != NumElts && "Identity shuffle?");
19828
19829   // Clip to a power of 2.
19830 SubvecElts = llvm::bit_floor<uint32_t>(SubvecElts);
19831
19832 // Make sure the number of zeroable bits in the top at least covers the bits
19833 // not covered by the subvector.
19834 if ((int)Zeroable.countl_one() >= (NumElts - SubvecElts)) {
19835     assert(Src >= 0 && "Expected a source!");
19836 MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
19837 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT,
19838 Src == 0 ? V1 : V2,
19839 DAG.getIntPtrConstant(0, DL));
19840 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
19841 DAG.getConstant(0, DL, VT),
19842 Extract, DAG.getIntPtrConstant(0, DL));
19843 }
19844
19845 // Try a simple shift right with undef elements. Later we'll try with zeros.
19846 if (SDValue Shift = lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget,
19847 DAG))
19848 return Shift;
19849
19850 // Try to match KSHIFTs.
19851 unsigned Offset = 0;
19852 for (SDValue V : { V1, V2 }) {
19853 unsigned Opcode;
19854 int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable);
19855 if (ShiftAmt >= 0) {
19856 MVT WideVT = VT;
19857 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
19858 WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
19859 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT,
19860 DAG.getUNDEF(WideVT), V,
19861 DAG.getIntPtrConstant(0, DL));
19862 // Widened right shifts need two shifts to ensure we shift in zeroes.
19863 if (Opcode == X86ISD::KSHIFTR && WideVT != VT) {
19864 int WideElts = WideVT.getVectorNumElements();
19865 // Shift left to put the original vector in the MSBs of the new size.
19866 Res = DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res,
19867 DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8));
19868 // Increase the shift amount to account for the left shift.
19869 ShiftAmt += WideElts - NumElts;
19870 }
19871
19872 Res = DAG.getNode(Opcode, DL, WideVT, Res,
19873 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
19874 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
19875 DAG.getIntPtrConstant(0, DL));
19876 }
19877 Offset += NumElts; // Increment for next iteration.
19878 }
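The widening dance in the loop above can be checked with plain integers standing in for mask registers: to right-shift an 8-element v8i1 payload held in a 16-bit register by ShiftAmt while shifting in zeroes, first KSHIFTL by 16 - 8 so the live bits occupy the MSBs, then KSHIFTR by 8 + ShiftAmt. A small worked example (standalone, bit i models mask element i):

#include <cassert>
#include <cstdint>

uint16_t kshiftrWidened(uint8_t Bits, int ShiftAmt) {
  uint16_t Wide = Bits;                       // INSERT_SUBVECTOR at index 0 (low bits).
  Wide = (uint16_t)(Wide << 8);               // KSHIFTL by WideElts - NumElts.
  Wide = (uint16_t)(Wide >> (8 + ShiftAmt));  // KSHIFTR by the adjusted amount.
  return Wide;
}

int main() {
  // A v8i1 right shift by 3 with zero fill: 10110001 -> 00010110.
  assert(kshiftrWidened(0b10110001, 3) == 0b00010110);
  return 0;
}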
19879
19880 // If we're broadcasting a SETCC result, try to broadcast the ops instead.
19881 // TODO: What other unary shuffles would benefit from this?
19882 if (isBroadcastShuffleMask(Mask) && V1.getOpcode() == ISD::SETCC &&
19883 V1->hasOneUse()) {
19884 SDValue Op0 = V1.getOperand(0);
19885 SDValue Op1 = V1.getOperand(1);
19886 ISD::CondCode CC = cast<CondCodeSDNode>(V1.getOperand(2))->get();
19887 EVT OpVT = Op0.getValueType();
19888 return DAG.getSetCC(
19889 DL, VT, DAG.getVectorShuffle(OpVT, DL, Op0, DAG.getUNDEF(OpVT), Mask),
19890 DAG.getVectorShuffle(OpVT, DL, Op1, DAG.getUNDEF(OpVT), Mask), CC);
19891 }
19892
19893 MVT ExtVT;
19894 switch (VT.SimpleTy) {
19895 default:
19896     llvm_unreachable("Expected a vector of i1 elements");
19897 case MVT::v2i1:
19898 ExtVT = MVT::v2i64;
19899 break;
19900 case MVT::v4i1:
19901 ExtVT = MVT::v4i32;
19902 break;
19903 case MVT::v8i1:
19904     // Take a 512-bit type; more shuffles are available on KNL. If we have VLX,
19905     // use a 256-bit shuffle.
19906 ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
19907 break;
19908 case MVT::v16i1:
19909 // Take 512-bit type, unless we are avoiding 512-bit types and have the
19910 // 256-bit operation available.
19911 ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
19912 break;
19913 case MVT::v32i1:
19914 // Take 512-bit type, unless we are avoiding 512-bit types and have the
19915 // 256-bit operation available.
19916     assert(Subtarget.hasBWI() && "Expected AVX512BW support");
19917 ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
19918 break;
19919 case MVT::v64i1:
19920 // Fall back to scalarization. FIXME: We can do better if the shuffle
19921 // can be partitioned cleanly.
19922 if (!Subtarget.useBWIRegs())
19923 return SDValue();
19924 ExtVT = MVT::v64i8;
19925 break;
19926 }
19927
19928 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
19929 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
19930
19931 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
19932   // i1 was sign-extended, so we can use X86ISD::CVT2MASK.
19933 int NumElems = VT.getVectorNumElements();
19934 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
19935 (Subtarget.hasDQI() && (NumElems < 32)))
19936 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
19937 Shuffle, ISD::SETGT);
19938
19939 return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
19940}
19941
19942/// Helper function that returns true if the shuffle mask should be
19943/// commuted to improve canonicalization.
19944static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
19945 int NumElements = Mask.size();
19946
19947 int NumV1Elements = 0, NumV2Elements = 0;
19948 for (int M : Mask)
19949 if (M < 0)
19950 continue;
19951 else if (M < NumElements)
19952 ++NumV1Elements;
19953 else
19954 ++NumV2Elements;
19955
19956 // Commute the shuffle as needed such that more elements come from V1 than
19957 // V2. This allows us to match the shuffle pattern strictly on how many
19958 // elements come from V1 without handling the symmetric cases.
19959 if (NumV2Elements > NumV1Elements)
19960 return true;
19961
19962   assert(NumV1Elements > 0 && "No V1 indices");
19963
19964 if (NumV2Elements == 0)
19965 return false;
19966
19967 // When the number of V1 and V2 elements are the same, try to minimize the
19968 // number of uses of V2 in the low half of the vector. When that is tied,
19969   // ensure that the sum of indices for V1 is equal to or lower than the sum of
19970 // indices for V2. When those are equal, try to ensure that the number of odd
19971 // indices for V1 is lower than the number of odd indices for V2.
19972 if (NumV1Elements == NumV2Elements) {
19973 int LowV1Elements = 0, LowV2Elements = 0;
19974 for (int M : Mask.slice(0, NumElements / 2))
19975 if (M >= NumElements)
19976 ++LowV2Elements;
19977 else if (M >= 0)
19978 ++LowV1Elements;
19979 if (LowV2Elements > LowV1Elements)
19980 return true;
19981 if (LowV2Elements == LowV1Elements) {
19982 int SumV1Indices = 0, SumV2Indices = 0;
19983 for (int i = 0, Size = Mask.size(); i < Size; ++i)
19984 if (Mask[i] >= NumElements)
19985 SumV2Indices += i;
19986 else if (Mask[i] >= 0)
19987 SumV1Indices += i;
19988 if (SumV2Indices < SumV1Indices)
19989 return true;
19990 if (SumV2Indices == SumV1Indices) {
19991 int NumV1OddIndices = 0, NumV2OddIndices = 0;
19992 for (int i = 0, Size = Mask.size(); i < Size; ++i)
19993 if (Mask[i] >= NumElements)
19994 NumV2OddIndices += i % 2;
19995 else if (Mask[i] >= 0)
19996 NumV1OddIndices += i % 2;
19997 if (NumV2OddIndices < NumV1OddIndices)
19998 return true;
19999 }
20000 }
20001 }
20002
20003 return false;
20004}
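A worked example of the heuristic above: for a v4 mask {4, 5, 0, 6}, three elements come from V2 and one from V1, so the routine requests a commute; the caller then swaps the operands and renumbers the mask to {0, 1, 4, 2}. A plain C++ stand-in for that renumbering step (the same effect ShuffleVectorSDNode::commuteMask has):

#include <vector>

std::vector<int> commuteMask(std::vector<int> Mask) {
  int NumElts = (int)Mask.size();
  for (int &M : Mask)
    if (M >= 0)
      M = M < NumElts ? M + NumElts : M - NumElts; // Swap which source each index names.
  return Mask;
}
// commuteMask({4, 5, 0, 6}) == {0, 1, 4, 2}.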
20005
20006static bool canCombineAsMaskOperation(SDValue V1, SDValue V2,
20007 const X86Subtarget &Subtarget) {
20008 if (!Subtarget.hasAVX512())
20009 return false;
20010
20011 MVT VT = V1.getSimpleValueType().getScalarType();
20012 if ((VT == MVT::i16 || VT == MVT::i8) && !Subtarget.hasBWI())
20013 return false;
20014
20015 // If vec width < 512, widen i8/i16 even with BWI as blendd/blendps/blendpd
20016 // are preferable to blendw/blendvb/masked-mov.
20017 if ((VT == MVT::i16 || VT == MVT::i8) &&
20018 V1.getSimpleValueType().getSizeInBits() < 512)
20019 return false;
20020
20021 auto HasMaskOperation = [&](SDValue V) {
20022     // TODO: Currently we only check a limited set of opcodes. We could probably
20023     // extend this to all binary operations by checking TLI.isBinOp().
20024 switch (V->getOpcode()) {
20025 default:
20026 return false;
20027 case ISD::ADD:
20028 case ISD::SUB:
20029 case ISD::AND:
20030 case ISD::XOR:
20031 case ISD::OR:
20032 case ISD::SMAX:
20033 case ISD::SMIN:
20034 case ISD::UMAX:
20035 case ISD::UMIN:
20036 case ISD::ABS:
20037 case ISD::SHL:
20038 case ISD::SRL:
20039 case ISD::SRA:
20040 case ISD::MUL:
20041 break;
20042 }
20043 if (!V->hasOneUse())
20044 return false;
20045
20046 return true;
20047 };
20048
20049 if (HasMaskOperation(V1) || HasMaskOperation(V2))
20050 return true;
20051
20052 return false;
20053}
20054
20055// Forward declaration.
20056static SDValue canonicalizeShuffleMaskWithHorizOp(
20057 MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
20058 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
20059 const X86Subtarget &Subtarget);
20060
20061 /// Top-level lowering for x86 vector shuffles.
20062///
20063/// This handles decomposition, canonicalization, and lowering of all x86
20064/// vector shuffles. Most of the specific lowering strategies are encapsulated
20065/// above in helper routines. The canonicalization attempts to widen shuffles
20066/// to involve fewer lanes of wider elements, consolidate symmetric patterns
20067/// s.t. only one of the two inputs needs to be tested, etc.
20068static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget,
20069 SelectionDAG &DAG) {
20070 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
20071 ArrayRef<int> OrigMask = SVOp->getMask();
20072 SDValue V1 = Op.getOperand(0);
20073 SDValue V2 = Op.getOperand(1);
20074 MVT VT = Op.getSimpleValueType();
20075 int NumElements = VT.getVectorNumElements();
20076 SDLoc DL(Op);
20077 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
20078
20079   assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
20080          "Can't lower MMX shuffles");
20081
20082 bool V1IsUndef = V1.isUndef();
20083 bool V2IsUndef = V2.isUndef();
20084 if (V1IsUndef && V2IsUndef)
20085 return DAG.getUNDEF(VT);
20086
20087   // When we create a shuffle node we put the UNDEF node in the second operand,
20088 // but in some cases the first operand may be transformed to UNDEF.
20089 // In this case we should just commute the node.
20090 if (V1IsUndef)
20091 return DAG.getCommutedVectorShuffle(*SVOp);
20092
20093 // Check for non-undef masks pointing at an undef vector and make the masks
20094 // undef as well. This makes it easier to match the shuffle based solely on
20095 // the mask.
20096 if (V2IsUndef &&
20097 any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
20098 SmallVector<int, 8> NewMask(OrigMask);
20099 for (int &M : NewMask)
20100 if (M >= NumElements)
20101 M = -1;
20102 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
20103 }
20104
20105 // Check for illegal shuffle mask element index values.
20106 int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
20107 (void)MaskUpperLimit;
20108   assert(llvm::all_of(OrigMask,
20109                       [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
20110          "Out of bounds shuffle index");
20111
20112 // We actually see shuffles that are entirely re-arrangements of a set of
20113 // zero inputs. This mostly happens while decomposing complex shuffles into
20114 // simple ones. Directly lower these as a buildvector of zeros.
20115 APInt KnownUndef, KnownZero;
20116 computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero);
20117
20118 APInt Zeroable = KnownUndef | KnownZero;
20119 if (Zeroable.isAllOnes())
20120 return getZeroVector(VT, Subtarget, DAG, DL);
20121
20122 bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
20123
20124 // Try to collapse shuffles into using a vector type with fewer elements but
20125 // wider element types. We cap this to not form integers or floating point
20126 // elements wider than 64 bits. It does not seem beneficial to form i128
20127 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
20128 SmallVector<int, 16> WidenedMask;
20129 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
20130 !canCombineAsMaskOperation(V1, V2, Subtarget) &&
20131 canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
20132 // Shuffle mask widening should not interfere with a broadcast opportunity
20133 // by obfuscating the operands with bitcasts.
20134 // TODO: Avoid lowering directly from this top-level function: make this
20135 // a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
20136 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask,
20137 Subtarget, DAG))
20138 return Broadcast;
20139
20140 MVT NewEltVT = VT.isFloatingPoint()
20141 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
20142 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
20143 int NewNumElts = NumElements / 2;
20144 MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
20145 // Make sure that the new vector type is legal. For example, v2f64 isn't
20146 // legal on SSE1.
20147 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
20148 if (V2IsZero) {
20149 // Modify the new Mask to take all zeros from the all-zero vector.
20150 // Choose indices that are blend-friendly.
20151 bool UsedZeroVector = false;
20152         assert(is_contained(WidenedMask, SM_SentinelZero) &&
20153                "V2's non-undef elements are used?!");
20154 for (int i = 0; i != NewNumElts; ++i)
20155 if (WidenedMask[i] == SM_SentinelZero) {
20156 WidenedMask[i] = i + NewNumElts;
20157 UsedZeroVector = true;
20158 }
20159 // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
20160 // some elements to be undef.
20161 if (UsedZeroVector)
20162 V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
20163 }
20164 V1 = DAG.getBitcast(NewVT, V1);
20165 V2 = DAG.getBitcast(NewVT, V2);
20166 return DAG.getBitcast(
20167 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
20168 }
20169 }
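canWidenShuffleElements, used above, succeeds when every pair of narrow mask elements either selects a naturally aligned even/odd pair from the same wide element or is undef; the Zeroable/zero-vector handling is layered on top of that. A simplified standalone sketch that ignores the zeroable cases (hypothetical helper name, even-sized mask assumed):

#include <optional>
#include <vector>

std::optional<std::vector<int>> widenMask(const std::vector<int> &Mask) {
  std::vector<int> Wide;
  for (size_t i = 0; i + 1 < Mask.size(); i += 2) {
    int Lo = Mask[i], Hi = Mask[i + 1];
    if (Lo < 0 && Hi < 0) {
      Wide.push_back(-1);                                   // Both undef: wide element is undef.
    } else if (Lo >= 0 && Lo % 2 == 0 && (Hi < 0 || Hi == Lo + 1)) {
      Wide.push_back(Lo / 2);                               // Aligned pair (or odd half undef).
    } else if (Hi >= 0 && Hi % 2 == 1 && Lo < 0) {
      Wide.push_back(Hi / 2);                               // Even half undef, odd half aligned.
    } else {
      return std::nullopt;                                  // Pair straddles wide elements.
    }
  }
  return Wide;
}
// *widenMask({0, 1, 6, 7, -1, -1, 4, 5}) == std::vector<int>({0, 3, -1, 2}).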
20170
20171 SmallVector<SDValue> Ops = {V1, V2};
20172 SmallVector<int> Mask(OrigMask);
20173
20174 // Canonicalize the shuffle with any horizontal ops inputs.
20175 // NOTE: This may update Ops and Mask.
20176 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
20177 Ops, Mask, VT.getSizeInBits(), DL, DAG, Subtarget))
20178 return DAG.getBitcast(VT, HOp);
20179
20180 V1 = DAG.getBitcast(VT, Ops[0]);
20181 V2 = DAG.getBitcast(VT, Ops[1]);
20182   assert(NumElements == (int)Mask.size() &&
20183          "canonicalizeShuffleMaskWithHorizOp "
20184          "shouldn't alter the shuffle mask size");
20185
20186 // Commute the shuffle if it will improve canonicalization.
20187 if (canonicalizeShuffleMaskWithCommute(Mask)) {
20188 ShuffleVectorSDNode::commuteMask(Mask);
20189 std::swap(V1, V2);
20190 }
20191
20192 // For each vector width, delegate to a specialized lowering routine.
20193 if (VT.is128BitVector())
20194 return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
20195
20196 if (VT.is256BitVector())
20197 return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
20198
20199 if (VT.is512BitVector())
20200 return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
20201
20202 if (Is1BitVector)
20203 return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
20204
20205 llvm_unreachable("Unimplemented!");
20206}
20207
20208/// Try to lower a VSELECT instruction to a vector shuffle.
20209static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
20210 const X86Subtarget &Subtarget,
20211 SelectionDAG &DAG) {
20212 SDValue Cond = Op.getOperand(0);
20213 SDValue LHS = Op.getOperand(1);
20214 SDValue RHS = Op.getOperand(2);
20215 MVT VT = Op.getSimpleValueType();
20216
20217 // Only non-legal VSELECTs reach this lowering; convert those into generic
20218 // shuffles and re-use the shuffle lowering path for blends.
20219 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
20220 SmallVector<int, 32> Mask;
20221 if (createShuffleMaskFromVSELECT(Mask, Cond))
20222 return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask);
20223 }
20224
20225 return SDValue();
20226}
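// Illustrative sketch (not the actual createShuffleMaskFromVSELECT helper):
// with a constant condition, a true lane keeps the LHS element (index i) and
// a false lane takes the RHS element (index i + NumElts). For a hypothetical
// 4-lane condition <true,false,false,true> the resulting mask is
//   Mask = {0, 5, 6, 3}   // lanes 1 and 2 come from RHS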
20227
20228SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
20229 SDValue Cond = Op.getOperand(0);
20230 SDValue LHS = Op.getOperand(1);
20231 SDValue RHS = Op.getOperand(2);
20232
20233 SDLoc dl(Op);
20234 MVT VT = Op.getSimpleValueType();
20235 if (isSoftFP16(VT)) {
20236 MVT NVT = VT.changeVectorElementTypeToInteger();
20237 return DAG.getBitcast(VT, DAG.getNode(ISD::VSELECT, dl, NVT, Cond,
20238 DAG.getBitcast(NVT, LHS),
20239 DAG.getBitcast(NVT, RHS)));
20240 }
20241
20242 // A vselect where all conditions and data are constants can be optimized into
20243 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
20244 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
20245 ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
20246 ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
20247 return SDValue();
20248
20249 // Try to lower this to a blend-style vector shuffle. This can handle all
20250 // constant condition cases.
20251 if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
20252 return BlendOp;
20253
20254 // If this VSELECT has a vector of i1 as a mask, it will be directly matched
20255 // with patterns on the mask registers on AVX-512.
20256 MVT CondVT = Cond.getSimpleValueType();
20257 unsigned CondEltSize = Cond.getScalarValueSizeInBits();
20258 if (CondEltSize == 1)
20259 return Op;
20260
20261 // Variable blends are only legal from SSE4.1 onward.
20262 if (!Subtarget.hasSSE41())
20263 return SDValue();
20264
20265 unsigned EltSize = VT.getScalarSizeInBits();
20266 unsigned NumElts = VT.getVectorNumElements();
20267
20268 // Expand v32i16/v64i8 without BWI.
20269 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
20270 return SDValue();
20271
20272 // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
20273 // into an i1 condition so that we can use the mask-based 512-bit blend
20274 // instructions.
20275 if (VT.getSizeInBits() == 512) {
20276 // Build a mask by testing the condition against zero.
20277 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
20278 SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
20279 DAG.getConstant(0, dl, CondVT),
20280 ISD::SETNE);
20281 // Now return a new VSELECT using the mask.
20282 return DAG.getSelect(dl, VT, Mask, LHS, RHS);
20283 }
20284
20285 // SEXT/TRUNC cases where the mask doesn't match the destination size.
20286 if (CondEltSize != EltSize) {
20287 // If we don't have a sign splat, rely on the expansion.
20288 if (CondEltSize != DAG.ComputeNumSignBits(Cond))
20289 return SDValue();
20290
20291 MVT NewCondSVT = MVT::getIntegerVT(EltSize);
20292 MVT NewCondVT = MVT::getVectorVT(NewCondSVT, NumElts);
20293 Cond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT);
20294 return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS);
20295 }
20296
20297 // Only some types will be legal on some subtargets. If we can emit a legal
20298 // VSELECT-matching blend, return Op, but if we need to expand, return
20299 // a null value.
20300 switch (VT.SimpleTy) {
20301 default:
20302 // Most of the vector types have blends past SSE4.1.
20303 return Op;
20304
20305 case MVT::v32i8:
20306 // The byte blends for AVX vectors were introduced only in AVX2.
20307 if (Subtarget.hasAVX2())
20308 return Op;
20309
20310 return SDValue();
20311
20312 case MVT::v8i16:
20313 case MVT::v16i16: {
20314 // Bitcast everything to the vXi8 type and use a vXi8 vselect.
20315 MVT CastVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
20316 Cond = DAG.getBitcast(CastVT, Cond);
20317 LHS = DAG.getBitcast(CastVT, LHS);
20318 RHS = DAG.getBitcast(CastVT, RHS);
20319 SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
20320 return DAG.getBitcast(VT, Select);
20321 }
20322 }
20323}
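// Note on the v8i16/v16i16 case above (illustrative, assuming the usual X86
// setting of ZeroOrNegativeOneBooleanContent for vector booleans): each i16
// condition lane is 0x0000 or 0xFFFF, so after the bitcast both of its bytes
// carry the same truth value and a byte-wise blend is equivalent, e.g.
//   cond = <0xFFFF, 0x0000>  --bitcast-->  <0xFF, 0xFF, 0x00, 0x00>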
20324
20325static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
20326 MVT VT = Op.getSimpleValueType();
20327 SDValue Vec = Op.getOperand(0);
20328 SDValue Idx = Op.getOperand(1);
20329 assert(isa<ConstantSDNode>(Idx) && "Constant index expected");
20330 SDLoc dl(Op);
20331
20332 if (!Vec.getSimpleValueType().is128BitVector())
20333 return SDValue();
20334
20335 if (VT.getSizeInBits() == 8) {
20336 // If IdxVal is 0, it's cheaper to do a move instead of a pextrb, unless
20337 // we're going to zero extend the register or fold the store.
20338 if (llvm::isNullConstant(Idx) && !X86::mayFoldIntoZeroExtend(Op) &&
20339 !X86::mayFoldIntoStore(Op))
20340 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
20341 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
20342 DAG.getBitcast(MVT::v4i32, Vec), Idx));
20343
20344 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
20345 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec,
20346 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
20347 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
20348 }
20349
20350 if (VT == MVT::f32) {
20351 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
20352 // the result back to FR32 register. It's only worth matching if the
20353 // result has a single use which is a store or a bitcast to i32. And in
20354 // the case of a store, it's not worth it if the index is a constant 0,
20355 // because a MOVSSmr can be used instead, which is smaller and faster.
20356 if (!Op.hasOneUse())
20357 return SDValue();
20358 SDNode *User = *Op.getNode()->use_begin();
20359 if ((User->getOpcode() != ISD::STORE || isNullConstant(Idx)) &&
20360 (User->getOpcode() != ISD::BITCAST ||
20361 User->getValueType(0) != MVT::i32))
20362 return SDValue();
20363 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
20364 DAG.getBitcast(MVT::v4i32, Vec), Idx);
20365 return DAG.getBitcast(MVT::f32, Extract);
20366 }
20367
20368 if (VT == MVT::i32 || VT == MVT::i64)
20369 return Op;
20370
20371 return SDValue();
20372}
20373
20374/// Extract one bit from mask vector, like v16i1 or v8i1.
20375/// AVX-512 feature.
20376static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
20377 const X86Subtarget &Subtarget) {
20378 SDValue Vec = Op.getOperand(0);
20379 SDLoc dl(Vec);
20380 MVT VecVT = Vec.getSimpleValueType();
20381 SDValue Idx = Op.getOperand(1);
20382 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
20383 MVT EltVT = Op.getSimpleValueType();
20384
20385 assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
20386 "Unexpected vector type in ExtractBitFromMaskVector");
20387
20388 // A variable index can't be handled in mask registers,
20389 // so extend the vector to VR512/VR128.
20390 if (!IdxC) {
20391 unsigned NumElts = VecVT.getVectorNumElements();
20392 // Extending v8i1/v16i1 to 512-bit gets better performance on KNL
20393 // than extending to 128/256-bit.
20394 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
20395 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
20396 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
20397 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
20398 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
20399 }
20400
20401 unsigned IdxVal = IdxC->getZExtValue();
20402 if (IdxVal == 0) // the operation is legal
20403 return Op;
20404
20405 // Extend to natively supported kshift.
20406 unsigned NumElems = VecVT.getVectorNumElements();
20407 MVT WideVecVT = VecVT;
20408 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
20409 WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
20410 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
20411 DAG.getUNDEF(WideVecVT), Vec,
20412 DAG.getIntPtrConstant(0, dl));
20413 }
20414
20415 // Use kshiftr instruction to move to the lower element.
20416 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
20417 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
20418
20419 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
20420 DAG.getIntPtrConstant(0, dl));
20421}
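// Scalar analogy of the KSHIFTR lowering above (illustrative only): viewing
// the k-register as an integer bit-mask, extracting element IdxVal is simply
//   bool Bit = (MaskBits >> IdxVal) & 1;
// i.e. shift the wanted bit down to position 0 and read the low element.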
20422
20423SDValue
20424X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
20425 SelectionDAG &DAG) const {
20426 SDLoc dl(Op);
20427 SDValue Vec = Op.getOperand(0);
20428 MVT VecVT = Vec.getSimpleValueType();
20429 SDValue Idx = Op.getOperand(1);
20430 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
20431
20432 if (VecVT.getVectorElementType() == MVT::i1)
20433 return ExtractBitFromMaskVector(Op, DAG, Subtarget);
20434
20435 if (!IdxC) {
20436 // It's more profitable to go through memory (1 cycle throughput)
20437 // than to use a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
20438 // The IACA tool was used to get the performance estimates
20439 // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
20440 //
20441 // example : extractelement <16 x i8> %a, i32 %i
20442 //
20443 // Block Throughput: 3.00 Cycles
20444 // Throughput Bottleneck: Port5
20445 //
20446 // | Num Of | Ports pressure in cycles | |
20447 // | Uops | 0 - DV | 5 | 6 | 7 | |
20448 // ---------------------------------------------
20449 // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
20450 // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
20451 // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
20452 // Total Num Of Uops: 4
20453 //
20454 //
20455 // Block Throughput: 1.00 Cycles
20456 // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
20457 //
20458 // | | Ports pressure in cycles | |
20459 // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
20460 // ---------------------------------------------------------
20461 // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
20462 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
20463 // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
20464 // Total Num Of Uops: 4
20465
20466 return SDValue();
20467 }
20468
20469 unsigned IdxVal = IdxC->getZExtValue();
20470
20471 // If this is a 256-bit or 512-bit vector result, first extract a 128-bit
20472 // subvector and then extract the element from that 128-bit vector.
20473 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
20474 // Get the 128-bit vector.
20475 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
20476 MVT EltVT = VecVT.getVectorElementType();
20477
20478 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
20479 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
20480
20481 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
20482 // this can be done with a mask.
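// For example (illustrative): extracting element 6 of a v8i32 has
// ElemsPerChunk = 4, so 6 & (4 - 1) = 2, the element's position within the
// 128-bit chunk extracted above.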
20483 IdxVal &= ElemsPerChunk - 1;
20484 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
20485 DAG.getIntPtrConstant(IdxVal, dl));
20486 }
20487
20488 assert(VecVT.is128BitVector() && "Unexpected vector length");
20489
20490 MVT VT = Op.getSimpleValueType();
20491
20492 if (VT == MVT::i16) {
20493 // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
20494 // we're going to zero extend the register or fold the store (SSE41 only).
20495 if (IdxVal == 0 && !X86::mayFoldIntoZeroExtend(Op) &&
20496 !(Subtarget.hasSSE41() && X86::mayFoldIntoStore(Op))) {
20497 if (Subtarget.hasFP16())
20498 return Op;
20499
20500 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
20501 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
20502 DAG.getBitcast(MVT::v4i32, Vec), Idx));
20503 }
20504
20505 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec,
20506 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
20507 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
20508 }
20509
20510 if (Subtarget.hasSSE41())
20511 if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
20512 return Res;
20513
20514 // TODO: We only extract a single element from v16i8; we can probably afford
20515 // to be more aggressive here before falling back on the default approach of
20516 // spilling to the stack.
20517 if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) {
20518 // Extract either the lowest i32 or any i16, and extract the sub-byte.
20519 int DWordIdx = IdxVal / 4;
20520 if (DWordIdx == 0) {
20521 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
20522 DAG.getBitcast(MVT::v4i32, Vec),
20523 DAG.getIntPtrConstant(DWordIdx, dl));
20524 int ShiftVal = (IdxVal % 4) * 8;
20525 if (ShiftVal != 0)
20526 Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
20527 DAG.getConstant(ShiftVal, dl, MVT::i8));
20528 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
20529 }
20530
20531 int WordIdx = IdxVal / 2;
20532 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
20533 DAG.getBitcast(MVT::v8i16, Vec),
20534 DAG.getIntPtrConstant(WordIdx, dl));
20535 int ShiftVal = (IdxVal % 2) * 8;
20536 if (ShiftVal != 0)
20537 Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
20538 DAG.getConstant(ShiftVal, dl, MVT::i8));
20539 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
20540 }
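// Worked example (illustrative): extracting byte 5 of a v16i8 takes the
// word path above with WordIdx = 5 / 2 = 2, i.e. extract i16 element 2 of
// the v8i16 bitcast, shift right by (5 % 2) * 8 = 8 bits, then truncate.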
20541
20542 if (VT == MVT::f16 || VT.getSizeInBits() == 32) {
20543 if (IdxVal == 0)
20544 return Op;
20545
20546 // Shuffle the element to the lowest element, then movss or movsh.
20547 SmallVector<int, 8> Mask(VecVT.getVectorNumElements(), -1);
20548 Mask[0] = static_cast<int>(IdxVal);
20549 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
20550 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
20551 DAG.getIntPtrConstant(0, dl));
20552 }
20553
20554 if (VT.getSizeInBits() == 64) {
20555 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
20556 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
20557 // to match extract_elt for f64.
20558 if (IdxVal == 0)
20559 return Op;
20560
20561 // UNPCKHPD the element to the lowest double word, then movsd.
20562 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
20563 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
20564 int Mask[2] = { 1, -1 };
20565 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
20566 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
20567 DAG.getIntPtrConstant(0, dl));
20568 }
20569
20570 return SDValue();
20571}
20572
20573/// Insert one bit to mask vector, like v16i1 or v8i1.
20574/// AVX-512 feature.
20575static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
20576 const X86Subtarget &Subtarget) {
20577 SDLoc dl(Op);
20578 SDValue Vec = Op.getOperand(0);
20579 SDValue Elt = Op.getOperand(1);
20580 SDValue Idx = Op.getOperand(2);
20581 MVT VecVT = Vec.getSimpleValueType();
20582
20583 if (!isa<ConstantSDNode>(Idx)) {
20584 // Non-constant index. Extend the source and destination,
20585 // insert the element, and then truncate the result.
20586 unsigned NumElts = VecVT.getVectorNumElements();
20587 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
20588 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
20589 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
20590 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
20591 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
20592 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
20593 }
20594
20595 // Copy into a k-register, extract to v1i1 and insert_subvector.
20596 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
20597 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec, Idx);
20598}
20599
20600SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
20601 SelectionDAG &DAG) const {
20602 MVT VT = Op.getSimpleValueType();
20603 MVT EltVT = VT.getVectorElementType();
20604 unsigned NumElts = VT.getVectorNumElements();
20605 unsigned EltSizeInBits = EltVT.getScalarSizeInBits();
20606
20607 if (EltVT == MVT::i1)
20608 return InsertBitToMaskVector(Op, DAG, Subtarget);
20609
20610 SDLoc dl(Op);
20611 SDValue N0 = Op.getOperand(0);
20612 SDValue N1 = Op.getOperand(1);
20613 SDValue N2 = Op.getOperand(2);
20614 auto *N2C = dyn_cast<ConstantSDNode>(N2);
20615
20616 if (!N2C) {
20617 // For variable insertion indices we're usually better off spilling to the stack,
20618 // but AVX512 can use a variable compare+select by comparing against all
20619 // possible vector indices, and FP insertion has less gpr->simd traffic.
20620 if (!(Subtarget.hasBWI() ||
20621 (Subtarget.hasAVX512() && EltSizeInBits >= 32) ||
20622 (Subtarget.hasSSE41() && (EltVT == MVT::f32 || EltVT == MVT::f64))))
20623 return SDValue();
20624
20625 MVT IdxSVT = MVT::getIntegerVT(EltSizeInBits);
20626 MVT IdxVT = MVT::getVectorVT(IdxSVT, NumElts);
20627 if (!isTypeLegal(IdxSVT) || !isTypeLegal(IdxVT))
20628 return SDValue();
20629
20630 SDValue IdxExt = DAG.getZExtOrTrunc(N2, dl, IdxSVT);
20631 SDValue IdxSplat = DAG.getSplatBuildVector(IdxVT, dl, IdxExt);
20632 SDValue EltSplat = DAG.getSplatBuildVector(VT, dl, N1);
20633
20634 SmallVector<SDValue, 16> RawIndices;
20635 for (unsigned I = 0; I != NumElts; ++I)
20636 RawIndices.push_back(DAG.getConstant(I, dl, IdxSVT));
20637 SDValue Indices = DAG.getBuildVector(IdxVT, dl, RawIndices);
20638
20639 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
20640 return DAG.getSelectCC(dl, IdxSplat, Indices, EltSplat, N0,
20641 ISD::CondCode::SETEQ);
20642 }
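// Worked example (illustrative): for a v4i32 insert at a variable index N2,
// the select above computes
//   result[i] = (N2 == i) ? N1 : N0[i]   for i = 0..3
// by splatting N2 and N1 and comparing against the constant vector {0,1,2,3}.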
20643
20644 if (N2C->getAPIntValue().uge(NumElts))
20645 return SDValue();
20646 uint64_t IdxVal = N2C->getZExtValue();
20647
20648 bool IsZeroElt = X86::isZeroNode(N1);
20649 bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
20650
20651 if (IsZeroElt || IsAllOnesElt) {
20652 // Lower insertion of v16i8/v32i8/v16i16 -1 elts as an 'OR' blend.
20653 // We don't deal with i8 0 since it appears to be handled elsewhere.
20654 if (IsAllOnesElt &&
20655 ((VT == MVT::v16i8 && !Subtarget.hasSSE41()) ||
20656 ((VT == MVT::v32i8 || VT == MVT::v16i16) && !Subtarget.hasInt256()))) {
20657 SDValue ZeroCst = DAG.getConstant(0, dl, VT.getScalarType());
20658 SDValue OnesCst = DAG.getAllOnesConstant(dl, VT.getScalarType());
20659 SmallVector<SDValue, 8> CstVectorElts(NumElts, ZeroCst);
20660 CstVectorElts[IdxVal] = OnesCst;
20661 SDValue CstVector = DAG.getBuildVector(VT, dl, CstVectorElts);
20662 return DAG.getNode(ISD::OR, dl, VT, N0, CstVector);
20663 }
20664 // See if we can do this more efficiently with a blend shuffle with a
20665 // rematerializable vector.
20666 if (Subtarget.hasSSE41() &&
20667 (EltSizeInBits >= 16 || (IsZeroElt && !VT.is128BitVector()))) {
20668 SmallVector<int, 8> BlendMask;
20669 for (unsigned i = 0; i != NumElts; ++i)
20670 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
20671 SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
20672 : getOnesVector(VT, DAG, dl);
20673 return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
20674 }
20675 }
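// Worked example (illustrative): with the 'OR' blend above, inserting -1 into
// lane 3 of a v16i8 without SSE4.1 becomes
//   N0 | <0, 0, 0, -1, 0, ..., 0>
// since OR with 0 preserves a lane and OR with all-ones forces it to -1.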
20676
20677 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
20678 // into that, and then insert the subvector back into the result.
20679 if (VT.is256BitVector() || VT.is512BitVector()) {
20680 // With a 256-bit vector, we can insert into the zero element efficiently
20681 // using a blend if we have AVX or AVX2 and the right data type.
20682 if (VT.is256BitVector() && IdxVal == 0) {
20683 // TODO: It is worthwhile to cast integer to floating point and back
20684 // and incur a domain crossing penalty if that's what we'll end up
20685 // doing anyway after extracting to a 128-bit vector.
20686 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
20687 (Subtarget.hasAVX2() && (EltVT == MVT::i32 || EltVT == MVT::i64))) {
20688 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
20689 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec,
20690 DAG.getTargetConstant(1, dl, MVT::i8));
20691 }
20692 }
20693
20694 unsigned NumEltsIn128 = 128 / EltSizeInBits;
20695 assert(isPowerOf2_32(NumEltsIn128) &&
20696 "Vectors will always have power-of-two number of elements.");
20697
20698 // If we are not inserting into the low 128-bit vector chunk,
20699 // then prefer the broadcast+blend sequence.
20700 // FIXME: relax the profitability check iff all N1 uses are insertions.
20701 if (IdxVal >= NumEltsIn128 &&
20702 ((Subtarget.hasAVX2() && EltSizeInBits != 8) ||
20703 (Subtarget.hasAVX() && (EltSizeInBits >= 32) &&
20704 X86::mayFoldLoad(N1, Subtarget)))) {
20705 SDValue N1SplatVec = DAG.getSplatBuildVector(VT, dl, N1);
20706 SmallVector<int, 8> BlendMask;
20707 for (unsigned i = 0; i != NumElts; ++i)
20708 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
20709 return DAG.getVectorShuffle(VT, dl, N0, N1SplatVec, BlendMask);
20710 }
20711
20712 // Get the desired 128-bit vector chunk.
20713 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
20714
20715 // Insert the element into the desired chunk.
20716 // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
20717 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
20718
20719 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
20720 DAG.getIntPtrConstant(IdxIn128, dl));
20721
20722 // Insert the changed part back into the bigger vector
20723 return insert128BitVector(N0, V, IdxVal, DAG, dl);
20724 }
20725 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
20726
20727 // This will be just movw/movd/movq/movsh/movss/movsd.
20728 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) {
20729 if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
20730 EltVT == MVT::f16 || EltVT == MVT::i64) {
20731 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
20732 return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
20733 }
20734
20735 // We can't directly insert an i8 or i16 into a vector, so zero extend
20736 // it to i32 first.
20737 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
20738 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, N1);
20739 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
20740 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, N1);
20741 N1 = getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
20742 return DAG.getBitcast(VT, N1);
20743 }
20744 }
20745
20746 // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
20747 // argument. SSE41 is required for pinsrb.
20748 if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
20749 unsigned Opc;
20750 if (VT == MVT::v8i16) {
20751 assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
20752 Opc = X86ISD::PINSRW;
20753 } else {
20754 assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
20755 assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
20756 Opc = X86ISD::PINSRB;
20757 }
20758
20759 assert(N1.getValueType() != MVT::i32 && "Unexpected VT");
20760 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
20761 N2 = DAG.getTargetConstant(IdxVal, dl, MVT::i8);
20762 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
20763 }
20764
20765 if (Subtarget.hasSSE41()) {
20766 if (EltVT == MVT::f32) {
20767 // Bits [7:6] of the constant are the source select. This will always be
20768 // zero here. The DAG Combiner may combine an extract_elt index into
20769 // these bits. For example (insert (extract, 3), 2) could be matched by
20770 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
20771 // Bits [5:4] of the constant are the destination select. This is the
20772 // value of the incoming immediate.
20773 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
20774 // combine either bitwise AND or insert of float 0.0 to set these bits.
20775
20776 bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
20777 if (IdxVal == 0 && (!MinSize || !X86::mayFoldLoad(N1, Subtarget))) {
20778 // If this is an insertion of 32-bits into the low 32-bits of
20779 // a vector, we prefer to generate a blend with immediate rather
20780 // than an insertps. Blends are simpler operations in hardware and so
20781 // will always have equal or better performance than insertps.
20782 // But if optimizing for size and there's a load folding opportunity,
20783 // generate insertps because blendps does not have a 32-bit memory
20784 // operand form.
20785 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
20786 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1,
20787 DAG.getTargetConstant(1, dl, MVT::i8));
20788 }
20789 // Create this as a scalar-to-vector.
20790 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
20791 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1,
20792 DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8));
20793 }
20794
20795 // PINSR* works with constant index.
20796 if (EltVT == MVT::i32 || EltVT == MVT::i64)
20797 return Op;
20798 }
20799
20800 return SDValue();
20801}
20802
20803static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
20804 SelectionDAG &DAG) {
20805 SDLoc dl(Op);
20806 MVT OpVT = Op.getSimpleValueType();
20807
20808 // It's always cheaper to replace an xor+movd with xorps, and it simplifies
20809 // further combines.
20810 if (X86::isZeroNode(Op.getOperand(0)))
20811 return getZeroVector(OpVT, Subtarget, DAG, dl);
20812
20813 // If this is a wider-than-128-bit vector result, first insert into a 128-bit
20814 // vector and then insert that into the full-width vector.
20815 if (!OpVT.is128BitVector()) {
20816 // Insert into a 128-bit vector.
20817 unsigned SizeFactor = OpVT.getSizeInBits() / 128;
20818 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
20819 OpVT.getVectorNumElements() / SizeFactor);
20820
20821 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
20822
20823 // Insert the 128-bit vector.
20824 return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
20825 }
20826 assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 &&
20827 "Expected an SSE type!");
20828
20829 // Pass through a v4i32 or v8i16 SCALAR_TO_VECTOR as that's what we use in
20830 // tblgen.
20831 if (OpVT == MVT::v4i32 || (OpVT == MVT::v8i16 && Subtarget.hasFP16()))
20832 return Op;
20833
20834 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
20835 return DAG.getBitcast(
20836 OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
20837}
20838
20839// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
20840// simple superregister reference or explicit instructions to insert
20841// the upper bits of a vector.
20842static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
20843 SelectionDAG &DAG) {
20844 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
20845
20846 return insert1BitVector(Op, DAG, Subtarget);
20847}
20848
20849static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
20850 SelectionDAG &DAG) {
20851 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
20852 "Only vXi1 extract_subvectors need custom lowering");
20853
20854 SDLoc dl(Op);
20855 SDValue Vec = Op.getOperand(0);
20856 uint64_t IdxVal = Op.getConstantOperandVal(1);
20857
20858 if (IdxVal == 0) // the operation is legal
20859 return Op;
20860
20861 MVT VecVT = Vec.getSimpleValueType();
20862 unsigned NumElems = VecVT.getVectorNumElements();
20863
20864 // Extend to natively supported kshift.
20865 MVT WideVecVT = VecVT;
20866 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
20867 WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
20868 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
20869 DAG.getUNDEF(WideVecVT), Vec,
20870 DAG.getIntPtrConstant(0, dl));
20871 }
20872
20873 // Shift to the LSB.
20874 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
20875 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
20876
20877 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
20878 DAG.getIntPtrConstant(0, dl));
20879}
20880
20881// Returns the appropriate wrapper opcode for a global reference.
20882unsigned X86TargetLowering::getGlobalWrapperKind(
20883 const GlobalValue *GV, const unsigned char OpFlags) const {
20884 // References to absolute symbols are never PC-relative.
20885 if (GV && GV->isAbsoluteSymbolRef())
20886 return X86ISD::Wrapper;
20887
20888 CodeModel::Model M = getTargetMachine().getCodeModel();
20889 if (Subtarget.isPICStyleRIPRel() &&
20890 (M == CodeModel::Small || M == CodeModel::Kernel))
20891 return X86ISD::WrapperRIP;
20892
20893 // In the medium model, functions can always be referenced RIP-relatively,
20894 // since they must be within 2GiB. This is also possible in non-PIC mode, and
20895 // shorter than the 64-bit absolute immediate that would otherwise be emitted.
20896 if (M == CodeModel::Medium && isa_and_nonnull<Function>(GV))
20897 return X86ISD::WrapperRIP;
20898
20899 // GOTPCREL references must always use RIP.
20900 if (OpFlags == X86II::MO_GOTPCREL || OpFlags == X86II::MO_GOTPCREL_NORELAX)
20901 return X86ISD::WrapperRIP;
20902
20903 return X86ISD::Wrapper;
20904}
20905
20906// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
20907 // their target counterparts wrapped in the X86ISD::Wrapper node. Suppose N is
20908 // one of the above-mentioned nodes. It has to be wrapped because otherwise
20909 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
20910 // be used to form an addressing mode. These wrapped nodes will be selected
20911// into MOV32ri.
20912SDValue
20913X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
20914 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
20915
20916 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
20917 // global base reg.
20918 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
20919
20920 auto PtrVT = getPointerTy(DAG.getDataLayout());
20921 SDValue Result = DAG.getTargetConstantPool(
20922 CP->getConstVal(), PtrVT, CP->getAlign(), CP->getOffset(), OpFlag);
20923 SDLoc DL(CP);
20924 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
20925 // With PIC, the address is actually $g + Offset.
20926 if (OpFlag) {
20927 Result =
20928 DAG.getNode(ISD::ADD, DL, PtrVT,
20929 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
20930 }
20931
20932 return Result;
20933}
20934
20935SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
20936 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
20937
20938 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
20939 // global base reg.
20940 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
20941
20942 auto PtrVT = getPointerTy(DAG.getDataLayout());
20943 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
20944 SDLoc DL(JT);
20945 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
20946
20947 // With PIC, the address is actually $g + Offset.
20948 if (OpFlag)
20949 Result =
20950 DAG.getNode(ISD::ADD, DL, PtrVT,
20951 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
20952
20953 return Result;
20954}
20955
20956SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
20957 SelectionDAG &DAG) const {
20958 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
20959}
20960
20961SDValue
20962X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
20963 // Create the TargetBlockAddressAddress node.
20964 unsigned char OpFlags =
20965 Subtarget.classifyBlockAddressReference();
20966 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
20967 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
20968 SDLoc dl(Op);
20969 auto PtrVT = getPointerTy(DAG.getDataLayout());
20970 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
20971 Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result);
20972
20973 // With PIC, the address is actually $g + Offset.
20974 if (isGlobalRelativeToPICBase(OpFlags)) {
20975 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
20976 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
20977 }
20978
20979 return Result;
20980}
20981
20982/// Creates target global address or external symbol nodes for calls or
20983/// other uses.
20984SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
20985 bool ForCall) const {
20986 // Unpack the global address or external symbol.
20987 const SDLoc &dl = SDLoc(Op);
20988 const GlobalValue *GV = nullptr;
20989 int64_t Offset = 0;
20990 const char *ExternalSym = nullptr;
20991 if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) {
20992 GV = G->getGlobal();
20993 Offset = G->getOffset();
20994 } else {
20995 const auto *ES = cast<ExternalSymbolSDNode>(Op);
20996 ExternalSym = ES->getSymbol();
20997 }
20998
20999 // Calculate some flags for address lowering.
21000 const Module &Mod = *DAG.getMachineFunction().getFunction().getParent();
21001 unsigned char OpFlags;
21002 if (ForCall)
21003 OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod);
21004 else
21005 OpFlags = Subtarget.classifyGlobalReference(GV, Mod);
21006 bool HasPICReg = isGlobalRelativeToPICBase(OpFlags);
21007 bool NeedsLoad = isGlobalStubReference(OpFlags);
21008
21009 CodeModel::Model M = DAG.getTarget().getCodeModel();
21010 auto PtrVT = getPointerTy(DAG.getDataLayout());
21011 SDValue Result;
21012
21013 if (GV) {
21014 // Create a target global address if this is a global. If possible, fold the
21015 // offset into the global address reference. Otherwise, ADD it on later.
21016 // Suppress the folding if Offset is negative: movl foo-1, %eax is not
21017 // allowed because if the address of foo is 0, the ELF R_X86_64_32
21018 // relocation will compute to a negative value, which is invalid.
21019 int64_t GlobalOffset = 0;
21020 if (OpFlags == X86II::MO_NO_FLAG && Offset >= 0 &&
21021 X86::isOffsetSuitableForCodeModel(Offset, M, true)) {
21022 std::swap(GlobalOffset, Offset);
21023 }
21024 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags);
21025 } else {
21026 // If this is not a global address, this must be an external symbol.
21027 Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags);
21028 }
21029
21030 // If this is a direct call, avoid the wrapper if we don't need to do any
21031 // loads or adds. This allows SDAG ISel to match direct calls.
21032 if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
21033 return Result;
21034
21035 Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
21036
21037 // With PIC, the address is actually $g + Offset.
21038 if (HasPICReg) {
21039 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
21040 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
21041 }
21042
21043 // For globals that require a load from a stub to get the address, emit the
21044 // load.
21045 if (NeedsLoad)
21046 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
21047 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
21048
21049 // If there was a non-zero offset that we didn't fold, create an explicit
21050 // addition for it.
21051 if (Offset != 0)
21052 Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
21053 DAG.getConstant(Offset, dl, PtrVT));
21054
21055 return Result;
21056}
21057
21058SDValue
21059X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
21060 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
21061}
21062
21063static SDValue
21064GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
21065 SDValue *InGlue, const EVT PtrVT, unsigned ReturnReg,
21066 unsigned char OperandFlags, bool LocalDynamic = false) {
21067 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
21068 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
21069 SDLoc dl(GA);
21070 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
21071 GA->getValueType(0),
21072 GA->getOffset(),
21073 OperandFlags);
21074
21075 X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
21076 : X86ISD::TLSADDR;
21077
21078 if (InGlue) {
21079 SDValue Ops[] = { Chain, TGA, *InGlue };
21080 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
21081 } else {
21082 SDValue Ops[] = { Chain, TGA };
21083 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
21084 }
21085
21086 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
21087 MFI.setAdjustsStack(true);
21088 MFI.setHasCalls(true);
21089
21090 SDValue Glue = Chain.getValue(1);
21091 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Glue);
21092}
21093
21094// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
21095static SDValue
21096LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
21097 const EVT PtrVT) {
21098 SDValue InGlue;
21099 SDLoc dl(GA); // ? function entry point might be better
21100 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
21101 DAG.getNode(X86ISD::GlobalBaseReg,
21102 SDLoc(), PtrVT), InGlue);
21103 InGlue = Chain.getValue(1);
21104
21105 return GetTLSADDR(DAG, Chain, GA, &InGlue, PtrVT, X86::EAX, X86II::MO_TLSGD);
21106}
21107
21108// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit LP64
21109static SDValue
21110LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
21111 const EVT PtrVT) {
21112 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
21113 X86::RAX, X86II::MO_TLSGD);
21114}
21115
21116// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit ILP32
21117static SDValue
21118LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
21119 const EVT PtrVT) {
21120 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
21121 X86::EAX, X86II::MO_TLSGD);
21122}
21123
21124static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
21125 SelectionDAG &DAG, const EVT PtrVT,
21126 bool Is64Bit, bool Is64BitLP64) {
21127 SDLoc dl(GA);
21128
21129 // Get the start address of the TLS block for this module.
21130 X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
21131 .getInfo<X86MachineFunctionInfo>();
21132 MFI->incNumLocalDynamicTLSAccesses();
21133
21134 SDValue Base;
21135 if (Is64Bit) {
21136 unsigned ReturnReg = Is64BitLP64 ? X86::RAX : X86::EAX;
21137 Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, ReturnReg,
21138 X86II::MO_TLSLD, /*LocalDynamic=*/true);
21139 } else {
21140 SDValue InGlue;
21141 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
21142 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InGlue);
21143 InGlue = Chain.getValue(1);
21144 Base = GetTLSADDR(DAG, Chain, GA, &InGlue, PtrVT, X86::EAX,
21145 X86II::MO_TLSLDM, /*LocalDynamic=*/true);
21146 }
21147
21148 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
21149 // of Base.
21150
21151 // Build x@dtpoff.
21152 unsigned char OperandFlags = X86II::MO_DTPOFF;
21153 unsigned WrapperKind = X86ISD::Wrapper;
21154 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
21155 GA->getValueType(0),
21156 GA->getOffset(), OperandFlags);
21157 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
21158
21159 // Add x@dtpoff with the base.
21160 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
21161}
21162
21163// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
21164static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
21165 const EVT PtrVT, TLSModel::Model model,
21166 bool is64Bit, bool isPIC) {
21167 SDLoc dl(GA);
21168
21169 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
21170 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
21171 is64Bit ? 257 : 256));
21172
21173 SDValue ThreadPointer =
21174 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
21175 MachinePointerInfo(Ptr));
21176
21177 unsigned char OperandFlags = 0;
21178 // Most TLS accesses are not RIP-relative, even on x86-64. One exception is
21179 // initial exec.
21180 unsigned WrapperKind = X86ISD::Wrapper;
21181 if (model == TLSModel::LocalExec) {
21182 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
21183 } else if (model == TLSModel::InitialExec) {
21184 if (is64Bit) {
21185 OperandFlags = X86II::MO_GOTTPOFF;
21186 WrapperKind = X86ISD::WrapperRIP;
21187 } else {
21188 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
21189 }
21190 } else {
21191 llvm_unreachable("Unexpected model");
21192 }
21193
21194 // emit "addl x@ntpoff,%eax" (local exec)
21195 // or "addl x@indntpoff,%eax" (initial exec)
21196 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
21197 SDValue TGA =
21198 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
21199 GA->getOffset(), OperandFlags);
21200 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
21201
21202 if (model == TLSModel::InitialExec) {
21203 if (isPIC && !is64Bit) {
21204 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
21205 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
21206 Offset);
21207 }
21208
21209 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
21210 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
21211 }
21212
21213 // The address of the thread local variable is the add of the thread
21214 // pointer with the offset of the variable.
21215 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
21216}
21217
21218SDValue
21219X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
21220
21221 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
21222
21223 if (DAG.getTarget().useEmulatedTLS())
21224 return LowerToTLSEmulatedModel(GA, DAG);
21225
21226 const GlobalValue *GV = GA->getGlobal();
21227 auto PtrVT = getPointerTy(DAG.getDataLayout());
21228 bool PositionIndependent = isPositionIndependent();
21229
21230 if (Subtarget.isTargetELF()) {
21231 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
21232 switch (model) {
21233 case TLSModel::GeneralDynamic:
21234 if (Subtarget.is64Bit()) {
21235 if (Subtarget.isTarget64BitLP64())
21236 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
21237 return LowerToTLSGeneralDynamicModelX32(GA, DAG, PtrVT);
21238 }
21239 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
21240 case TLSModel::LocalDynamic:
21241 return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit(),
21242 Subtarget.isTarget64BitLP64());
21243 case TLSModel::InitialExec:
21244 case TLSModel::LocalExec:
21245 return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
21246 PositionIndependent);
21247 }
21248 llvm_unreachable("Unknown TLS model.");
21249 }
21250
21251 if (Subtarget.isTargetDarwin()) {
21252 // Darwin only has one model of TLS. Lower to that.
21253 unsigned char OpFlag = 0;
21254 unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
21255 X86ISD::WrapperRIP : X86ISD::Wrapper;
21256
21257 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
21258 // global base reg.
21259 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
21260 if (PIC32)
21261 OpFlag = X86II::MO_TLVP_PIC_BASE;
21262 else
21263 OpFlag = X86II::MO_TLVP;
21264 SDLoc DL(Op);
21265 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
21266 GA->getValueType(0),
21267 GA->getOffset(), OpFlag);
21268 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
21269
21270 // With PIC32, the address is actually $g + Offset.
21271 if (PIC32)
21272 Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
21273 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
21274 Offset);
21275
21276 // Lowering the machine ISD will make sure everything is in the right
21277 // location.
21278 SDValue Chain = DAG.getEntryNode();
21279 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
21280 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
21281 SDValue Args[] = { Chain, Offset };
21282 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
21283 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Chain.getValue(1), DL);
21284
21285 // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
21286 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
21287 MFI.setAdjustsStack(true);
21288
21289 // And our return value (tls address) is in the standard call return value
21290 // location.
21291 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
21292 return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
21293 }
21294
21295 if (Subtarget.isOSWindows()) {
21296 // Just use the implicit TLS architecture
21297 // Need to generate something similar to:
21298 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
21299 // ; from TEB
21300 // mov ecx, dword [rel _tls_index]: Load index (from C runtime)
21301 // mov rcx, qword [rdx+rcx*8]
21302 // mov eax, .tls$:tlsvar
21303 // [rax+rcx] contains the address
21304 // Windows 64bit: gs:0x58
21305 // Windows 32bit: fs:__tls_array
21306
21307 SDLoc dl(GA);
21308 SDValue Chain = DAG.getEntryNode();
21309
21310 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
21311 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
21312 // use its literal value of 0x2C.
21313 Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
21314 ? Type::getInt8PtrTy(*DAG.getContext(),
21315 256)
21316 : Type::getInt32PtrTy(*DAG.getContext(),
21317 257));
21318
21319 SDValue TlsArray = Subtarget.is64Bit()
21320 ? DAG.getIntPtrConstant(0x58, dl)
21321 : (Subtarget.isTargetWindowsGNU()
21322 ? DAG.getIntPtrConstant(0x2C, dl)
21323 : DAG.getExternalSymbol("_tls_array", PtrVT));
21324
21325 SDValue ThreadPointer =
21326 DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
21327
21328 SDValue res;
21329 if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
21330 res = ThreadPointer;
21331 } else {
21332 // Load the _tls_index variable
21333 SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
21334 if (Subtarget.is64Bit())
21335 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
21336 MachinePointerInfo(), MVT::i32);
21337 else
21338 IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
21339
21340 const DataLayout &DL = DAG.getDataLayout();
21341 SDValue Scale =
21342 DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
21343 IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
21344
21345 res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
21346 }
21347
21348 res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
21349
21350 // Get the offset of start of .tls section
21351 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
21352 GA->getValueType(0),
21353 GA->getOffset(), X86II::MO_SECREL);
21354 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
21355
21356 // The address of the thread local variable is the add of the thread
21357 // pointer with the offset of the variable.
21358 return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
21359 }
21360
21361 llvm_unreachable("TLS not implemented for this target.");
21362}
21363
21364/// Lower SRA_PARTS and friends, which return two i32 values
21365/// and take a 2 x i32 value to shift plus a shift amount.
21366/// TODO: Can this be moved to general expansion code?
21367static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
21368 SDValue Lo, Hi;
21369 DAG.getTargetLoweringInfo().expandShiftParts(Op.getNode(), Lo, Hi, DAG);
21370 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
21371}
21372
21373// Try to use a packed vector operation to handle i64 on 32-bit targets when
21374// AVX512DQ is enabled.
21375static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG,
21376 const X86Subtarget &Subtarget) {
21377 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
21378 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
21379 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
21380 Op.getOpcode() == ISD::UINT_TO_FP) &&
21381 "Unexpected opcode!");
21382 bool IsStrict = Op->isStrictFPOpcode();
21383 unsigned OpNo = IsStrict ? 1 : 0;
21384 SDValue Src = Op.getOperand(OpNo);
21385 MVT SrcVT = Src.getSimpleValueType();
21386 MVT VT = Op.getSimpleValueType();
21387
21388 if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
21389 (VT != MVT::f32 && VT != MVT::f64))
21390 return SDValue();
21391
21392 // Pack the i64 into a vector, do the operation and extract.
21393
21394 // Use a 256-bit vector to ensure the result is 128 bits for the f32 case.
21395 unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
21396 MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
21397 MVT VecVT = MVT::getVectorVT(VT, NumElts);
21398
21399 SDLoc dl(Op);
21400 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
21401 if (IsStrict) {
21402 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other},
21403 {Op.getOperand(0), InVec});
21404 SDValue Chain = CvtVec.getValue(1);
21405 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
21406 DAG.getIntPtrConstant(0, dl));
21407 return DAG.getMergeValues({Value, Chain}, dl);
21408 }
21409
21410 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
21411
21412 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
21413 DAG.getIntPtrConstant(0, dl));
21414}
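// A minimal standalone sketch (not from X86ISelLowering.cpp), assuming an
// AVX-512DQ target and <immintrin.h>: the same pack / convert / extract idea as
// LowerI64IntToFP_AVX512DQ above, written with intrinsics for the f64 case.
#include <immintrin.h>
#include <cstdint>

static double i64ToDoubleViaVector(int64_t X) {
  __m512i V = _mm512_set1_epi64(X);                  // lane 0 holds X
  __m512d C = _mm512_cvtepi64_pd(V);                 // VCVTQQ2PD: packed i64 -> f64
  return _mm_cvtsd_f64(_mm512_castpd512_pd128(C));   // read back lane 0
}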
21415
21416// Try to use a packed vector operation to handle i64 on 32-bit targets.
21417static SDValue LowerI64IntToFP16(SDValue Op, SelectionDAG &DAG,
21418 const X86Subtarget &Subtarget) {
21419 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
21420 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
21421 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
21422 Op.getOpcode() == ISD::UINT_TO_FP) &&
21423 "Unexpected opcode!");
21424 bool IsStrict = Op->isStrictFPOpcode();
21425 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21426 MVT SrcVT = Src.getSimpleValueType();
21427 MVT VT = Op.getSimpleValueType();
21428
21429 if (SrcVT != MVT::i64 || Subtarget.is64Bit() || VT != MVT::f16)
21430 return SDValue();
21431
21432 // Pack the i64 into a vector, do the operation and extract.
21433
21434 assert(Subtarget.hasFP16() && "Expected FP16");
21435
21436 SDLoc dl(Op);
21437 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
21438 if (IsStrict) {
21439 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {MVT::v2f16, MVT::Other},
21440 {Op.getOperand(0), InVec});
21441 SDValue Chain = CvtVec.getValue(1);
21442 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
21443 DAG.getIntPtrConstant(0, dl));
21444 return DAG.getMergeValues({Value, Chain}, dl);
21445 }
21446
21447 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, MVT::v2f16, InVec);
21448
21449 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
21450 DAG.getIntPtrConstant(0, dl));
21451}
21452
21453static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
21454 const X86Subtarget &Subtarget) {
21455 switch (Opcode) {
21456 case ISD::SINT_TO_FP:
21457 // TODO: Handle wider types with AVX/AVX512.
21458 if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
21459 return false;
21460 // CVTDQ2PS or (V)CVTDQ2PD
21461 return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
21462
21463 case ISD::UINT_TO_FP:
21464 // TODO: Handle wider types and i64 elements.
21465 if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
21466 return false;
21467 // VCVTUDQ2PS or VCVTUDQ2PD
21468 return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
21469
21470 default:
21471 return false;
21472 }
21473}
21474
21475/// Given a scalar cast operation that is extracted from a vector, try to
21476/// vectorize the cast op followed by extraction. This will avoid an expensive
21477/// round-trip between XMM and GPR.
21478static SDValue vectorizeExtractedCast(SDValue Cast, SelectionDAG &DAG,
21479 const X86Subtarget &Subtarget) {
21480 // TODO: This could be enhanced to handle smaller integer types by peeking
21481 // through an extend.
21482 SDValue Extract = Cast.getOperand(0);
21483 MVT DestVT = Cast.getSimpleValueType();
21484 if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
21485 !isa<ConstantSDNode>(Extract.getOperand(1)))
21486 return SDValue();
21487
21488 // See if we have a 128-bit vector cast op for this type of cast.
21489 SDValue VecOp = Extract.getOperand(0);
21490 MVT FromVT = VecOp.getSimpleValueType();
21491 unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits();
21492 MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM);
21493 MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM);
21494 if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget))
21495 return SDValue();
21496
21497 // If we are extracting from a non-zero element, first shuffle the source
21498 // vector to allow extracting from element zero.
21499 SDLoc DL(Cast);
21500 if (!isNullConstant(Extract.getOperand(1))) {
21501 SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);
21502 Mask[0] = Extract.getConstantOperandVal(1);
21503 VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask);
21504 }
21505 // If the source vector is wider than 128-bits, extract the low part. Do not
21506 // create an unnecessarily wide vector cast op.
21507 if (FromVT != Vec128VT)
21508 VecOp = extract128BitVector(VecOp, 0, DAG, DL);
21509
21510 // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
21511 // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
21512 SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp);
21513 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,
21514 DAG.getIntPtrConstant(0, DL));
21515}
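// A minimal standalone sketch (not from X86ISelLowering.cpp), assuming SSE2 and
// <immintrin.h>: the shape of the transform above for
// sitofp(extractelement <4 x i32> %v, 2) -> float. Rather than moving the
// element to a GPR, shuffle it to lane 0, run the packed CVTDQ2PS, and read
// lane 0 back out.
#include <immintrin.h>

static float convertElement2(__m128i V) {
  __m128i Shuf = _mm_shuffle_epi32(V, _MM_SHUFFLE(2, 2, 2, 2)); // lane 0 = element 2
  __m128 Cvt = _mm_cvtepi32_ps(Shuf);                           // packed i32 -> f32
  return _mm_cvtss_f32(Cvt);                                    // extract lane 0
}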
21516
21517/// Given a scalar cast to FP with a cast to integer operand (almost an ftrunc),
21518/// try to vectorize the cast ops. This will avoid an expensive round-trip
21519/// between XMM and GPR.
21520static SDValue lowerFPToIntToFP(SDValue CastToFP, SelectionDAG &DAG,
21521 const X86Subtarget &Subtarget) {
21522 // TODO: Allow FP_TO_UINT.
21523 SDValue CastToInt = CastToFP.getOperand(0);
21524 MVT VT = CastToFP.getSimpleValueType();
21525 if (CastToInt.getOpcode() != ISD::FP_TO_SINT || VT.isVector())
21526 return SDValue();
21527
21528 MVT IntVT = CastToInt.getSimpleValueType();
21529 SDValue X = CastToInt.getOperand(0);
21530 MVT SrcVT = X.getSimpleValueType();
21531 if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
21532 return SDValue();
21533
21534 // See if we have 128-bit vector cast instructions for this type of cast.
21535 // We need cvttps2dq/cvttpd2dq and cvtdq2ps/cvtdq2pd.
21536 if (!Subtarget.hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) ||
21537 IntVT != MVT::i32)
21538 return SDValue();
21539
21540 unsigned SrcSize = SrcVT.getSizeInBits();
21541 unsigned IntSize = IntVT.getSizeInBits();
21542 unsigned VTSize = VT.getSizeInBits();
21543 MVT VecSrcVT = MVT::getVectorVT(SrcVT, 128 / SrcSize);
21544 MVT VecIntVT = MVT::getVectorVT(IntVT, 128 / IntSize);
21545 MVT VecVT = MVT::getVectorVT(VT, 128 / VTSize);
21546
21547 // We need target-specific opcodes if this is v2f64 -> v4i32 -> v2f64.
21548 unsigned ToIntOpcode =
21549 SrcSize != IntSize ? X86ISD::CVTTP2SI : (unsigned)ISD::FP_TO_SINT;
21550 unsigned ToFPOpcode =
21551 IntSize != VTSize ? X86ISD::CVTSI2P : (unsigned)ISD::SINT_TO_FP;
21552
21553 // sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0
21554 //
21555 // We are not defining the high elements (for example, zero them) because
21556 // that could nullify any performance advantage that we hoped to gain from
21557 // this vector op hack. We do not expect any adverse effects (like denorm
21558 // penalties) with cast ops.
21559 SDLoc DL(CastToFP);
21560 SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
21561 SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, X);
21562 SDValue VCastToInt = DAG.getNode(ToIntOpcode, DL, VecIntVT, VecX);
21563 SDValue VCastToFP = DAG.getNode(ToFPOpcode, DL, VecVT, VCastToInt);
21564 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCastToFP, ZeroIdx);
21565}
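// A minimal standalone sketch (not from X86ISelLowering.cpp), assuming SSE2 and
// <immintrin.h>: the scalar pattern handled by lowerFPToIntToFP above,
// float y = (float)(int)x, kept entirely in XMM registers instead of bouncing
// the intermediate i32 through a GPR.
#include <immintrin.h>

static float truncToIntAndBack(float X) {
  __m128 V = _mm_set_ss(X);            // X in lane 0, upper lanes zeroed
  __m128i I = _mm_cvttps_epi32(V);     // CVTTPS2DQ: truncating f32 -> i32
  __m128 R = _mm_cvtepi32_ps(I);       // CVTDQ2PS: i32 -> f32
  return _mm_cvtss_f32(R);
}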
21566
21567static SDValue lowerINT_TO_FP_vXi64(SDValue Op, SelectionDAG &DAG,
21568 const X86Subtarget &Subtarget) {
21569 SDLoc DL(Op);
21570 bool IsStrict = Op->isStrictFPOpcode();
21571 MVT VT = Op->getSimpleValueType(0);
21572 SDValue Src = Op->getOperand(IsStrict ? 1 : 0);
21573
21574 if (Subtarget.hasDQI()) {
21575 assert(!Subtarget.hasVLX() && "Unexpected features");
21576
21577 assert((Src.getSimpleValueType() == MVT::v2i64 ||
21578 Src.getSimpleValueType() == MVT::v4i64) &&
21579 "Unsupported custom type");
21580
21581 // With AVX512DQ, but not VLX we need to widen to get a 512-bit result type.
21582 assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&
21583 "Unexpected VT!");
21584 MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
21585
21586 // Need to concat with zero vector for strict fp to avoid spurious
21587 // exceptions.
21588 SDValue Tmp = IsStrict ? DAG.getConstant(0, DL, MVT::v8i64)
21589 : DAG.getUNDEF(MVT::v8i64);
21590 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src,
21591 DAG.getIntPtrConstant(0, DL));
21592 SDValue Res, Chain;
21593 if (IsStrict) {
21594 Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
21595 {Op->getOperand(0), Src});
21596 Chain = Res.getValue(1);
21597 } else {
21598 Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src);
21599 }
21600
21601 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
21602 DAG.getIntPtrConstant(0, DL));
21603
21604 if (IsStrict)
21605 return DAG.getMergeValues({Res, Chain}, DL);
21606 return Res;
21607 }
21608
21609 bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP ||
21610 Op->getOpcode() == ISD::STRICT_SINT_TO_FP;
21611 if (VT != MVT::v4f32 || IsSigned)
21612 return SDValue();
21613
21614 SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64);
21615 SDValue One = DAG.getConstant(1, DL, MVT::v4i64);
21616 SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64,
21617 DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One),
21618 DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One));
21619 SDValue IsNeg = DAG.getSetCC(DL, MVT::v4i64, Src, Zero, ISD::SETLT);
21620 SDValue SignSrc = DAG.getSelect(DL, MVT::v4i64, IsNeg, Sign, Src);
21621 SmallVector<SDValue, 4> SignCvts(4);
21622 SmallVector<SDValue, 4> Chains(4);
21623 for (int i = 0; i != 4; ++i) {
21624 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
21625 DAG.getIntPtrConstant(i, DL));
21626 if (IsStrict) {
21627 SignCvts[i] =
21628 DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other},
21629 {Op.getOperand(0), Elt});
21630 Chains[i] = SignCvts[i].getValue(1);
21631 } else {
21632 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Elt);
21633 }
21634 }
21635 SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts);
21636
21637 SDValue Slow, Chain;
21638 if (IsStrict) {
21639 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
21640 Slow = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other},
21641 {Chain, SignCvt, SignCvt});
21642 Chain = Slow.getValue(1);
21643 } else {
21644 Slow = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt);
21645 }
21646
21647 IsNeg = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i32, IsNeg);
21648 SDValue Cvt = DAG.getSelect(DL, MVT::v4f32, IsNeg, Slow, SignCvt);
21649
21650 if (IsStrict)
21651 return DAG.getMergeValues({Cvt, Chain}, DL);
21652
21653 return Cvt;
21654}
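// A minimal standalone sketch (not from X86ISelLowering.cpp): the scalar trick
// the unsigned v4i64 -> v4f32 path above vectorizes. If the sign bit is set,
// halve the value while OR-ing back its low bit (so the final rounding is
// unaffected), convert as signed, then double the result with an FP add.
#include <cstdint>

static float u64ToFloat(uint64_t U) {
  if ((int64_t)U >= 0)
    return (float)(int64_t)U;               // already fits in the signed range
  uint64_t Halved = (U >> 1) | (U & 1);     // keep the low bit as a sticky bit
  float F = (float)(int64_t)Halved;
  return F + F;                             // multiply back by two
}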
21655
21656static SDValue promoteXINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
21657 bool IsStrict = Op->isStrictFPOpcode();
21658 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21659 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
21660 MVT VT = Op.getSimpleValueType();
21661 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
21662 SDLoc dl(Op);
21663
21664 SDValue Rnd = DAG.getIntPtrConstant(0, dl);
21665 if (IsStrict)
21666 return DAG.getNode(
21667 ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
21668 {Chain,
21669 DAG.getNode(Op.getOpcode(), dl, {NVT, MVT::Other}, {Chain, Src}),
21670 Rnd});
21671 return DAG.getNode(ISD::FP_ROUND, dl, VT,
21672 DAG.getNode(Op.getOpcode(), dl, NVT, Src), Rnd);
21673}
21674
21675static bool isLegalConversion(MVT VT, bool IsSigned,
21676 const X86Subtarget &Subtarget) {
21677 if (VT == MVT::v4i32 && Subtarget.hasSSE2() && IsSigned)
21678 return true;
21679 if (VT == MVT::v8i32 && Subtarget.hasAVX() && IsSigned)
21680 return true;
21681 if (Subtarget.hasVLX() && (VT == MVT::v4i32 || VT == MVT::v8i32))
21682 return true;
21683 if (Subtarget.useAVX512Regs()) {
21684 if (VT == MVT::v16i32)
21685 return true;
21686 if (VT == MVT::v8i64 && Subtarget.hasDQI())
21687 return true;
21688 }
21689 if (Subtarget.hasDQI() && Subtarget.hasVLX() &&
21690 (VT == MVT::v2i64 || VT == MVT::v4i64))
21691 return true;
21692 return false;
21693}
21694
21695SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
21696 SelectionDAG &DAG) const {
21697 bool IsStrict = Op->isStrictFPOpcode();
21698 unsigned OpNo = IsStrict ? 1 : 0;
21699 SDValue Src = Op.getOperand(OpNo);
21700 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
21701 MVT SrcVT = Src.getSimpleValueType();
21702 MVT VT = Op.getSimpleValueType();
21703 SDLoc dl(Op);
21704
21705 if (isSoftFP16(VT))
21706 return promoteXINT_TO_FP(Op, DAG);
21707 else if (isLegalConversion(SrcVT, true, Subtarget))
21708 return Op;
21709
21710 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
21711 return LowerWin64_INT128_TO_FP(Op, DAG);
21712
21713 if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
21714 return Extract;
21715
21716 if (SDValue R = lowerFPToIntToFP(Op, DAG, Subtarget))
21717 return R;
21718
21719 if (SrcVT.isVector()) {
21720 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
21721 // Note: since v2f64 is a legal type, we don't need to zero-extend the
21722 // source for strict FP.
21723 if (IsStrict)
21724 return DAG.getNode(
21725 X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
21726 {Chain, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
21727 DAG.getUNDEF(SrcVT))});
21728 return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
21729 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
21730 DAG.getUNDEF(SrcVT)));
21731 }
21732 if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64)
21733 return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);
21734
21735 return SDValue();
21736 }
21737
21738 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
21739 "Unknown SINT_TO_FP to lower!");
21740
21741 bool UseSSEReg = isScalarFPTypeInSSEReg(VT);
21742
21743 // These are really Legal; return the operand so the caller accepts it as
21744 // Legal.
21745 if (SrcVT == MVT::i32 && UseSSEReg)
21746 return Op;
21747 if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit())
21748 return Op;
21749
21750 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
21751 return V;
21752 if (SDValue V = LowerI64IntToFP16(Op, DAG, Subtarget))
21753 return V;
21754
21755 // SSE doesn't have an i16 conversion so we need to promote.
21756 if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) {
21757 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Src);
21758 if (IsStrict)
21759 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
21760 {Chain, Ext});
21761
21762 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Ext);
21763 }
21764
21765 if (VT == MVT::f128 || !Subtarget.hasX87())
21766 return SDValue();
21767
21768 SDValue ValueToStore = Src;
21769 if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit())
21770 // Bitcasting to f64 here allows us to do a single 64-bit store from
21771 // an SSE register, avoiding the store forwarding penalty that would come
21772 // with two 32-bit stores.
21773 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
21774
21775 unsigned Size = SrcVT.getStoreSize();
21776 Align Alignment(Size);
21777 MachineFunction &MF = DAG.getMachineFunction();
21778 auto PtrVT = getPointerTy(MF.getDataLayout());
21779 int SSFI = MF.getFrameInfo().CreateStackObject(Size, Alignment, false);
21780 MachinePointerInfo MPI =
21781 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
21782 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
21783 Chain = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Alignment);
21784 std::pair<SDValue, SDValue> Tmp =
21785 BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Alignment, DAG);
21786
21787 if (IsStrict)
21788 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
21789
21790 return Tmp.first;
21791}
21792
21793std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(
21794 EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer,
21795 MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const {
21796 // Build the FILD
21797 SDVTList Tys;
21798 bool useSSE = isScalarFPTypeInSSEReg(DstVT);
21799 if (useSSE)
21800 Tys = DAG.getVTList(MVT::f80, MVT::Other);
21801 else
21802 Tys = DAG.getVTList(DstVT, MVT::Other);
21803
21804 SDValue FILDOps[] = {Chain, Pointer};
21805 SDValue Result =
21806 DAG.getMemIntrinsicNode(X86ISD::FILD, DL, Tys, FILDOps, SrcVT, PtrInfo,
21807 Alignment, MachineMemOperand::MOLoad);
21808 Chain = Result.getValue(1);
21809
21810 if (useSSE) {
21811 MachineFunction &MF = DAG.getMachineFunction();
21812 unsigned SSFISize = DstVT.getStoreSize();
21813 int SSFI =
21814 MF.getFrameInfo().CreateStackObject(SSFISize, Align(SSFISize), false);
21815 auto PtrVT = getPointerTy(MF.getDataLayout());
21816 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
21817 Tys = DAG.getVTList(MVT::Other);
21818 SDValue FSTOps[] = {Chain, Result, StackSlot};
21819 MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand(
21820 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
21821 MachineMemOperand::MOStore, SSFISize, Align(SSFISize));
21822
21823 Chain =
21824 DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps, DstVT, StoreMMO);
21825 Result = DAG.getLoad(
21826 DstVT, DL, Chain, StackSlot,
21827 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
21828 Chain = Result.getValue(1);
21829 }
21830
21831 return { Result, Chain };
21832}
21833
21834/// Horizontal vector math instructions may be slower than normal math with
21835/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
21836/// implementation, and likely shuffle complexity of the alternate sequence.
21837static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
21838 const X86Subtarget &Subtarget) {
21839 bool IsOptimizingSize = DAG.shouldOptForSize();
21840 bool HasFastHOps = Subtarget.hasFastHorizontalOps();
21841 return !IsSingleSource || IsOptimizingSize || HasFastHOps;
21842}
21843
21844/// 64-bit unsigned integer to double expansion.
21845static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
21846 const X86Subtarget &Subtarget) {
21847 // We can't use this algorithm for strict fp: it produces -0.0 instead of +0.0
21848 // when converting 0 while rounding toward negative infinity. The caller will
21849 // fall back to Expand when i64 is legal, or use FILD in 32-bit mode.
21850 assert(!Op->isStrictFPOpcode() && "Expected non-strict uint_to_fp!");
21851 // This algorithm is not obvious. Here is what we're trying to output:
21852 /*
21853 movq %rax, %xmm0
21854 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
21855 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
21856 #ifdef __SSE3__
21857 haddpd %xmm0, %xmm0
21858 #else
21859 pshufd $0x4e, %xmm0, %xmm1
21860 addpd %xmm1, %xmm0
21861 #endif
21862 */
21863
21864 SDLoc dl(Op);
21865 LLVMContext *Context = DAG.getContext();
21866
21867 // Build some magic constants.
21868 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
21869 Constant *C0 = ConstantDataVector::get(*Context, CV0);
21870 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
21871 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, Align(16));
21872
21873 SmallVector<Constant*,2> CV1;
21874 CV1.push_back(
21875 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
21876 APInt(64, 0x4330000000000000ULL))));
21877 CV1.push_back(
21878 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
21879 APInt(64, 0x4530000000000000ULL))));
21880 Constant *C1 = ConstantVector::get(CV1);
21881 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, Align(16));
21882
21883 // Load the 64-bit value into an XMM register.
21884 SDValue XR1 =
21885 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0));
21886 SDValue CLod0 = DAG.getLoad(
21887 MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
21888 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
21889 SDValue Unpck1 =
21890 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
21891
21892 SDValue CLod1 = DAG.getLoad(
21893 MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
21894 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
21895 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
21896 // TODO: Are there any fast-math-flags to propagate here?
21897 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
21898 SDValue Result;
21899
21900 if (Subtarget.hasSSE3() &&
21901 shouldUseHorizontalOp(true, DAG, Subtarget)) {
21902 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
21903 } else {
21904 SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
21905 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
21906 }
21907 Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
21908 DAG.getIntPtrConstant(0, dl));
21909 return Result;
21910}
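// A minimal standalone sketch (not from X86ISelLowering.cpp): the arithmetic
// behind the constant-pool trick above. Planting the two 32-bit halves of the
// u64 in the mantissas of 2^52 and 2^84 yields the exact doubles (2^52 + lo) and
// (2^84 + hi * 2^32); subtracting the biases (the subpd) and summing the halves
// (the haddpd) reconstructs the value with a single rounding.
#include <cstdint>
#include <cstring>

static double u64ToDouble(uint64_t U) {
  uint32_t Lo = (uint32_t)U, Hi = (uint32_t)(U >> 32);
  uint64_t LoBits = 0x4330000000000000ULL | Lo;   // bits of 2^52 + Lo
  uint64_t HiBits = 0x4530000000000000ULL | Hi;   // bits of 2^84 + Hi * 2^32
  double DLo, DHi;
  std::memcpy(&DLo, &LoBits, sizeof(DLo));
  std::memcpy(&DHi, &HiBits, sizeof(DHi));
  DLo -= 0x1.0p52;                                // exact: leaves (double)Lo
  DHi -= 0x1.0p84;                                // exact: leaves Hi * 2^32
  return DHi + DLo;                               // one rounding, as in haddpd
}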
21911
21912/// 32-bit unsigned integer to float expansion.
21913static SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG,
21914 const X86Subtarget &Subtarget) {
21915 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
21916 SDLoc dl(Op);
21917 // FP constant to bias correct the final result.
21918 SDValue Bias = DAG.getConstantFP(
21919 llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::f64);
21920
21921 // Load the 32-bit value into an XMM register.
21922 SDValue Load =
21923 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(OpNo));
21924
21925 // Zero out the upper parts of the register.
21926 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
21927
21928 // Or the load with the bias.
21929 SDValue Or = DAG.getNode(
21930 ISD::OR, dl, MVT::v2i64,
21931 DAG.getBitcast(MVT::v2i64, Load),
21932 DAG.getBitcast(MVT::v2i64,
21933 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
21934 Or =
21935 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
21936 DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
21937
21938 if (Op.getNode()->isStrictFPOpcode()) {
21939 // Subtract the bias.
21940 // TODO: Are there any fast-math-flags to propagate here?
21941 SDValue Chain = Op.getOperand(0);
21942 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other},
21943 {Chain, Or, Bias});
21944
21945 if (Op.getValueType() == Sub.getValueType())
21946 return Sub;
21947
21948 // Handle final rounding.
21949 std::pair<SDValue, SDValue> ResultPair = DAG.getStrictFPExtendOrRound(
21950 Sub, Sub.getValue(1), dl, Op.getSimpleValueType());
21951
21952 return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl);
21953 }
21954
21955 // Subtract the bias.
21956 // TODO: Are there any fast-math-flags to propagate here?
21957 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
21958
21959 // Handle final rounding.
21960 return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
21961}
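// A minimal standalone sketch (not from X86ISelLowering.cpp): the single-constant
// variant of the same bias trick, matching LowerUINT_TO_FP_i32 above. OR-ing a
// u32 into the mantissa of 2^52 gives exactly 2^52 + value, so subtracting the
// bias leaves the exact double, which is then rounded to the destination type.
#include <cstdint>
#include <cstring>

static double u32ToDouble(uint32_t U) {
  uint64_t Bits = 0x4330000000000000ULL | U;   // bits of 2^52 + U
  double D;
  std::memcpy(&D, &Bits, sizeof(D));
  return D - 0x1.0p52;                          // exact: every u32 fits in a double
}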
21962
21963static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
21964 const X86Subtarget &Subtarget,
21965 const SDLoc &DL) {
21966 if (Op.getSimpleValueType() != MVT::v2f64)
21967 return SDValue();
21968
21969 bool IsStrict = Op->isStrictFPOpcode();
21970
21971 SDValue N0 = Op.getOperand(IsStrict ? 1 : 0);
21972 assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
21973
21974 if (Subtarget.hasAVX512()) {
21975 if (!Subtarget.hasVLX()) {
21976 // Let generic type legalization widen this.
21977 if (!IsStrict)
21978 return SDValue();
21979 // Otherwise pad the integer input with 0s and widen the operation.
21980 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
21981 DAG.getConstant(0, DL, MVT::v2i32));
21982 SDValue Res = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other},
21983 {Op.getOperand(0), N0});
21984 SDValue Chain = Res.getValue(1);
21985 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f64, Res,
21986 DAG.getIntPtrConstant(0, DL));
21987 return DAG.getMergeValues({Res, Chain}, DL);
21988 }
21989
21990 // Legalize to v4i32 type.
21991 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
21992 DAG.getUNDEF(MVT::v2i32));
21993 if (IsStrict)
21994 return DAG.getNode(X86ISD::STRICT_CVTUI2P, DL, {MVT::v2f64, MVT::Other},
21995 {Op.getOperand(0), N0});
21996 return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
21997 }
21998
21999 // Zero extend to 2i64, OR with the floating point representation of 2^52.
22000 // This gives us the floating point equivalent of 2^52 + the i32 integer
22001 // since double has 52-bits of mantissa. Then subtract 2^52 in floating
22002 // point leaving just our i32 integers in double format.
22003 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i64, N0);
22004 SDValue VBias = DAG.getConstantFP(
22005 llvm::bit_cast<double>(0x4330000000000000ULL), DL, MVT::v2f64);
22006 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v2i64, ZExtIn,
22007 DAG.getBitcast(MVT::v2i64, VBias));
22008 Or = DAG.getBitcast(MVT::v2f64, Or);
22009
22010 if (IsStrict)
22011 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v2f64, MVT::Other},
22012 {Op.getOperand(0), Or, VBias});
22013 return DAG.getNode(ISD::FSUB, DL, MVT::v2f64, Or, VBias);
22014}
22015
22016static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
22017 const X86Subtarget &Subtarget) {
22018 SDLoc DL(Op);
22019 bool IsStrict = Op->isStrictFPOpcode();
22020 SDValue V = Op->getOperand(IsStrict ? 1 : 0);
22021 MVT VecIntVT = V.getSimpleValueType();
22022 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
22023 "Unsupported custom type");
22024
22025 if (Subtarget.hasAVX512()) {
22026 // With AVX512, but not VLX we need to widen to get a 512-bit result type.
22027 assert(!Subtarget.hasVLX() && "Unexpected features");
22028 MVT VT = Op->getSimpleValueType(0);
22029
22030 // v8i32->v8f64 is legal with AVX512 so just return it.
22031 if (VT == MVT::v8f64)
22032 return Op;
22033
22034 assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64) &&
22035 "Unexpected VT!");
22036 MVT WideVT = VT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
22037 MVT WideIntVT = VT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
22038 // Need to concat with zero vector for strict fp to avoid spurious
22039 // exceptions.
22040 SDValue Tmp =
22041 IsStrict ? DAG.getConstant(0, DL, WideIntVT) : DAG.getUNDEF(WideIntVT);
22042 V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideIntVT, Tmp, V,
22043 DAG.getIntPtrConstant(0, DL));
22044 SDValue Res, Chain;
22045 if (IsStrict) {
22046 Res = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {WideVT, MVT::Other},
22047 {Op->getOperand(0), V});
22048 Chain = Res.getValue(1);
22049 } else {
22050 Res = DAG.getNode(ISD::UINT_TO_FP, DL, WideVT, V);
22051 }
22052
22053 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
22054 DAG.getIntPtrConstant(0, DL));
22055
22056 if (IsStrict)
22057 return DAG.getMergeValues({Res, Chain}, DL);
22058 return Res;
22059 }
22060
22061 if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 &&
22062 Op->getSimpleValueType(0) == MVT::v4f64) {
22063 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i64, V);
22064 Constant *Bias = ConstantFP::get(
22065 *DAG.getContext(),
22066 APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)));
22067 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
22068 SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, Align(8));
22069 SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other);
22070 SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
22071 SDValue VBias = DAG.getMemIntrinsicNode(
22072 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
22073 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(8),
22074 MachineMemOperand::MOLoad);
22075
22076 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn,
22077 DAG.getBitcast(MVT::v4i64, VBias));
22078 Or = DAG.getBitcast(MVT::v4f64, Or);
22079
22080 if (IsStrict)
22081 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v4f64, MVT::Other},
22082 {Op.getOperand(0), Or, VBias});
22083 return DAG.getNode(ISD::FSUB, DL, MVT::v4f64, Or, VBias);
22084 }
22085
22086 // The algorithm is the following:
22087 // #ifdef __SSE4_1__
22088 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
22089 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
22090 // (uint4) 0x53000000, 0xaa);
22091 // #else
22092 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
22093 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
22094 // #endif
22095 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
22096 // return (float4) lo + fhi;
22097
22098 bool Is128 = VecIntVT == MVT::v4i32;
22099 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
22100 // If we convert to something other than the supported type, e.g., to v4f64,
22101 // abort early.
22102 if (VecFloatVT != Op->getSimpleValueType(0))
22103 return SDValue();
22104
22105 // In the #ifdef/#else code, we have in common:
22106 // - The vector of constants:
22107 // -- 0x4b000000
22108 // -- 0x53000000
22109 // - A shift:
22110 // -- v >> 16
22111
22112 // Create the splat vector for 0x4b000000.
22113 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
22114 // Create the splat vector for 0x53000000.
22115 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
22116
22117 // Create the right shift.
22118 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
22119 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
22120
22121 SDValue Low, High;
22122 if (Subtarget.hasSSE41()) {
22123 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
22124 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
22125 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
22126 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
22127 // Low will be bitcasted right away, so do not bother bitcasting back to its
22128 // original type.
22129 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
22130 VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
22131 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
22132 // (uint4) 0x53000000, 0xaa);
22133 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
22134 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
22135 // High will be bitcasted right away, so do not bother bitcasting back to
22136 // its original type.
22137 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
22138 VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
22139 } else {
22140 SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
22141 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
22142 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
22143 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
22144
22145 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
22146 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
22147 }
22148
22149 // Create the vector constant for (0x1.0p39f + 0x1.0p23f).
22150 SDValue VecCstFSub = DAG.getConstantFP(
22151 APFloat(APFloat::IEEEsingle(), APInt(32, 0x53000080)), DL, VecFloatVT);
22152
22153 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
22154 // NOTE: By using fsub of a positive constant instead of fadd of a negative
22155 // constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is
22156 // enabled. See PR24512.
22157 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
22158 // TODO: Are there any fast-math-flags to propagate here?
22159 // (float4) lo;
22160 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
22161 // return (float4) lo + fhi;
22162 if (IsStrict) {
22163 SDValue FHigh = DAG.getNode(ISD::STRICT_FSUB, DL, {VecFloatVT, MVT::Other},
22164 {Op.getOperand(0), HighBitcast, VecCstFSub});
22165 return DAG.getNode(ISD::STRICT_FADD, DL, {VecFloatVT, MVT::Other},
22166 {FHigh.getValue(1), LowBitcast, FHigh});
22167 }
22168
22169 SDValue FHigh =
22170 DAG.getNode(ISD::FSUB, DL, VecFloatVT, HighBitcast, VecCstFSub);
22171 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
22172}
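// A minimal standalone sketch (not from X86ISelLowering.cpp): the scalar form of
// the lo/hi split used above for u32 -> f32 without AVX-512. 0x4b000000 is the
// float 2^23 and 0x53000000 is 2^39, so planting the 16-bit halves in their
// mantissas gives exact floats; one exact subtraction and a final addition
// reassemble the value with a single rounding.
#include <cstdint>
#include <cstring>

static float u32ToFloat(uint32_t V) {
  uint32_t LoBits = (V & 0xffff) | 0x4b000000;   // bits of 2^23 + (V & 0xffff)
  uint32_t HiBits = (V >> 16)    | 0x53000000;   // bits of 2^39 + (V >> 16) * 2^16
  float Lo, Hi;
  std::memcpy(&Lo, &LoBits, sizeof(Lo));
  std::memcpy(&Hi, &HiBits, sizeof(Hi));
  float FHi = Hi - (0x1.0p39f + 0x1.0p23f);      // exact: (V >> 16) * 2^16 - 2^23
  return Lo + FHi;                               // one rounding for the result
}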
22173
22174static SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG,
22175 const X86Subtarget &Subtarget) {
22176 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
22177 SDValue N0 = Op.getOperand(OpNo);
22178 MVT SrcVT = N0.getSimpleValueType();
22179 SDLoc dl(Op);
22180
22181 switch (SrcVT.SimpleTy) {
22182 default:
22183 llvm_unreachable("Custom UINT_TO_FP is not supported!")::llvm::llvm_unreachable_internal("Custom UINT_TO_FP is not supported!"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 22183)
;
22184 case MVT::v2i32:
22185 return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
22186 case MVT::v4i32:
22187 case MVT::v8i32:
22188 return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
22189 case MVT::v2i64:
22190 case MVT::v4i64:
22191 return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);
22192 }
22193}
22194
22195SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
22196 SelectionDAG &DAG) const {
22197 bool IsStrict = Op->isStrictFPOpcode();
22198 unsigned OpNo = IsStrict ? 1 : 0;
22199 SDValue Src = Op.getOperand(OpNo);
22200 SDLoc dl(Op);
22201 auto PtrVT = getPointerTy(DAG.getDataLayout());
22202 MVT SrcVT = Src.getSimpleValueType();
22203 MVT DstVT = Op->getSimpleValueType(0);
22204 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
22205
22206 // Bail out when we don't have native conversion instructions.
22207 if (DstVT == MVT::f128)
22208 return SDValue();
22209
22210 if (isSoftFP16(DstVT))
22211 return promoteXINT_TO_FP(Op, DAG);
22212 else if (isLegalConversion(SrcVT, false, Subtarget))
22213 return Op;
22214
22215 if (DstVT.isVector())
22216 return lowerUINT_TO_FP_vec(Op, DAG, Subtarget);
22217
22218 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
22219 return LowerWin64_INT128_TO_FP(Op, DAG);
22220
22221 if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
22222 return Extract;
22223
22224 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
22225 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
22226 // Conversions from unsigned i32 to f32/f64 are legal,
22227 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
22228 return Op;
22229 }
22230
22231 // Promote i32 to i64 and use a signed conversion on 64-bit targets.
22232 if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
22233 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Src);
22234 if (IsStrict)
22235 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other},
22236 {Chain, Src});
22237 return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src);
22238 }
22239
22240 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
22241 return V;
22242 if (SDValue V = LowerI64IntToFP16(Op, DAG, Subtarget))
22243 return V;
22244
22245 // The transform for i64->f64 isn't correct for 0 when rounding to negative
22246 // infinity. It produces -0.0, so disable under strictfp.
22247 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && Subtarget.hasSSE2() &&
22248 !IsStrict)
22249 return LowerUINT_TO_FP_i64(Op, DAG, Subtarget);
22250 // The transform for i32->f64/f32 isn't correct for 0 when rounding to
22251 // negative infinity, so disable it under strictfp and use FILD instead.
22252 if (SrcVT == MVT::i32 && Subtarget.hasSSE2() && DstVT != MVT::f80 &&
22253 !IsStrict)
22254 return LowerUINT_TO_FP_i32(Op, DAG, Subtarget);
22255 if (Subtarget.is64Bit() && SrcVT == MVT::i64 &&
22256 (DstVT == MVT::f32 || DstVT == MVT::f64))
22257 return SDValue();
22258
22259 // Make a 64-bit buffer, and use it to build an FILD.
22260 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64, 8);
22261 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
22262 Align SlotAlign(8);
22263 MachinePointerInfo MPI =
22264 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
22265 if (SrcVT == MVT::i32) {
22266 SDValue OffsetSlot =
22267 DAG.getMemBasePlusOffset(StackSlot, TypeSize::Fixed(4), dl);
22268 SDValue Store1 = DAG.getStore(Chain, dl, Src, StackSlot, MPI, SlotAlign);
22269 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
22270 OffsetSlot, MPI.getWithOffset(4), SlotAlign);
22271 std::pair<SDValue, SDValue> Tmp =
22272 BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, SlotAlign, DAG);
22273 if (IsStrict)
22274 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
22275
22276 return Tmp.first;
22277 }
22278
22279 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
22280 SDValue ValueToStore = Src;
22281 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) {
22282 // Bitcasting to f64 here allows us to do a single 64-bit store from
22283 // an SSE register, avoiding the store forwarding penalty that would come
22284 // with two 32-bit stores.
22285 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
22286 }
22287 SDValue Store =
22288 DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, SlotAlign);
22289 // For i64 source, we need to add the appropriate power of 2 if the input
22290 // was negative. We must be careful to do the computation in x87 extended
22291 // precision, not in SSE.
22292 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
22293 SDValue Ops[] = { Store, StackSlot };
22294 SDValue Fild =
22295 DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MPI,
22296 SlotAlign, MachineMemOperand::MOLoad);
22297 Chain = Fild.getValue(1);
22298
22299
22300 // Check whether the sign bit is set.
22301 SDValue SignSet = DAG.getSetCC(
22302 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
22303 Op.getOperand(OpNo), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
22304
22305 // Build a 64 bit pair (FF, 0) in the constant pool, with FF in the hi bits.
22306 APInt FF(64, 0x5F80000000000000ULL);
22307 SDValue FudgePtr = DAG.getConstantPool(
22308 ConstantInt::get(*DAG.getContext(), FF), PtrVT);
22309 Align CPAlignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlign();
22310
22311 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
22312 SDValue Zero = DAG.getIntPtrConstant(0, dl);
22313 SDValue Four = DAG.getIntPtrConstant(4, dl);
22314 SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Four, Zero);
22315 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
22316
22317 // Load the value out, extending it from f32 to f80.
22318 SDValue Fudge = DAG.getExtLoad(
22319 ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr,
22320 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
22321 CPAlignment);
22322 Chain = Fudge.getValue(1);
22323 // Extend everything to 80 bits to force it to be done on x87.
22324 // TODO: Are there any fast-math-flags to propagate here?
22325 if (IsStrict) {
22326 unsigned Opc = ISD::STRICT_FADD;
22327 // Windows needs the precision control changed to 80 bits around this add.
22328 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
22329 Opc = X86ISD::STRICT_FP80_ADD;
22330
22331 SDValue Add =
22332 DAG.getNode(Opc, dl, {MVT::f80, MVT::Other}, {Chain, Fild, Fudge});
22333 // STRICT_FP_ROUND can't handle equal types.
22334 if (DstVT == MVT::f80)
22335 return Add;
22336 return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
22337 {Add.getValue(1), Add, DAG.getIntPtrConstant(0, dl)});
22338 }
22339 unsigned Opc = ISD::FADD;
22340 // Windows needs the precision control changed to 80 bits around this add.
22341 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
22342 Opc = X86ISD::FP80_ADD;
22343
22344 SDValue Add = DAG.getNode(Opc, dl, MVT::f80, Fild, Fudge);
22345 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
22346 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
22347}
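// A minimal standalone sketch (not from X86ISelLowering.cpp), assuming long
// double is the x87 80-bit type as on this x86 target: the scalar shape of the
// FILD-plus-fudge path above. FILD reads the 64-bit slot as signed, so when the
// source had its top bit set the result is short by exactly 2^64; adding that
// constant in extended precision before the final rounding restores the value.
#include <cstdint>

static double u64ToDoubleViaSigned(uint64_t U) {
  long double L = (long double)(int64_t)U;   // what FILD produces
  if ((int64_t)U < 0)                        // sign bit was set
    L += 0x1.0p64L;                          // the 2^64 "fudge" constant
  return (double)L;                          // FP_ROUND to the destination type
}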
22348
22349// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
22350// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
22351// just return an SDValue().
22352// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
22353// to i16, i32 or i64, and we lower it to a legal sequence and return the
22354// result.
22355SDValue
22356X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
22357 bool IsSigned, SDValue &Chain) const {
22358 bool IsStrict = Op->isStrictFPOpcode();
22359 SDLoc DL(Op);
22360
22361 EVT DstTy = Op.getValueType();
22362 SDValue Value = Op.getOperand(IsStrict ? 1 : 0);
22363 EVT TheVT = Value.getValueType();
22364 auto PtrVT = getPointerTy(DAG.getDataLayout());
22365
22366 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
22367 // f16 must be promoted before using the lowering in this routine.
22368 // fp128 does not use this lowering.
22369 return SDValue();
22370 }
22371
22372 // If using FIST to compute an unsigned i64, we'll need some fixup
22373 // to handle values above the maximum signed i64. A FIST is always
22374 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
22375 bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;
22376
22377 // FIXME: This does not generate an invalid exception if the input does not
22378 // fit in i32. PR44019
22379 if (!IsSigned && DstTy != MVT::i64) {
22380 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
22381 // The low 32 bits of the fist result will have the correct uint32 result.
22382 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
22383 DstTy = MVT::i64;
22384 }
22385
22386 assert(DstTy.getSimpleVT() <= MVT::i64 &&
22387 DstTy.getSimpleVT() >= MVT::i16 &&
22388 "Unknown FP_TO_INT to lower!");
22389
22390 // We lower FP->int64 into FISTP64 followed by a load from a temporary
22391 // stack slot.
22392 MachineFunction &MF = DAG.getMachineFunction();
22393 unsigned MemSize = DstTy.getStoreSize();
22394 int SSFI =
22395 MF.getFrameInfo().CreateStackObject(MemSize, Align(MemSize), false);
22396 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
22397
22398 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
22399
22400 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
22401
22402 if (UnsignedFixup) {
22403 //
22404 // Conversion to unsigned i64 is implemented with a select,
22405 // depending on whether the source value fits in the range
22406 // of a signed i64. Let Thresh be the FP equivalent of
22407 // 0x8000000000000000ULL.
22408 //
22409 // Adjust = (Value >= Thresh) ? 0x80000000 : 0;
22410 // FltOfs = (Value >= Thresh) ? Thresh : 0.0;
22411 // FistSrc = (Value - FltOfs);
22412 // Fist-to-mem64 FistSrc
22413 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
22414 // to XOR'ing the high 32 bits with Adjust.
22415 //
22416 // Being a power of 2, Thresh is exactly representable in all FP formats.
22417 // For X87 we'd like to use the smallest FP type for this constant, but
22418 // for DAG type consistency we have to match the FP operand type.
22419
22420 APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
22421 LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
22422 bool LosesInfo = false;
22423 if (TheVT == MVT::f64)
22424 // The rounding mode is irrelevant as the conversion should be exact.
22425 Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
22426 &LosesInfo);
22427 else if (TheVT == MVT::f80)
22428 Status = Thresh.convert(APFloat::x87DoubleExtended(),
22429 APFloat::rmNearestTiesToEven, &LosesInfo);
22430
22431 assert(Status == APFloat::opOK && !LosesInfo &&
22432 "FP conversion should have been exact");
22433
22434 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
22435
22436 EVT ResVT = getSetCCResultType(DAG.getDataLayout(),
22437 *DAG.getContext(), TheVT);
22438 SDValue Cmp;
22439 if (IsStrict) {
22440 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE, Chain,
22441 /*IsSignaling*/ true);
22442 Chain = Cmp.getValue(1);
22443 } else {
22444 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE);
22445 }
22446
22447 // Our preferred lowering of
22448 //
22449 // (Value >= Thresh) ? 0x8000000000000000ULL : 0
22450 //
22451 // is
22452 //
22453 // (Value >= Thresh) << 63
22454 //
22455 // but since we can get here after LegalOperations, DAGCombine might do the
22456 // wrong thing if we create a select. So, directly create the preferred
22457 // version.
22458 SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Cmp);
22459 SDValue Const63 = DAG.getConstant(63, DL, MVT::i8);
22460 Adjust = DAG.getNode(ISD::SHL, DL, MVT::i64, Zext, Const63);
22461
22462 SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp, ThreshVal,
22463 DAG.getConstantFP(0.0, DL, TheVT));
22464
22465 if (IsStrict) {
22466 Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},
22467 { Chain, Value, FltOfs });
22468 Chain = Value.getValue(1);
22469 } else
22470 Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs);
22471 }
22472
22473 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
22474
22475 // FIXME This causes a redundant load/store if the SSE-class value is already
22476 // in memory, such as if it is on the callstack.
22477 if (isScalarFPTypeInSSEReg(TheVT)) {
22478 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
22479 Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
22480 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
22481 SDValue Ops[] = { Chain, StackSlot };
22482
22483 unsigned FLDSize = TheVT.getStoreSize();
22484 assert(FLDSize <= MemSize && "Stack slot not big enough");
22485 MachineMemOperand *MMO = MF.getMachineMemOperand(
22486 MPI, MachineMemOperand::MOLoad, FLDSize, Align(FLDSize));
22487 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO);
22488 Chain = Value.getValue(1);
22489 }
22490
22491 // Build the FP_TO_INT*_IN_MEM
22492 MachineMemOperand *MMO = MF.getMachineMemOperand(
22493 MPI, MachineMemOperand::MOStore, MemSize, Align(MemSize));
22494 SDValue Ops[] = { Chain, Value, StackSlot };
22495 SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL,
22496 DAG.getVTList(MVT::Other),
22497 Ops, DstTy, MMO);
22498
22499 SDValue Res = DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot, MPI);
22500 Chain = Res.getValue(1);
22501
22502 // If we need an unsigned fixup, XOR the result with adjust.
22503 if (UnsignedFixup)
22504 Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust);
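// Worked example (illustrative, assuming Thresh == 2^63 for a 64-bit
// unsigned conversion): for an input of 2^63 + 4096.0 the compare is true,
// so FltOfs == Thresh and the value handed to FP_TO_INT_IN_MEM is 4096.0;
// the signed conversion yields 4096, whose sign bit is clear, so XORing
// with Adjust == 0x8000000000000000 adds 2^63 back and gives 2^63 + 4096.
// In-range inputs get Adjust == 0 and pass through unchanged.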
22505
22506 return Res;
22507}
22508
22509static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
22510 const X86Subtarget &Subtarget) {
22511 MVT VT = Op.getSimpleValueType();
22512 SDValue In = Op.getOperand(0);
22513 MVT InVT = In.getSimpleValueType();
22514 SDLoc dl(Op);
22515 unsigned Opc = Op.getOpcode();
22516
22517   assert(VT.isVector() && InVT.isVector() && "Expected vector type");
22518   assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) &&
22519          "Unexpected extension opcode");
22520   assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
22521          "Expected same number of elements");
22522   assert((VT.getVectorElementType() == MVT::i16 ||
22523           VT.getVectorElementType() == MVT::i32 ||
22524           VT.getVectorElementType() == MVT::i64) &&
22525          "Unexpected element type");
22526   assert((InVT.getVectorElementType() == MVT::i8 ||
22527           InVT.getVectorElementType() == MVT::i16 ||
22528           InVT.getVectorElementType() == MVT::i32) &&
22529          "Unexpected element type");
22530
22531 unsigned ExtendInVecOpc = DAG.getOpcode_EXTEND_VECTOR_INREG(Opc);
22532
22533 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
22534     assert(InVT == MVT::v32i8 && "Unexpected VT!");
22535 return splitVectorIntUnary(Op, DAG);
22536 }
22537
22538 if (Subtarget.hasInt256())
22539 return Op;
22540
22541 // Optimize vectors in AVX mode:
22542 //
22543 // v8i16 -> v8i32
22544 // Use vpmovzwd for 4 lower elements v8i16 -> v4i32.
22545 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
22546 // Concat upper and lower parts.
22547 //
22548 // v4i32 -> v4i64
22549 // Use vpmovzdq for 4 lower elements v4i32 -> v2i64.
22550 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
22551 // Concat upper and lower parts.
22552 //
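// Illustration of the zero-extend case (little-endian lanes): for a v8i16
// input {e0..e7}, OpLo zero-extends the low half in-register, giving
// {e0,e1,e2,e3} as i32. OpHi interleaves the high half with zero via
// vpunpckhwd, producing the words {e4,0,e5,0,e6,0,e7,0}, which reread as
// v4i32 is {e4,e5,e6,e7} zero-extended. The concat forms the v8i32 result.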
22553 MVT HalfVT = VT.getHalfNumVectorElementsVT();
22554 SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In);
22555
22556 // Short-circuit if we can determine that each 128-bit half is the same value.
22557 // Otherwise, this is difficult to match and optimize.
22558 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(In))
22559 if (hasIdenticalHalvesShuffleMask(Shuf->getMask()))
22560 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo);
22561
22562 SDValue ZeroVec = DAG.getConstant(0, dl, InVT);
22563 SDValue Undef = DAG.getUNDEF(InVT);
22564 bool NeedZero = Opc == ISD::ZERO_EXTEND;
22565 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
22566 OpHi = DAG.getBitcast(HalfVT, OpHi);
22567
22568 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
22569}
22570
22571// Helper to split and extend a v16i1 mask to v16i8 or v16i16.
22572static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
22573 const SDLoc &dl, SelectionDAG &DAG) {
22574   assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
22575 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
22576 DAG.getIntPtrConstant(0, dl));
22577 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
22578 DAG.getIntPtrConstant(8, dl));
22579 Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);
22580 Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);
22581 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
22582 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
22583}
22584
22585static SDValue LowerZERO_EXTEND_Mask(SDValue Op,
22586 const X86Subtarget &Subtarget,
22587 SelectionDAG &DAG) {
22588 MVT VT = Op->getSimpleValueType(0);
22589 SDValue In = Op->getOperand(0);
22590 MVT InVT = In.getSimpleValueType();
22591   assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
22592 SDLoc DL(Op);
22593 unsigned NumElts = VT.getVectorNumElements();
22594
22595   // For all vectors but vXi8, we can just emit a sign_extend and a shift. This
22596   // avoids a constant pool load.
22597 if (VT.getVectorElementType() != MVT::i8) {
22598 SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In);
22599 return DAG.getNode(ISD::SRL, DL, VT, Extend,
22600 DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
22601 }
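// Why this works: sign-extending an i1 yields 0 or all-ones, and the
// logical shift right by (scalar size - 1) collapses that to 0 or 1, i.e.
// the zero-extended mask bit, without materializing a constant-pool vector.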
22602
22603 // Extend VT if BWI is not supported.
22604 MVT ExtVT = VT;
22605 if (!Subtarget.hasBWI()) {
22606 // If v16i32 is to be avoided, we'll need to split and concatenate.
22607 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
22608 return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG);
22609
22610 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
22611 }
22612
22613 // Widen to 512-bits if VLX is not supported.
22614 MVT WideVT = ExtVT;
22615 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
22616 NumElts *= 512 / ExtVT.getSizeInBits();
22617 InVT = MVT::getVectorVT(MVT::i1, NumElts);
22618 In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT),
22619 In, DAG.getIntPtrConstant(0, DL));
22620 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(),
22621 NumElts);
22622 }
22623
22624 SDValue One = DAG.getConstant(1, DL, WideVT);
22625 SDValue Zero = DAG.getConstant(0, DL, WideVT);
22626
22627 SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
22628
22629 // Truncate if we had to extend above.
22630 if (VT != ExtVT) {
22631 WideVT = MVT::getVectorVT(MVT::i8, NumElts);
22632 SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
22633 }
22634
22635 // Extract back to 128/256-bit if we widened.
22636 if (WideVT != VT)
22637 SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
22638 DAG.getIntPtrConstant(0, DL));
22639
22640 return SelectedVal;
22641}
22642
22643static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
22644 SelectionDAG &DAG) {
22645 SDValue In = Op.getOperand(0);
22646 MVT SVT = In.getSimpleValueType();
22647
22648 if (SVT.getVectorElementType() == MVT::i1)
22649 return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG);
22650
22651   assert(Subtarget.hasAVX() && "Expected AVX support");
22652 return LowerAVXExtend(Op, DAG, Subtarget);
22653}
22654
22655/// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
22656/// It makes use of the fact that vectors with enough leading sign/zero bits
22657/// prevent the PACKSS/PACKUS from saturating the results.
22658/// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
22659/// within each 128-bit lane.
22660static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
22661 const SDLoc &DL, SelectionDAG &DAG,
22662 const X86Subtarget &Subtarget) {
22663   assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
22664          "Unexpected PACK opcode");
22665   assert(DstVT.isVector() && "VT not a vector?");
22666
22667 // Requires SSE2 for PACKSS (SSE41 PACKUSDW is handled below).
22668 if (!Subtarget.hasSSE2())
22669 return SDValue();
22670
22671 EVT SrcVT = In.getValueType();
22672
22673 // No truncation required, we might get here due to recursive calls.
22674 if (SrcVT == DstVT)
22675 return In;
22676
22677 // We only support vector truncation to 64bits or greater from a
22678 // 128bits or greater source.
22679 unsigned DstSizeInBits = DstVT.getSizeInBits();
22680 unsigned SrcSizeInBits = SrcVT.getSizeInBits();
22681 if ((DstSizeInBits % 64) != 0 || (SrcSizeInBits % 128) != 0)
22682 return SDValue();
22683
22684 unsigned NumElems = SrcVT.getVectorNumElements();
22685 if (!isPowerOf2_32(NumElems))
22686 return SDValue();
22687
22688 LLVMContext &Ctx = *DAG.getContext();
22689   assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
22690   assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
22691
22692 EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
22693
22694 // Pack to the largest type possible:
22695 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
22696 EVT InVT = MVT::i16, OutVT = MVT::i8;
22697 if (SrcVT.getScalarSizeInBits() > 16 &&
22698 (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
22699 InVT = MVT::i32;
22700 OutVT = MVT::i16;
22701 }
22702
22703 // 128bit -> 64bit truncate - PACK 128-bit src in the lower subvector.
22704 if (SrcVT.is128BitVector()) {
22705 InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
22706 OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
22707 In = DAG.getBitcast(InVT, In);
22708 SDValue Res = DAG.getNode(Opcode, DL, OutVT, In, DAG.getUNDEF(InVT));
22709 Res = extractSubVector(Res, 0, DAG, DL, 64);
22710 return DAG.getBitcast(DstVT, Res);
22711 }
22712
22713 // Split lower/upper subvectors.
22714 SDValue Lo, Hi;
22715 std::tie(Lo, Hi) = splitVector(In, DAG, DL);
22716
22717 unsigned SubSizeInBits = SrcSizeInBits / 2;
22718 InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
22719 OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
22720
22721 // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
22722 if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
22723 Lo = DAG.getBitcast(InVT, Lo);
22724 Hi = DAG.getBitcast(InVT, Hi);
22725 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
22726 return DAG.getBitcast(DstVT, Res);
22727 }
22728
22729 // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
22730 // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
22731 if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
22732 Lo = DAG.getBitcast(InVT, Lo);
22733 Hi = DAG.getBitcast(InVT, Hi);
22734 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
22735
22736 // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
22737 // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
22738 // Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits.
22739 SmallVector<int, 64> Mask;
22740 int Scale = 64 / OutVT.getScalarSizeInBits();
22741 narrowShuffleMaskElts(Scale, { 0, 2, 1, 3 }, Mask);
22742 Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask);
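// For example (illustrative): with vXi16 output elements, Scale == 4 and
// the scaled mask is {0,1,2,3, 8,9,10,11, 4,5,6,7, 12,13,14,15}, i.e. the
// base 64-bit-chunk mask {0,2,1,3} rewritten in terms of i16 elements.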
22743
22744 if (DstVT.is256BitVector())
22745 return DAG.getBitcast(DstVT, Res);
22746
22747 // If 512bit -> 128bit truncate another stage.
22748 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
22749 Res = DAG.getBitcast(PackedVT, Res);
22750 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
22751 }
22752
22753 // Recursively pack lower/upper subvectors, concat result and pack again.
22754   assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
22755 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems / 2);
22756 Lo = truncateVectorWithPACK(Opcode, PackedVT, Lo, DL, DAG, Subtarget);
22757 Hi = truncateVectorWithPACK(Opcode, PackedVT, Hi, DL, DAG, Subtarget);
22758
22759 PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
22760 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
22761 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
22762}
22763
22764static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
22765 const X86Subtarget &Subtarget) {
22766
22767 SDLoc DL(Op);
22768 MVT VT = Op.getSimpleValueType();
22769 SDValue In = Op.getOperand(0);
22770 MVT InVT = In.getSimpleValueType();
22771
22772   assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
22773
22774 // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
22775 unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
22776 if (InVT.getScalarSizeInBits() <= 16) {
22777 if (Subtarget.hasBWI()) {
22778 // legal, will go to VPMOVB2M, VPMOVW2M
22779 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
22780 // We need to shift to get the lsb into sign position.
22781 // Shift packed bytes not supported natively, bitcast to word
22782 MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
22783 In = DAG.getNode(ISD::SHL, DL, ExtVT,
22784 DAG.getBitcast(ExtVT, In),
22785 DAG.getConstant(ShiftInx, DL, ExtVT));
22786 In = DAG.getBitcast(InVT, In);
22787 }
22788 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
22789 In, ISD::SETGT);
22790 }
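// The setcc above (zero on the left, SETGT) is true exactly when In is
// negative, i.e. when the original low bit, now in the sign position, was
// set; this is the pattern isel matches to VPMOVB2M/VPMOVW2M.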
22791 // Use TESTD/Q, extended vector to packed dword/qword.
22792     assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
22793            "Unexpected vector type.");
22794 unsigned NumElts = InVT.getVectorNumElements();
22795     assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements");
22796 // We need to change to a wider element type that we have support for.
22798     // For 8 element vectors this is easy: we either extend to v8i32 or v8i64.
22798 // For 16 element vectors we extend to v16i32 unless we are explicitly
22799 // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
22800 // we need to split into two 8 element vectors which we can extend to v8i32,
22801 // truncate and concat the results. There's an additional complication if
22802 // the original type is v16i8. In that case we can't split the v16i8
22803 // directly, so we need to shuffle high elements to low and use
22804 // sign_extend_vector_inreg.
22805 if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
22806 SDValue Lo, Hi;
22807 if (InVT == MVT::v16i8) {
22808 Lo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, In);
22809 Hi = DAG.getVectorShuffle(
22810 InVT, DL, In, In,
22811 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
22812 Hi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, Hi);
22813 } else {
22814         assert(InVT == MVT::v16i16 && "Unexpected VT!");
22815 Lo = extract128BitVector(In, 0, DAG, DL);
22816 Hi = extract128BitVector(In, 8, DAG, DL);
22817 }
22818 // We're split now, just emit two truncates and a concat. The two
22819 // truncates will trigger legalization to come back to this function.
22820 Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
22821 Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi);
22822 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
22823 }
22824 // We either have 8 elements or we're allowed to use 512-bit vectors.
22825 // If we have VLX, we want to use the narrowest vector that can get the
22826 // job done so we use vXi32.
22827 MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
22828 MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
22829 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
22830 InVT = ExtVT;
22831 ShiftInx = InVT.getScalarSizeInBits() - 1;
22832 }
22833
22834 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
22835 // We need to shift to get the lsb into sign position.
22836 In = DAG.getNode(ISD::SHL, DL, InVT, In,
22837 DAG.getConstant(ShiftInx, DL, InVT));
22838 }
22839 // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
22840 if (Subtarget.hasDQI())
22841 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT);
22842 return DAG.getSetCC(DL, VT, In, DAG.getConstant(0, DL, InVT), ISD::SETNE);
22843}
22844
22845SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
22846 SDLoc DL(Op);
22847 MVT VT = Op.getSimpleValueType();
22848 SDValue In = Op.getOperand(0);
22849 MVT InVT = In.getSimpleValueType();
22850 unsigned InNumEltBits = InVT.getScalarSizeInBits();
22851
22852   assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
22853          "Invalid TRUNCATE operation");
22854
22855 // If we're called by the type legalizer, handle a few cases.
22856 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
22857 if (!TLI.isTypeLegal(InVT)) {
22858 if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) &&
22859 VT.is128BitVector()) {
22860       assert((InVT == MVT::v16i64 || Subtarget.hasVLX()) &&
22861              "Unexpected subtarget!");
22862 // The default behavior is to truncate one step, concatenate, and then
22863 // truncate the remainder. We'd rather produce two 64-bit results and
22864 // concatenate those.
22865 SDValue Lo, Hi;
22866 std::tie(Lo, Hi) = DAG.SplitVector(In, DL);
22867
22868 EVT LoVT, HiVT;
22869 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
22870
22871 Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, Lo);
22872 Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi);
22873 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
22874 }
22875
22876 // Otherwise let default legalization handle it.
22877 return SDValue();
22878 }
22879
22880 if (VT.getVectorElementType() == MVT::i1)
22881 return LowerTruncateVecI1(Op, DAG, Subtarget);
22882
22883 // vpmovqb/w/d, vpmovdb/w, vpmovwb
22884 if (Subtarget.hasAVX512()) {
22885 if (InVT == MVT::v32i16 && !Subtarget.hasBWI()) {
22886       assert(VT == MVT::v32i8 && "Unexpected VT!");
22887 return splitVectorIntUnary(Op, DAG);
22888 }
22889
22890     // word to byte only under BWI. Otherwise we have to promote to v16i32
22891 // and then truncate that. But we should only do that if we haven't been
22892 // asked to avoid 512-bit vectors. The actual promotion to v16i32 will be
22893 // handled by isel patterns.
22894 if (InVT != MVT::v16i16 || Subtarget.hasBWI() ||
22895 Subtarget.canExtendTo512DQ())
22896 return Op;
22897 }
22898
22899 unsigned NumPackedSignBits = std::min<unsigned>(VT.getScalarSizeInBits(), 16);
22900 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
22901
22902 // Truncate with PACKUS if we are truncating a vector with leading zero bits
22903 // that extend all the way to the packed/truncated value.
22904 // Pre-SSE41 we can only use PACKUSWB.
22905 KnownBits Known = DAG.computeKnownBits(In);
22906 if ((InNumEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros())
22907 if (SDValue V =
22908 truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget))
22909 return V;
22910
22911 // Truncate with PACKSS if we are truncating a vector with sign-bits that
22912 // extend all the way to the packed/truncated value.
22913 if ((InNumEltBits - NumPackedSignBits) < DAG.ComputeNumSignBits(In))
22914 if (SDValue V =
22915 truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget))
22916 return V;
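// Worked example (illustrative): for a v8i32 -> v8i16 truncate,
// InNumEltBits is 32 and NumPackedSignBits is 16, so the PACKSS path is
// taken only when ComputeNumSignBits reports more than 16 sign bits, i.e.
// every element already fits in i16 and PACKSSDW cannot saturate. The
// PACKUS path is analogous but needs the top bits known zero: 16 of them
// with SSE4.1 PACKUSDW, 24 in the pre-SSE4.1 PACKUSWB-only case.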
22917
22918 // Handle truncation of V256 to V128 using shuffles.
22919   assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
22920
22921 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
22922 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
22923 if (Subtarget.hasInt256()) {
22924 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
22925 In = DAG.getBitcast(MVT::v8i32, In);
22926 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
22927 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
22928 DAG.getIntPtrConstant(0, DL));
22929 }
22930
22931 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
22932 DAG.getIntPtrConstant(0, DL));
22933 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
22934 DAG.getIntPtrConstant(2, DL));
22935 static const int ShufMask[] = {0, 2, 4, 6};
22936 return DAG.getVectorShuffle(VT, DL, DAG.getBitcast(MVT::v4i32, OpLo),
22937 DAG.getBitcast(MVT::v4i32, OpHi), ShufMask);
22938 }
22939
22940 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
22941 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
22942 if (Subtarget.hasInt256()) {
22943 // The PSHUFB mask:
22944 static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
22945 -1, -1, -1, -1, -1, -1, -1, -1,
22946 16, 17, 20, 21, 24, 25, 28, 29,
22947 -1, -1, -1, -1, -1, -1, -1, -1 };
22948 In = DAG.getBitcast(MVT::v32i8, In);
22949 In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
22950 In = DAG.getBitcast(MVT::v4i64, In);
22951
22952 static const int ShufMask2[] = {0, 2, -1, -1};
22953 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
22954 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
22955 DAG.getIntPtrConstant(0, DL));
22956 return DAG.getBitcast(MVT::v8i16, In);
22957 }
22958
22959 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
22960 DAG.getIntPtrConstant(0, DL));
22961 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
22962 DAG.getIntPtrConstant(4, DL));
22963
22964 // The PSHUFB mask:
22965 static const int ShufMask1[] = {0, 2, 4, 6, -1, -1, -1, -1};
22966
22967 OpLo = DAG.getBitcast(MVT::v8i16, OpLo);
22968 OpHi = DAG.getBitcast(MVT::v8i16, OpHi);
22969
22970 OpLo = DAG.getVectorShuffle(MVT::v8i16, DL, OpLo, OpLo, ShufMask1);
22971 OpHi = DAG.getVectorShuffle(MVT::v8i16, DL, OpHi, OpHi, ShufMask1);
22972
22973 OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
22974 OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
22975
22976 // The MOVLHPS Mask:
22977 static const int ShufMask2[] = {0, 1, 4, 5};
22978 SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
22979 return DAG.getBitcast(MVT::v8i16, res);
22980 }
22981
22982 if (VT == MVT::v16i8 && InVT == MVT::v16i16) {
22983     // Use an AND to zero the upper bits for PACKUS.
22984 In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(255, DL, InVT));
22985
22986 SDValue InLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
22987 DAG.getIntPtrConstant(0, DL));
22988 SDValue InHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
22989 DAG.getIntPtrConstant(8, DL));
22990 return DAG.getNode(X86ISD::PACKUS, DL, VT, InLo, InHi);
22991 }
22992
22993 llvm_unreachable("All 256->128 cases should have been handled above!")::llvm::llvm_unreachable_internal("All 256->128 cases should have been handled above!"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 22993)
;
22994}
22995
22996// We can leverage the specific way the "cvttps2dq/cvttpd2dq" instruction
22997// behaves on out of range inputs to generate optimized conversions.
22998static SDValue expandFP_TO_UINT_SSE(MVT VT, SDValue Src, const SDLoc &dl,
22999 SelectionDAG &DAG,
23000 const X86Subtarget &Subtarget) {
23001 MVT SrcVT = Src.getSimpleValueType();
23002 unsigned DstBits = VT.getScalarSizeInBits();
23003   assert(DstBits == 32 && "expandFP_TO_UINT_SSE - only vXi32 supported");
23004
23005 // Calculate the converted result for values in the range 0 to
23006 // 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
23007 SDValue Small = DAG.getNode(X86ISD::CVTTP2SI, dl, VT, Src);
23008 SDValue Big =
23009 DAG.getNode(X86ISD::CVTTP2SI, dl, VT,
23010 DAG.getNode(ISD::FSUB, dl, SrcVT, Src,
23011 DAG.getConstantFP(2147483648.0f, dl, SrcVT)));
23012
23013 // The "CVTTP2SI" instruction conveniently sets the sign bit if
23014 // and only if the value was out of range. So we can use that
23015   // as our indicator of whether to use "Big" instead of "Small".
23016 //
23017 // Use "Small" if "IsOverflown" has all bits cleared
23018 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
23019
23020 // AVX1 can't use the signsplat masking for 256-bit vectors - we have to
23021 // use the slightly slower blendv select instead.
23022 if (VT == MVT::v8i32 && !Subtarget.hasAVX2()) {
23023 SDValue Overflow = DAG.getNode(ISD::OR, dl, VT, Small, Big);
23024 return DAG.getNode(X86ISD::BLENDV, dl, VT, Small, Overflow, Small);
23025 }
23026
23027 SDValue IsOverflown =
23028 DAG.getNode(X86ISD::VSRAI, dl, VT, Small,
23029 DAG.getTargetConstant(DstBits - 1, dl, MVT::i8));
23030 return DAG.getNode(ISD::OR, dl, VT, Small,
23031 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
23032}
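// Worked example for expandFP_TO_UINT_SSE above (illustrative, v4f32 ->
// v4i32 without AVX512): for a lane holding 3.0e9f, Small overflows and
// cvttps2dq returns the "integer indefinite" value 0x80000000, so the
// arithmetic shift makes IsOverflown all-ones and the result is
// 0x80000000 | cvttps2dq(3.0e9f - 2^31) = 0x80000000 | 852516352
// = 3000000000. For in-range lanes the sign bit of Small is clear,
// IsOverflown is zero, and Small is returned unchanged.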
23033
23034SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
23035 bool IsStrict = Op->isStrictFPOpcode();
23036 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
23037 Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
23038 MVT VT = Op->getSimpleValueType(0);
23039 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
23040 SDValue Chain = IsStrict ? Op->getOperand(0) : SDValue();
23041 MVT SrcVT = Src.getSimpleValueType();
23042 SDLoc dl(Op);
23043
23044 SDValue Res;
23045 if (isSoftFP16(SrcVT)) {
23046 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
23047 if (IsStrict)
23048 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
23049 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
23050 {NVT, MVT::Other}, {Chain, Src})});
23051 return DAG.getNode(Op.getOpcode(), dl, VT,
23052 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
23053 } else if (isTypeLegal(SrcVT) && isLegalConversion(VT, IsSigned, Subtarget)) {
23054 return Op;
23055 }
23056
23057 if (VT.isVector()) {
23058 if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
23059 MVT ResVT = MVT::v4i32;
23060 MVT TruncVT = MVT::v4i1;
23061 unsigned Opc;
23062 if (IsStrict)
23063 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
23064 else
23065 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
23066
23067 if (!IsSigned && !Subtarget.hasVLX()) {
23068         assert(Subtarget.useAVX512Regs() && "Unexpected features!");
23069 // Widen to 512-bits.
23070 ResVT = MVT::v8i32;
23071 TruncVT = MVT::v8i1;
23072 Opc = Op.getOpcode();
23073 // Need to concat with zero vector for strict fp to avoid spurious
23074 // exceptions.
23075 // TODO: Should we just do this for non-strict as well?
23076 SDValue Tmp = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v8f64)
23077 : DAG.getUNDEF(MVT::v8f64);
23078 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, Tmp, Src,
23079 DAG.getIntPtrConstant(0, dl));
23080 }
23081 if (IsStrict) {
23082 Res = DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Chain, Src});
23083 Chain = Res.getValue(1);
23084 } else {
23085 Res = DAG.getNode(Opc, dl, ResVT, Src);
23086 }
23087
23088 Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
23089 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
23090 DAG.getIntPtrConstant(0, dl));
23091 if (IsStrict)
23092 return DAG.getMergeValues({Res, Chain}, dl);
23093 return Res;
23094 }
23095
23096 if (Subtarget.hasFP16() && SrcVT.getVectorElementType() == MVT::f16) {
23097 if (VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16)
23098 return Op;
23099
23100 MVT ResVT = VT;
23101 MVT EleVT = VT.getVectorElementType();
23102 if (EleVT != MVT::i64)
23103 ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
23104
23105 if (SrcVT != MVT::v8f16) {
23106 SDValue Tmp =
23107 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
23108 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
23109 Ops[0] = Src;
23110 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
23111 }
23112
23113 if (IsStrict) {
23114 Res = DAG.getNode(IsSigned ? X86ISD::STRICT_CVTTP2SI
23115 : X86ISD::STRICT_CVTTP2UI,
23116 dl, {ResVT, MVT::Other}, {Chain, Src});
23117 Chain = Res.getValue(1);
23118 } else {
23119 Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl,
23120 ResVT, Src);
23121 }
23122
23123 // TODO: Need to add exception check code for strict FP.
23124 if (EleVT.getSizeInBits() < 16) {
23125 ResVT = MVT::getVectorVT(EleVT, 8);
23126 Res = DAG.getNode(ISD::TRUNCATE, dl, ResVT, Res);
23127 }
23128
23129 if (ResVT != VT)
23130 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
23131 DAG.getIntPtrConstant(0, dl));
23132
23133 if (IsStrict)
23134 return DAG.getMergeValues({Res, Chain}, dl);
23135 return Res;
23136 }
23137
23138 // v8f32/v16f32/v8f64->v8i16/v16i16 need to widen first.
23139 if (VT.getVectorElementType() == MVT::i16) {
23140       assert((SrcVT.getVectorElementType() == MVT::f32 ||
23141               SrcVT.getVectorElementType() == MVT::f64) &&
23142              "Expected f32/f64 vector!");
23143 MVT NVT = VT.changeVectorElementType(MVT::i32);
23144 if (IsStrict) {
23145 Res = DAG.getNode(IsSigned ? ISD::STRICT_FP_TO_SINT
23146 : ISD::STRICT_FP_TO_UINT,
23147 dl, {NVT, MVT::Other}, {Chain, Src});
23148 Chain = Res.getValue(1);
23149 } else {
23150 Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, dl,
23151 NVT, Src);
23152 }
23153
23154 // TODO: Need to add exception check code for strict FP.
23155 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
23156
23157 if (IsStrict)
23158 return DAG.getMergeValues({Res, Chain}, dl);
23159 return Res;
23160 }
23161
23162 // v8f64->v8i32 is legal, but we need v8i32 to be custom for v8f32.
23163 if (VT == MVT::v8i32 && SrcVT == MVT::v8f64) {
23164       assert(!IsSigned && "Expected unsigned conversion!");
23165       assert(Subtarget.useAVX512Regs() && "Requires avx512f");
23166 return Op;
23167 }
23168
23169 // Widen vXi32 fp_to_uint with avx512f to 512-bit source.
23170 if ((VT == MVT::v4i32 || VT == MVT::v8i32) &&
23171 (SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32) &&
23172 Subtarget.useAVX512Regs()) {
23173       assert(!IsSigned && "Expected unsigned conversion!");
23174       assert(!Subtarget.hasVLX() && "Unexpected features!");
23175 MVT WideVT = SrcVT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
23176 MVT ResVT = SrcVT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
23177 // Need to concat with zero vector for strict fp to avoid spurious
23178 // exceptions.
23179 // TODO: Should we just do this for non-strict as well?
23180 SDValue Tmp =
23181 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
23182 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
23183 DAG.getIntPtrConstant(0, dl));
23184
23185 if (IsStrict) {
23186 Res = DAG.getNode(ISD::STRICT_FP_TO_UINT, dl, {ResVT, MVT::Other},
23187 {Chain, Src});
23188 Chain = Res.getValue(1);
23189 } else {
23190 Res = DAG.getNode(ISD::FP_TO_UINT, dl, ResVT, Src);
23191 }
23192
23193 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
23194 DAG.getIntPtrConstant(0, dl));
23195
23196 if (IsStrict)
23197 return DAG.getMergeValues({Res, Chain}, dl);
23198 return Res;
23199 }
23200
23201 // Widen vXi64 fp_to_uint/fp_to_sint with avx512dq to 512-bit source.
23202 if ((VT == MVT::v2i64 || VT == MVT::v4i64) &&
23203 (SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32) &&
23204 Subtarget.useAVX512Regs() && Subtarget.hasDQI()) {
23205       assert(!Subtarget.hasVLX() && "Unexpected features!");
23206 MVT WideVT = SrcVT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
23207 // Need to concat with zero vector for strict fp to avoid spurious
23208 // exceptions.
23209 // TODO: Should we just do this for non-strict as well?
23210 SDValue Tmp =
23211 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
23212 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
23213 DAG.getIntPtrConstant(0, dl));
23214
23215 if (IsStrict) {
23216 Res = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
23217 {Chain, Src});
23218 Chain = Res.getValue(1);
23219 } else {
23220 Res = DAG.getNode(Op.getOpcode(), dl, MVT::v8i64, Src);
23221 }
23222
23223 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
23224 DAG.getIntPtrConstant(0, dl));
23225
23226 if (IsStrict)
23227 return DAG.getMergeValues({Res, Chain}, dl);
23228 return Res;
23229 }
23230
23231 if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
23232 if (!Subtarget.hasVLX()) {
23233         // Non-strict nodes without VLX can be widened to v4f32->v4i64 by the
23234         // type legalizer and then widened again by vector op legalization.
23235 if (!IsStrict)
23236 return SDValue();
23237
23238 SDValue Zero = DAG.getConstantFP(0.0, dl, MVT::v2f32);
23239 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f32,
23240 {Src, Zero, Zero, Zero});
23241 Tmp = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
23242 {Chain, Tmp});
23243 SDValue Chain = Tmp.getValue(1);
23244 Tmp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Tmp,
23245 DAG.getIntPtrConstant(0, dl));
23246 return DAG.getMergeValues({Tmp, Chain}, dl);
23247 }
23248
23249       assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL");
23250 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
23251 DAG.getUNDEF(MVT::v2f32));
23252 if (IsStrict) {
23253 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI
23254 : X86ISD::STRICT_CVTTP2UI;
23255 return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp});
23256 }
23257 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
23258 return DAG.getNode(Opc, dl, VT, Tmp);
23259 }
23260
23261 // Generate optimized instructions for pre AVX512 unsigned conversions from
23262 // vXf32 to vXi32.
23263 if ((VT == MVT::v4i32 && SrcVT == MVT::v4f32) ||
23264 (VT == MVT::v4i32 && SrcVT == MVT::v4f64) ||
23265 (VT == MVT::v8i32 && SrcVT == MVT::v8f32)) {
23266       assert(!IsSigned && "Expected unsigned conversion!");
23267 return expandFP_TO_UINT_SSE(VT, Src, dl, DAG, Subtarget);
23268 }
23269
23270 return SDValue();
23271 }
23272
23273   assert(!VT.isVector());
23274
23275 bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT);
23276
23277 if (!IsSigned && UseSSEReg) {
23278 // Conversions from f32/f64 with AVX512 should be legal.
23279 if (Subtarget.hasAVX512())
23280 return Op;
23281
23282 // We can leverage the specific way the "cvttss2si/cvttsd2si" instruction
23283 // behaves on out of range inputs to generate optimized conversions.
23284 if (!IsStrict && ((VT == MVT::i32 && !Subtarget.is64Bit()) ||
23285 (VT == MVT::i64 && Subtarget.is64Bit()))) {
23286 unsigned DstBits = VT.getScalarSizeInBits();
23287 APInt UIntLimit = APInt::getSignMask(DstBits);
23288 SDValue FloatOffset = DAG.getNode(ISD::UINT_TO_FP, dl, SrcVT,
23289 DAG.getConstant(UIntLimit, dl, VT));
23290 MVT SrcVecVT = MVT::getVectorVT(SrcVT, 128 / SrcVT.getScalarSizeInBits());
23291
23292 // Calculate the converted result for values in the range:
23293 // (i32) 0 to 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
23294 // (i64) 0 to 2^63-1 ("Small") and from 2^63 to 2^64-1 ("Big").
23295 SDValue Small =
23296 DAG.getNode(X86ISD::CVTTS2SI, dl, VT,
23297 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT, Src));
23298 SDValue Big = DAG.getNode(
23299 X86ISD::CVTTS2SI, dl, VT,
23300 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT,
23301 DAG.getNode(ISD::FSUB, dl, SrcVT, Src, FloatOffset)));
23302
23303 // The "CVTTS2SI" instruction conveniently sets the sign bit if
23304 // and only if the value was out of range. So we can use that
23305       // as our indicator of whether to use "Big" instead of "Small".
23306 //
23307 // Use "Small" if "IsOverflown" has all bits cleared
23308 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
23309 SDValue IsOverflown = DAG.getNode(
23310 ISD::SRA, dl, VT, Small, DAG.getConstant(DstBits - 1, dl, MVT::i8));
23311 return DAG.getNode(ISD::OR, dl, VT, Small,
23312 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
23313 }
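// This mirrors expandFP_TO_UINT_SSE in scalar form: for i32, an input x in
// [2^31, 2^32) makes cvttss2si/cvttsd2si return 0x80000000, the shift
// yields all-ones, and 0x80000000 | cvtt(x - 2^31) reconstructs x; the i64
// case is analogous with 2^63.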
23314
23315 // Use default expansion for i64.
23316 if (VT == MVT::i64)
23317 return SDValue();
23318
23319     assert(VT == MVT::i32 && "Unexpected VT!");
23320
23321 // Promote i32 to i64 and use a signed operation on 64-bit targets.
23322 // FIXME: This does not generate an invalid exception if the input does not
23323 // fit in i32. PR44019
23324 if (Subtarget.is64Bit()) {
23325 if (IsStrict) {
23326 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i64, MVT::Other},
23327 {Chain, Src});
23328 Chain = Res.getValue(1);
23329 } else
23330 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src);
23331
23332 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
23333 if (IsStrict)
23334 return DAG.getMergeValues({Res, Chain}, dl);
23335 return Res;
23336 }
23337
23338 // Use default expansion for SSE1/2 targets without SSE3. With SSE3 we can
23339 // use fisttp which will be handled later.
23340 if (!Subtarget.hasSSE3())
23341 return SDValue();
23342 }
23343
23344 // Promote i16 to i32 if we can use a SSE operation or the type is f128.
23345 // FIXME: This does not generate an invalid exception if the input does not
23346 // fit in i16. PR44019
23347 if (VT == MVT::i16 && (UseSSEReg || SrcVT == MVT::f128)) {
23348     assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!");
23349 if (IsStrict) {
23350 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i32, MVT::Other},
23351 {Chain, Src});
23352 Chain = Res.getValue(1);
23353 } else
23354 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);
23355
23356 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
23357 if (IsStrict)
23358 return DAG.getMergeValues({Res, Chain}, dl);
23359 return Res;
23360 }
23361
23362 // If this is a FP_TO_SINT using SSEReg we're done.
23363 if (UseSSEReg && IsSigned)
23364 return Op;
23365
23366 // fp128 needs to use a libcall.
23367 if (SrcVT == MVT::f128) {
23368 RTLIB::Libcall LC;
23369 if (IsSigned)
23370 LC = RTLIB::getFPTOSINT(SrcVT, VT);
23371 else
23372 LC = RTLIB::getFPTOUINT(SrcVT, VT);
23373
23374 MakeLibCallOptions CallOptions;
23375 std::pair<SDValue, SDValue> Tmp = makeLibCall(DAG, LC, VT, Src, CallOptions,
23376 SDLoc(Op), Chain);
23377
23378 if (IsStrict)
23379 return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
23380
23381 return Tmp.first;
23382 }
23383
23384 // Fall back to X87.
23385 if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned, Chain)) {
23386 if (IsStrict)
23387 return DAG.getMergeValues({V, Chain}, dl);
23388 return V;
23389 }
23390
23391 llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.")::llvm::llvm_unreachable_internal("Expected FP_TO_INTHelper to handle all remaining cases."
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 23391)
;
23392}
23393
23394SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op,
23395 SelectionDAG &DAG) const {
23396 SDValue Src = Op.getOperand(0);
23397 MVT SrcVT = Src.getSimpleValueType();
23398
23399 if (SrcVT == MVT::f16)
23400 return SDValue();
23401
23402 // If the source is in an SSE register, the node is Legal.
23403 if (isScalarFPTypeInSSEReg(SrcVT))
23404 return Op;
23405
23406 return LRINT_LLRINTHelper(Op.getNode(), DAG);
23407}
23408
23409SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N,
23410 SelectionDAG &DAG) const {
23411 EVT DstVT = N->getValueType(0);
23412 SDValue Src = N->getOperand(0);
23413 EVT SrcVT = Src.getValueType();
23414
23415 if (SrcVT != MVT::f32 && SrcVT != MVT::f64 && SrcVT != MVT::f80) {
23416 // f16 must be promoted before using the lowering in this routine.
23417 // fp128 does not use this lowering.
23418 return SDValue();
23419 }
23420
23421 SDLoc DL(N);
23422 SDValue Chain = DAG.getEntryNode();
23423
23424 bool UseSSE = isScalarFPTypeInSSEReg(SrcVT);
23425
23426 // If we're converting from SSE, the stack slot needs to hold both types.
23427 // Otherwise it only needs to hold the DstVT.
23428 EVT OtherVT = UseSSE ? SrcVT : DstVT;
23429 SDValue StackPtr = DAG.CreateStackTemporary(DstVT, OtherVT);
23430 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
23431 MachinePointerInfo MPI =
23432 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
23433
23434 if (UseSSE) {
23435     assert(DstVT == MVT::i64 && "Invalid LRINT/LLRINT to lower!");
23436 Chain = DAG.getStore(Chain, DL, Src, StackPtr, MPI);
23437 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
23438 SDValue Ops[] = { Chain, StackPtr };
23439
23440 Src = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, SrcVT, MPI,
23441 /*Align*/ std::nullopt,
23442 MachineMemOperand::MOLoad);
23443 Chain = Src.getValue(1);
23444 }
23445
23446 SDValue StoreOps[] = { Chain, Src, StackPtr };
23447 Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, DL, DAG.getVTList(MVT::Other),
23448 StoreOps, DstVT, MPI, /*Align*/ std::nullopt,
23449 MachineMemOperand::MOStore);
23450
23451 return DAG.getLoad(DstVT, DL, Chain, StackPtr, MPI);
23452}
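// Flow of the helper above: when the source is in an SSE register it is
// spilled and reloaded through an x87 FLD, then X86ISD::FIST stores the
// value as an integer using the current x87 rounding mode (matching
// lrint/llrint semantics), and the result is loaded back from the slot.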
23453
23454SDValue
23455X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
23456 // This is based on the TargetLowering::expandFP_TO_INT_SAT implementation,
23457 // but making use of X86 specifics to produce better instruction sequences.
23458 SDNode *Node = Op.getNode();
23459 bool IsSigned = Node->getOpcode() == ISD::FP_TO_SINT_SAT;
23460 unsigned FpToIntOpcode = IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
23461 SDLoc dl(SDValue(Node, 0));
23462 SDValue Src = Node->getOperand(0);
23463
23464 // There are three types involved here: SrcVT is the source floating point
23465 // type, DstVT is the type of the result, and TmpVT is the result of the
23466 // intermediate FP_TO_*INT operation we'll use (which may be a promotion of
23467 // DstVT).
23468 EVT SrcVT = Src.getValueType();
23469 EVT DstVT = Node->getValueType(0);
23470 EVT TmpVT = DstVT;
23471
23472 // This code is only for floats and doubles. Fall back to generic code for
23473 // anything else.
23474 if (!isScalarFPTypeInSSEReg(SrcVT) || isSoftFP16(SrcVT))
23475 return SDValue();
23476
23477 EVT SatVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
23478 unsigned SatWidth = SatVT.getScalarSizeInBits();
23479 unsigned DstWidth = DstVT.getScalarSizeInBits();
23480 unsigned TmpWidth = TmpVT.getScalarSizeInBits();
23481   assert(SatWidth <= DstWidth && SatWidth <= TmpWidth &&
23482          "Expected saturation width smaller than result width");
23483
23484 // Promote result of FP_TO_*INT to at least 32 bits.
23485 if (TmpWidth < 32) {
23486 TmpVT = MVT::i32;
23487 TmpWidth = 32;
23488 }
23489
23490  // Promote unsigned 32-bit conversions to 64-bit, so that we can use a
23491  // native signed conversion instead.
23492 if (SatWidth == 32 && !IsSigned && Subtarget.is64Bit()) {
23493 TmpVT = MVT::i64;
23494 TmpWidth = 64;
23495 }
23496
23497 // If the saturation width is smaller than the size of the temporary result,
23498 // we can always use signed conversion, which is native.
23499 if (SatWidth < TmpWidth)
23500 FpToIntOpcode = ISD::FP_TO_SINT;
23501
23502 // Determine minimum and maximum integer values and their corresponding
23503 // floating-point values.
23504 APInt MinInt, MaxInt;
23505 if (IsSigned) {
23506 MinInt = APInt::getSignedMinValue(SatWidth).sext(DstWidth);
23507 MaxInt = APInt::getSignedMaxValue(SatWidth).sext(DstWidth);
23508 } else {
23509 MinInt = APInt::getMinValue(SatWidth).zext(DstWidth);
23510 MaxInt = APInt::getMaxValue(SatWidth).zext(DstWidth);
23511 }
23512
23513 APFloat MinFloat(DAG.EVTToAPFloatSemantics(SrcVT));
23514 APFloat MaxFloat(DAG.EVTToAPFloatSemantics(SrcVT));
23515
23516 APFloat::opStatus MinStatus = MinFloat.convertFromAPInt(
23517 MinInt, IsSigned, APFloat::rmTowardZero);
23518 APFloat::opStatus MaxStatus = MaxFloat.convertFromAPInt(
23519 MaxInt, IsSigned, APFloat::rmTowardZero);
23520 bool AreExactFloatBounds = !(MinStatus & APFloat::opStatus::opInexact)
23521 && !(MaxStatus & APFloat::opStatus::opInexact);
23522
23523 SDValue MinFloatNode = DAG.getConstantFP(MinFloat, dl, SrcVT);
23524 SDValue MaxFloatNode = DAG.getConstantFP(MaxFloat, dl, SrcVT);
23525
23526 // If the integer bounds are exactly representable as floats, emit a
23527 // min+max+fptoi sequence. Otherwise use comparisons and selects.
23528 if (AreExactFloatBounds) {
23529 if (DstVT != TmpVT) {
23530 // Clamp by MinFloat from below. If Src is NaN, propagate NaN.
23531 SDValue MinClamped = DAG.getNode(
23532 X86ISD::FMAX, dl, SrcVT, MinFloatNode, Src);
23533 // Clamp by MaxFloat from above. If Src is NaN, propagate NaN.
23534 SDValue BothClamped = DAG.getNode(
23535 X86ISD::FMIN, dl, SrcVT, MaxFloatNode, MinClamped);
23536 // Convert clamped value to integer.
23537 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, BothClamped);
23538
23539 // NaN will become INDVAL, with the top bit set and the rest zero.
23540 // Truncation will discard the top bit, resulting in zero.
23541 return DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
23542 }
23543
23544 // Clamp by MinFloat from below. If Src is NaN, the result is MinFloat.
23545 SDValue MinClamped = DAG.getNode(
23546 X86ISD::FMAX, dl, SrcVT, Src, MinFloatNode);
23547 // Clamp by MaxFloat from above. NaN cannot occur.
23548 SDValue BothClamped = DAG.getNode(
23549 X86ISD::FMINC, dl, SrcVT, MinClamped, MaxFloatNode);
23550 // Convert clamped value to integer.
23551 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, DstVT, BothClamped);
23552
23553 if (!IsSigned) {
23554 // In the unsigned case we're done, because we mapped NaN to MinFloat,
23555 // which is zero.
23556 return FpToInt;
23557 }
23558
23559 // Otherwise, select zero if Src is NaN.
23560 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
23561 return DAG.getSelectCC(
23562 dl, Src, Src, ZeroInt, FpToInt, ISD::CondCode::SETUO);
23563 }
23564
23565 SDValue MinIntNode = DAG.getConstant(MinInt, dl, DstVT);
23566 SDValue MaxIntNode = DAG.getConstant(MaxInt, dl, DstVT);
23567
23568 // Result of direct conversion, which may be selected away.
23569 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, Src);
23570
23571 if (DstVT != TmpVT) {
23572 // NaN will become INDVAL, with the top bit set and the rest zero.
23573 // Truncation will discard the top bit, resulting in zero.
23574 FpToInt = DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
23575 }
23576
23577 SDValue Select = FpToInt;
23578 // For signed conversions where we saturate to the same size as the
23579 // result type of the fptoi instructions, INDVAL coincides with integer
23580 // minimum, so we don't need to explicitly check it.
23581 if (!IsSigned || SatWidth != TmpVT.getScalarSizeInBits()) {
23582 // If Src ULT MinFloat, select MinInt. In particular, this also selects
23583 // MinInt if Src is NaN.
23584 Select = DAG.getSelectCC(
23585 dl, Src, MinFloatNode, MinIntNode, Select, ISD::CondCode::SETULT);
23586 }
23587
23588 // If Src OGT MaxFloat, select MaxInt.
23589 Select = DAG.getSelectCC(
23590 dl, Src, MaxFloatNode, MaxIntNode, Select, ISD::CondCode::SETOGT);
23591
23592 // In the unsigned case we are done, because we mapped NaN to MinInt, which
23593 // is already zero. The promoted case was already handled above.
23594 if (!IsSigned || DstVT != TmpVT) {
23595 return Select;
23596 }
23597
23598 // Otherwise, select 0 if Src is NaN.
23599 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
23600 return DAG.getSelectCC(
23601 dl, Src, Src, ZeroInt, Select, ISD::CondCode::SETUO);
23602}
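// Illustrative sketch (standalone, not part of the lowering itself; helper
// name is made up): a scalar reference for the saturating semantics produced
// above. Out-of-range values clamp to the integer min/max and NaN maps to
// zero, mirroring the MinFloat/MaxFloat clamps and the SETUO select.
#include <cmath>
#include <cstdint>
#include <limits>
static int32_t fptosi_sat_i32_ref(double x) {
  if (std::isnan(x))
    return 0;                                    // NaN saturates to 0
  if (x < (double)std::numeric_limits<int32_t>::min())
    return std::numeric_limits<int32_t>::min();  // clamp from below
  if (x > (double)std::numeric_limits<int32_t>::max())
    return std::numeric_limits<int32_t>::max();  // clamp from above
  return (int32_t)x;                             // in range: truncate toward zero
}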
23603
23604SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
23605 bool IsStrict = Op->isStrictFPOpcode();
23606
23607 SDLoc DL(Op);
23608 MVT VT = Op.getSimpleValueType();
23609 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
23610 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
23611 MVT SVT = In.getSimpleValueType();
23612
23613 // Let f16->f80 get lowered to a libcall, except for darwin, where we should
23614 // lower it to an fp_extend via f32 (as only f16<>f32 libcalls are available)
23615 if (VT == MVT::f128 || (SVT == MVT::f16 && VT == MVT::f80 &&
23616 !Subtarget.getTargetTriple().isOSDarwin()))
23617 return SDValue();
23618
23619 if (SVT == MVT::f16) {
23620 if (Subtarget.hasFP16())
23621 return Op;
23622
23623 if (VT != MVT::f32) {
23624 if (IsStrict)
23625 return DAG.getNode(
23626 ISD::STRICT_FP_EXTEND, DL, {VT, MVT::Other},
23627 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, DL,
23628 {MVT::f32, MVT::Other}, {Chain, In})});
23629
23630 return DAG.getNode(ISD::FP_EXTEND, DL, VT,
23631 DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, In));
23632 }
23633
23634 if (!Subtarget.hasF16C()) {
23635 if (!Subtarget.getTargetTriple().isOSDarwin())
23636 return SDValue();
23637
23638      assert(VT == MVT::f32 && SVT == MVT::f16 && "unexpected extend libcall");
23639
23640 // Need a libcall, but ABI for f16 is soft-float on MacOS.
23641 TargetLowering::CallLoweringInfo CLI(DAG);
23642 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
23643
23644 In = DAG.getBitcast(MVT::i16, In);
23645 TargetLowering::ArgListTy Args;
23646 TargetLowering::ArgListEntry Entry;
23647 Entry.Node = In;
23648 Entry.Ty = EVT(MVT::i16).getTypeForEVT(*DAG.getContext());
23649 Entry.IsSExt = false;
23650 Entry.IsZExt = true;
23651 Args.push_back(Entry);
23652
23653 SDValue Callee = DAG.getExternalSymbol(
23654 getLibcallName(RTLIB::FPEXT_F16_F32),
23655 getPointerTy(DAG.getDataLayout()));
23656 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
23657 CallingConv::C, EVT(VT).getTypeForEVT(*DAG.getContext()), Callee,
23658 std::move(Args));
23659
23660 SDValue Res;
23661 std::tie(Res,Chain) = LowerCallTo(CLI);
23662 if (IsStrict)
23663 Res = DAG.getMergeValues({Res, Chain}, DL);
23664
23665 return Res;
23666 }
23667
23668 In = DAG.getBitcast(MVT::i16, In);
23669 In = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16,
23670 getZeroVector(MVT::v8i16, Subtarget, DAG, DL), In,
23671 DAG.getIntPtrConstant(0, DL));
23672 SDValue Res;
23673 if (IsStrict) {
23674 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, DL, {MVT::v4f32, MVT::Other},
23675 {Chain, In});
23676 Chain = Res.getValue(1);
23677 } else {
23678 Res = DAG.getNode(X86ISD::CVTPH2PS, DL, MVT::v4f32, In,
23679 DAG.getTargetConstant(4, DL, MVT::i32));
23680 }
23681 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Res,
23682 DAG.getIntPtrConstant(0, DL));
23683 if (IsStrict)
23684 return DAG.getMergeValues({Res, Chain}, DL);
23685 return Res;
23686 }
23687
23688 if (!SVT.isVector())
23689 return Op;
23690
23691 if (SVT.getVectorElementType() == MVT::f16) {
23692    assert(Subtarget.hasF16C() && "Unexpected features!");
23693 if (SVT == MVT::v2f16)
23694 In = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f16, In,
23695 DAG.getUNDEF(MVT::v2f16));
23696 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8f16, In,
23697 DAG.getUNDEF(MVT::v4f16));
23698 if (IsStrict)
23699 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
23700 {Op->getOperand(0), Res});
23701 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
23702 } else if (VT == MVT::v4f64 || VT == MVT::v8f64) {
23703 return Op;
23704 }
23705
23706  assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
23707
23708 SDValue Res =
23709 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, In, DAG.getUNDEF(SVT));
23710 if (IsStrict)
23711 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
23712 {Op->getOperand(0), Res});
23713 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
23714}
23715
23716SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
23717 bool IsStrict = Op->isStrictFPOpcode();
23718
23719 SDLoc DL(Op);
23720 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
23721 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
23722 MVT VT = Op.getSimpleValueType();
23723 MVT SVT = In.getSimpleValueType();
23724
23725 if (SVT == MVT::f128 || (VT == MVT::f16 && SVT == MVT::f80))
23726 return SDValue();
23727
23728 if (VT == MVT::f16 && (SVT == MVT::f64 || SVT == MVT::f32) &&
23729 !Subtarget.hasFP16() && (SVT == MVT::f64 || !Subtarget.hasF16C())) {
23730 if (!Subtarget.getTargetTriple().isOSDarwin())
23731 return SDValue();
23732
23733 // We need a libcall but the ABI for f16 libcalls on MacOS is soft.
23734 TargetLowering::CallLoweringInfo CLI(DAG);
23735 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
23736
23737 TargetLowering::ArgListTy Args;
23738 TargetLowering::ArgListEntry Entry;
23739 Entry.Node = In;
23740 Entry.Ty = EVT(SVT).getTypeForEVT(*DAG.getContext());
23741 Entry.IsSExt = false;
23742 Entry.IsZExt = true;
23743 Args.push_back(Entry);
23744
23745 SDValue Callee = DAG.getExternalSymbol(
23746 getLibcallName(SVT == MVT::f64 ? RTLIB::FPROUND_F64_F16
23747 : RTLIB::FPROUND_F32_F16),
23748 getPointerTy(DAG.getDataLayout()));
23749 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
23750 CallingConv::C, EVT(MVT::i16).getTypeForEVT(*DAG.getContext()), Callee,
23751 std::move(Args));
23752
23753 SDValue Res;
23754 std::tie(Res, Chain) = LowerCallTo(CLI);
23755
23756 Res = DAG.getBitcast(MVT::f16, Res);
23757
23758 if (IsStrict)
23759 Res = DAG.getMergeValues({Res, Chain}, DL);
23760
23761 return Res;
23762 }
23763
23764 if (VT.getScalarType() == MVT::f16 && !Subtarget.hasFP16()) {
23765 if (!Subtarget.hasF16C() || SVT.getScalarType() != MVT::f32)
23766 return SDValue();
23767
23768 if (VT.isVector())
23769 return Op;
23770
23771 SDValue Res;
23772 SDValue Rnd = DAG.getTargetConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, DL,
23773 MVT::i32);
23774 if (IsStrict) {
23775 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4f32,
23776 DAG.getConstantFP(0, DL, MVT::v4f32), In,
23777 DAG.getIntPtrConstant(0, DL));
23778 Res = DAG.getNode(X86ISD::STRICT_CVTPS2PH, DL, {MVT::v8i16, MVT::Other},
23779 {Chain, Res, Rnd});
23780 Chain = Res.getValue(1);
23781 } else {
23782 // FIXME: Should we use zeros for upper elements for non-strict?
23783 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, In);
23784 Res = DAG.getNode(X86ISD::CVTPS2PH, DL, MVT::v8i16, Res, Rnd);
23785 }
23786
23787 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i16, Res,
23788 DAG.getIntPtrConstant(0, DL));
23789 Res = DAG.getBitcast(MVT::f16, Res);
23790
23791 if (IsStrict)
23792 return DAG.getMergeValues({Res, Chain}, DL);
23793
23794 return Res;
23795 }
23796
23797 return Op;
23798}
23799
23800static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) {
23801 bool IsStrict = Op->isStrictFPOpcode();
23802 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
23803  assert(Src.getValueType() == MVT::i16 && Op.getValueType() == MVT::f32 &&
23804         "Unexpected VT!");
23805
23806 SDLoc dl(Op);
23807 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16,
23808 DAG.getConstant(0, dl, MVT::v8i16), Src,
23809 DAG.getIntPtrConstant(0, dl));
23810
23811 SDValue Chain;
23812 if (IsStrict) {
23813 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {MVT::v4f32, MVT::Other},
23814 {Op.getOperand(0), Res});
23815 Chain = Res.getValue(1);
23816 } else {
23817 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
23818 }
23819
23820 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
23821 DAG.getIntPtrConstant(0, dl));
23822
23823 if (IsStrict)
23824 return DAG.getMergeValues({Res, Chain}, dl);
23825
23826 return Res;
23827}
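// Illustrative sketch (standalone, assumes an F16C-capable target; helper
// name is made up): the same scalar f16 -> f32 path expressed with
// intrinsics, i.e. place the i16 bits into lane 0 and use VCVTPH2PS.
#include <immintrin.h>
#include <cstdint>
static float half_to_float_ref(uint16_t h) {
  __m128i v = _mm_cvtsi32_si128(h);  // i16 bits into element 0, rest zero
  __m128 f = _mm_cvtph_ps(v);        // VCVTPH2PS: 4 x f16 -> 4 x f32
  return _mm_cvtss_f32(f);           // extract element 0
}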
23828
23829static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) {
23830 bool IsStrict = Op->isStrictFPOpcode();
23831 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
23832  assert(Src.getValueType() == MVT::f32 && Op.getValueType() == MVT::i16 &&
23833         "Unexpected VT!");
23834
23835 SDLoc dl(Op);
23836 SDValue Res, Chain;
23837 if (IsStrict) {
23838 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v4f32,
23839 DAG.getConstantFP(0, dl, MVT::v4f32), Src,
23840 DAG.getIntPtrConstant(0, dl));
23841 Res = DAG.getNode(
23842 X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
23843 {Op.getOperand(0), Res, DAG.getTargetConstant(4, dl, MVT::i32)});
23844 Chain = Res.getValue(1);
23845 } else {
23846 // FIXME: Should we use zeros for upper elements for non-strict?
23847 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, Src);
23848 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
23849 DAG.getTargetConstant(4, dl, MVT::i32));
23850 }
23851
23852 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Res,
23853 DAG.getIntPtrConstant(0, dl));
23854
23855 if (IsStrict)
23856 return DAG.getMergeValues({Res, Chain}, dl);
23857
23858 return Res;
23859}
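// Illustrative sketch of the reverse direction (standalone, F16C assumed;
// helper name is made up). The rounding immediate 4 used above corresponds
// to _MM_FROUND_CUR_DIRECTION.
#include <immintrin.h>
#include <cstdint>
static uint16_t float_to_half_ref(float f) {
  __m128 v = _mm_set_ss(f);                               // f into lane 0
  __m128i h = _mm_cvtps_ph(v, _MM_FROUND_CUR_DIRECTION);  // VCVTPS2PH
  return (uint16_t)_mm_extract_epi16(h, 0);               // take lane 0
}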
23860
23861SDValue X86TargetLowering::LowerFP_TO_BF16(SDValue Op,
23862 SelectionDAG &DAG) const {
23863 SDLoc DL(Op);
23864 MakeLibCallOptions CallOptions;
23865 RTLIB::Libcall LC =
23866 RTLIB::getFPROUND(Op.getOperand(0).getValueType(), MVT::bf16);
23867 SDValue Res =
23868 makeLibCall(DAG, LC, MVT::f32, Op.getOperand(0), CallOptions, DL).first;
23869 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16,
23870 DAG.getBitcast(MVT::i32, Res));
23871}
23872
23873/// Depending on uarch and/or optimizing for size, we might prefer to use a
23874/// vector operation in place of the typical scalar operation.
23875static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG,
23876 const X86Subtarget &Subtarget) {
23877 // If both operands have other uses, this is probably not profitable.
23878 SDValue LHS = Op.getOperand(0);
23879 SDValue RHS = Op.getOperand(1);
23880 if (!LHS.hasOneUse() && !RHS.hasOneUse())
23881 return Op;
23882
23883 // FP horizontal add/sub were added with SSE3. Integer with SSSE3.
23884 bool IsFP = Op.getSimpleValueType().isFloatingPoint();
23885 if (IsFP && !Subtarget.hasSSE3())
23886 return Op;
23887 if (!IsFP && !Subtarget.hasSSSE3())
23888 return Op;
23889
23890 // Extract from a common vector.
23891 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
23892 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
23893 LHS.getOperand(0) != RHS.getOperand(0) ||
23894 !isa<ConstantSDNode>(LHS.getOperand(1)) ||
23895 !isa<ConstantSDNode>(RHS.getOperand(1)) ||
23896 !shouldUseHorizontalOp(true, DAG, Subtarget))
23897 return Op;
23898
23899 // Allow commuted 'hadd' ops.
23900 // TODO: Allow commuted (f)sub by negating the result of (F)HSUB?
23901 unsigned HOpcode;
23902 switch (Op.getOpcode()) {
23903 case ISD::ADD: HOpcode = X86ISD::HADD; break;
23904 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
23905 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
23906 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
23907 default:
23908    llvm_unreachable("Trying to lower unsupported opcode to horizontal op");
23909 }
23910 unsigned LExtIndex = LHS.getConstantOperandVal(1);
23911 unsigned RExtIndex = RHS.getConstantOperandVal(1);
23912 if ((LExtIndex & 1) == 1 && (RExtIndex & 1) == 0 &&
23913 (HOpcode == X86ISD::HADD || HOpcode == X86ISD::FHADD))
23914 std::swap(LExtIndex, RExtIndex);
23915
23916 if ((LExtIndex & 1) != 0 || RExtIndex != (LExtIndex + 1))
23917 return Op;
23918
23919 SDValue X = LHS.getOperand(0);
23920 EVT VecVT = X.getValueType();
23921 unsigned BitWidth = VecVT.getSizeInBits();
23922 unsigned NumLanes = BitWidth / 128;
23923 unsigned NumEltsPerLane = VecVT.getVectorNumElements() / NumLanes;
23924  assert((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) &&
23925         "Not expecting illegal vector widths here");
23926
23927 // Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit
23928 // equivalent, so extract the 256/512-bit source op to 128-bit if we can.
23929 SDLoc DL(Op);
23930 if (BitWidth == 256 || BitWidth == 512) {
23931 unsigned LaneIdx = LExtIndex / NumEltsPerLane;
23932 X = extract128BitVector(X, LaneIdx * NumEltsPerLane, DAG, DL);
23933 LExtIndex %= NumEltsPerLane;
23934 }
23935
23936 // add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0
23937 // add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0
23938 // add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1
23939 // sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0
23940 SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X);
23941 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp,
23942 DAG.getIntPtrConstant(LExtIndex / 2, DL));
23943}
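// Illustrative sketch of the rewrite commented above for the f32 case
// (standalone, assumes SSE3; helper name is made up):
#include <immintrin.h>
static float add_low_pair_ref(__m128 x) {
  // add (extractelt x, 0), (extractelt x, 1) --> extractelt (haddps x, x), 0
  __m128 h = _mm_hadd_ps(x, x);
  return _mm_cvtss_f32(h);
}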
23944
23945/// Depending on uarch and/or optimizing for size, we might prefer to use a
23946/// vector operation in place of the typical scalar operation.
23947SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const {
23948  assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) &&
23949         "Only expecting float/double");
23950 return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
23951}
23952
23953/// ISD::FROUND is defined to round to nearest with ties rounding away from 0.
23954/// This mode isn't supported in hardware on X86. But as long as we aren't
23955/// compiling with trapping math, we can emulate this with
23956/// trunc(X + copysign(nextafter(0.5, 0.0), X)).
23957static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) {
23958 SDValue N0 = Op.getOperand(0);
23959 SDLoc dl(Op);
23960 MVT VT = Op.getSimpleValueType();
23961
23962 // N0 += copysign(nextafter(0.5, 0.0), N0)
23963 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
23964 bool Ignored;
23965 APFloat Point5Pred = APFloat(0.5f);
23966 Point5Pred.convert(Sem, APFloat::rmNearestTiesToEven, &Ignored);
23967 Point5Pred.next(/*nextDown*/true);
23968
23969 SDValue Adder = DAG.getNode(ISD::FCOPYSIGN, dl, VT,
23970 DAG.getConstantFP(Point5Pred, dl, VT), N0);
23971 N0 = DAG.getNode(ISD::FADD, dl, VT, N0, Adder);
23972
23973 // Truncate the result to remove fraction.
23974 return DAG.getNode(ISD::FTRUNC, dl, VT, N0);
23975}
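// Illustrative sketch of the emulation described in the comment above, in
// plain scalar form (standalone; helper name is made up). nextafter(0.5, 0.0)
// is the largest value strictly below 0.5, which is the rationale given above
// for why halfway cases round away from zero without pushing values just
// below .5 upward.
#include <cmath>
static double fround_ref(double x) {
  double pred_half = std::nextafter(0.5, 0.0);          // 0.5 minus one ulp
  return std::trunc(x + std::copysign(pred_half, x));   // ties away from zero
}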
23976
23977/// The only differences between FABS and FNEG are the mask and the logic op.
23978/// FNEG also has a folding opportunity for FNEG(FABS(x)).
23979static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
23980  assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
23981         "Wrong opcode for lowering FABS or FNEG.");
23982
23983 bool IsFABS = (Op.getOpcode() == ISD::FABS);
23984
23985 // If this is a FABS and it has an FNEG user, bail out to fold the combination
23986 // into an FNABS. We'll lower the FABS after that if it is still in use.
23987 if (IsFABS)
23988 for (SDNode *User : Op->uses())
23989 if (User->getOpcode() == ISD::FNEG)
23990 return Op;
23991
23992 SDLoc dl(Op);
23993 MVT VT = Op.getSimpleValueType();
23994
23995 bool IsF128 = (VT == MVT::f128);
23996  assert(VT.isFloatingPoint() && VT != MVT::f80 &&
23997         DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
23998         "Unexpected type in LowerFABSorFNEG");
23999
24000 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
24001 // decide if we should generate a 16-byte constant mask when we only need 4 or
24002 // 8 bytes for the scalar case.
24003
24004 // There are no scalar bitwise logical SSE/AVX instructions, so we
24005 // generate a 16-byte vector constant and logic op even for the scalar case.
24006 // Using a 16-byte mask allows folding the load of the mask with
24007 // the logic op, so it can save (~4 bytes) on code size.
24008 bool IsFakeVector = !VT.isVector() && !IsF128;
24009 MVT LogicVT = VT;
24010 if (IsFakeVector)
24011 LogicVT = (VT == MVT::f64) ? MVT::v2f64
24012 : (VT == MVT::f32) ? MVT::v4f32
24013 : MVT::v8f16;
24014
24015 unsigned EltBits = VT.getScalarSizeInBits();
24016 // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
24017 APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) :
24018 APInt::getSignMask(EltBits);
24019 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
24020 SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
24021
24022 SDValue Op0 = Op.getOperand(0);
24023 bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
24024 unsigned LogicOp = IsFABS ? X86ISD::FAND :
24025 IsFNABS ? X86ISD::FOR :
24026 X86ISD::FXOR;
24027 SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
24028
24029 if (VT.isVector() || IsF128)
24030 return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
24031
24032 // For the scalar case extend to a 128-bit vector, perform the logic op,
24033 // and extract the scalar result back out.
24034 Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
24035 SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
24036 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
24037 DAG.getIntPtrConstant(0, dl));
24038}
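// Illustrative sketch of the masks chosen above, applied to a scalar f32
// (standalone; helper names are made up): FABS clears the sign bit with
// 0x7f..., FNEG flips it with 0x80....
#include <cstdint>
#include <cstring>
static float fabs_ref(float x) {
  uint32_t bits;
  std::memcpy(&bits, &x, sizeof(bits));
  bits &= 0x7fffffffu;                 // FAND with the sign-cleared mask
  std::memcpy(&x, &bits, sizeof(bits));
  return x;
}
static float fneg_ref(float x) {
  uint32_t bits;
  std::memcpy(&bits, &x, sizeof(bits));
  bits ^= 0x80000000u;                 // FXOR with the sign mask
  std::memcpy(&x, &bits, sizeof(bits));
  return x;
}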
24039
24040static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
24041 SDValue Mag = Op.getOperand(0);
24042 SDValue Sign = Op.getOperand(1);
24043 SDLoc dl(Op);
24044
24045 // If the sign operand is smaller, extend it first.
24046 MVT VT = Op.getSimpleValueType();
24047 if (Sign.getSimpleValueType().bitsLT(VT))
24048 Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
24049
24050 // And if it is bigger, shrink it first.
24051 if (Sign.getSimpleValueType().bitsGT(VT))
24052 Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign,
24053 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
24054
24055 // At this point the operands and the result should have the same
24056 // type, and that won't be f80 since that is not custom lowered.
24057 bool IsF128 = (VT == MVT::f128);
24058  assert(VT.isFloatingPoint() && VT != MVT::f80 &&
24059         DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
24060         "Unexpected type in LowerFCOPYSIGN");
24061
24062 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
24063
24064 // Perform all scalar logic operations as 16-byte vectors because there are no
24065 // scalar FP logic instructions in SSE.
24066 // TODO: This isn't necessary. If we used scalar types, we might avoid some
24067 // unnecessary splats, but we might miss load folding opportunities. Should
24068 // this decision be based on OptimizeForSize?
24069 bool IsFakeVector = !VT.isVector() && !IsF128;
24070 MVT LogicVT = VT;
24071 if (IsFakeVector)
24072 LogicVT = (VT == MVT::f64) ? MVT::v2f64
24073 : (VT == MVT::f32) ? MVT::v4f32
24074 : MVT::v8f16;
24075
24076 // The mask constants are automatically splatted for vector types.
24077 unsigned EltSizeInBits = VT.getScalarSizeInBits();
24078 SDValue SignMask = DAG.getConstantFP(
24079 APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
24080 SDValue MagMask = DAG.getConstantFP(
24081 APFloat(Sem, APInt::getSignedMaxValue(EltSizeInBits)), dl, LogicVT);
24082
24083 // First, clear all bits but the sign bit from the second operand (sign).
24084 if (IsFakeVector)
24085 Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
24086 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
24087
24088 // Next, clear the sign bit from the first operand (magnitude).
24089 // TODO: If we had general constant folding for FP logic ops, this check
24090 // wouldn't be necessary.
24091 SDValue MagBits;
24092 if (ConstantFPSDNode *Op0CN = isConstOrConstSplatFP(Mag)) {
24093 APFloat APF = Op0CN->getValueAPF();
24094 APF.clearSign();
24095 MagBits = DAG.getConstantFP(APF, dl, LogicVT);
24096 } else {
24097 // If the magnitude operand wasn't a constant, we need to AND out the sign.
24098 if (IsFakeVector)
24099 Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
24100 MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
24101 }
24102
24103 // OR the magnitude value with the sign bit.
24104 SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
24105 return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
24106 DAG.getIntPtrConstant(0, dl));
24107}
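// Illustrative sketch of the AND/AND/OR sequence built above, on scalar f32
// bits (standalone; helper name is made up):
#include <cstdint>
#include <cstring>
static float copysign_ref(float mag, float sgn) {
  uint32_t m, s;
  std::memcpy(&m, &mag, sizeof(m));
  std::memcpy(&s, &sgn, sizeof(s));
  uint32_t r = (m & 0x7fffffffu)   // MagBits: magnitude with its sign cleared
             | (s & 0x80000000u);  // SignBit: only the sign of the second operand
  float out;
  std::memcpy(&out, &r, sizeof(out));
  return out;
}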
24108
24109static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
24110 SDValue N0 = Op.getOperand(0);
24111 SDLoc dl(Op);
24112 MVT VT = Op.getSimpleValueType();
24113
24114 MVT OpVT = N0.getSimpleValueType();
24115  assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
24116         "Unexpected type for FGETSIGN");
24117
24118 // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
24119 MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
24120 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
24121 Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
24122 Res = DAG.getZExtOrTrunc(Res, dl, VT);
24123 Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
24124 return Res;
24125}
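// Illustrative sketch of what (AND (MOVMSK ...) 1) computes for element 0,
// on scalar f32 bits (standalone; helper name is made up):
#include <cstdint>
#include <cstring>
static int fgetsign_ref(float x) {
  uint32_t bits;
  std::memcpy(&bits, &x, sizeof(bits));
  return (bits >> 31) & 1;   // the sign bit of the value
}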
24126
24127/// Helper for attempting to create a X86ISD::BT node.
24128static SDValue getBT(SDValue Src, SDValue BitNo, const SDLoc &DL, SelectionDAG &DAG) {
24129 // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
24130 // instruction. Since the shift amount is in-range-or-undefined, we know
24131 // that doing a bittest on the i32 value is ok. We extend to i32 because
24132 // the encoding for the i16 version is larger than the i32 version.
24133 // Also promote i16 to i32 for performance / code size reason.
24134 if (Src.getValueType().getScalarSizeInBits() < 32)
24135 Src = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Src);
24136
24137 // No legal type found, give up.
24138 if (!DAG.getTargetLoweringInfo().isTypeLegal(Src.getValueType()))
24139 return SDValue();
24140
24141 // See if we can use the 32-bit instruction instead of the 64-bit one for a
24142 // shorter encoding. Since the former takes the modulo 32 of BitNo and the
24143 // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
24144 // known to be zero.
24145 if (Src.getValueType() == MVT::i64 &&
24146 DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
24147 Src = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Src);
24148
24149 // If the operand types disagree, extend the shift amount to match. Since
24150 // BT ignores high bits (like shifts) we can use anyextend.
24151 if (Src.getValueType() != BitNo.getValueType()) {
24152 // Peek through a mask/modulo operation.
24153 // TODO: DAGCombine fails to do this as it just checks isTruncateFree, but
24154 // we probably need a better IsDesirableToPromoteOp to handle this as well.
24155 if (BitNo.getOpcode() == ISD::AND && BitNo->hasOneUse())
24156 BitNo = DAG.getNode(ISD::AND, DL, Src.getValueType(),
24157 DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(),
24158 BitNo.getOperand(0)),
24159 DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(),
24160 BitNo.getOperand(1)));
24161 else
24162 BitNo = DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(), BitNo);
24163 }
24164
24165 return DAG.getNode(X86ISD::BT, DL, MVT::i32, Src, BitNo);
24166}
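// Illustrative sketch of the BT semantics the helper relies on (standalone;
// helper name is made up): with a register operand, the bit index is taken
// modulo the operand width, so any high bits of BitNo are ignored.
#include <cstdint>
static bool bit_test_ref(uint32_t src, uint32_t bitno) {
  return (src >> (bitno % 32)) & 1;   // BT r32, r32 uses bitno mod 32
}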
24167
24168/// Helper for creating a X86ISD::SETCC node.
24169static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
24170 SelectionDAG &DAG) {
24171 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
24172 DAG.getTargetConstant(Cond, dl, MVT::i8), EFLAGS);
24173}
24174
24175/// Recursive helper for combineVectorSizedSetCCEquality() to see if we have a
24176/// recognizable memcmp expansion.
24177static bool isOrXorXorTree(SDValue X, bool Root = true) {
24178 if (X.getOpcode() == ISD::OR)
24179 return isOrXorXorTree(X.getOperand(0), false) &&
24180 isOrXorXorTree(X.getOperand(1), false);
24181 if (Root)
24182 return false;
24183 return X.getOpcode() == ISD::XOR;
24184}
24185
24186/// Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp
24187/// expansion.
24188template <typename F>
24189static SDValue emitOrXorXorTree(SDValue X, const SDLoc &DL, SelectionDAG &DAG,
24190 EVT VecVT, EVT CmpVT, bool HasPT, F SToV) {
24191 SDValue Op0 = X.getOperand(0);
24192 SDValue Op1 = X.getOperand(1);
24193 if (X.getOpcode() == ISD::OR) {
24194 SDValue A = emitOrXorXorTree(Op0, DL, DAG, VecVT, CmpVT, HasPT, SToV);
24195 SDValue B = emitOrXorXorTree(Op1, DL, DAG, VecVT, CmpVT, HasPT, SToV);
24196 if (VecVT != CmpVT)
24197 return DAG.getNode(ISD::OR, DL, CmpVT, A, B);
24198 if (HasPT)
24199 return DAG.getNode(ISD::OR, DL, VecVT, A, B);
24200 return DAG.getNode(ISD::AND, DL, CmpVT, A, B);
24201 }
24202 if (X.getOpcode() == ISD::XOR) {
24203 SDValue A = SToV(Op0);
24204 SDValue B = SToV(Op1);
24205 if (VecVT != CmpVT)
24206 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETNE);
24207 if (HasPT)
24208 return DAG.getNode(ISD::XOR, DL, VecVT, A, B);
24209 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
24210 }
24211  llvm_unreachable("Impossible");
24212}
24213
24214/// Try to map a 128-bit or larger integer comparison to vector instructions
24215/// before type legalization splits it up into chunks.
24216static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y,
24217 ISD::CondCode CC,
24218 const SDLoc &DL,
24219 SelectionDAG &DAG,
24220 const X86Subtarget &Subtarget) {
24221  assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
24222
24223 // We're looking for an oversized integer equality comparison.
24224 EVT OpVT = X.getValueType();
24225 unsigned OpSize = OpVT.getSizeInBits();
24226 if (!OpVT.isScalarInteger() || OpSize < 128)
24227 return SDValue();
24228
24229 // Ignore a comparison with zero because that gets special treatment in
24230 // EmitTest(). But make an exception for the special case of a pair of
24231 // logically-combined vector-sized operands compared to zero. This pattern may
24232 // be generated by the memcmp expansion pass with oversized integer compares
24233 // (see PR33325).
24234 bool IsOrXorXorTreeCCZero = isNullConstant(Y) && isOrXorXorTree(X);
24235 if (isNullConstant(Y) && !IsOrXorXorTreeCCZero)
24236 return SDValue();
24237
24238 // Don't perform this combine if constructing the vector will be expensive.
24239 auto IsVectorBitCastCheap = [](SDValue X) {
24240 X = peekThroughBitcasts(X);
24241 return isa<ConstantSDNode>(X) || X.getValueType().isVector() ||
24242 X.getOpcode() == ISD::LOAD;
24243 };
24244 if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) &&
24245 !IsOrXorXorTreeCCZero)
24246 return SDValue();
24247
24248 // Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands.
24249 // Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands.
24250 // Otherwise use PCMPEQ (plus AND) and mask testing.
24251 bool NoImplicitFloatOps =
24252 DAG.getMachineFunction().getFunction().hasFnAttribute(
24253 Attribute::NoImplicitFloat);
24254 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
24255 ((OpSize == 128 && Subtarget.hasSSE2()) ||
24256 (OpSize == 256 && Subtarget.hasAVX()) ||
24257 (OpSize == 512 && Subtarget.useAVX512Regs()))) {
24258 bool HasPT = Subtarget.hasSSE41();
24259
24260    // PTEST and MOVMSK are slow on Knights Landing and Knights Mill, and widened
24261 // vector registers are essentially free. (Technically, widening registers
24262 // prevents load folding, but the tradeoff is worth it.)
24263 bool PreferKOT = Subtarget.preferMaskRegisters();
24264 bool NeedZExt = PreferKOT && !Subtarget.hasVLX() && OpSize != 512;
24265
24266 EVT VecVT = MVT::v16i8;
24267 EVT CmpVT = PreferKOT ? MVT::v16i1 : VecVT;
24268 if (OpSize == 256) {
24269 VecVT = MVT::v32i8;
24270 CmpVT = PreferKOT ? MVT::v32i1 : VecVT;
24271 }
24272 EVT CastVT = VecVT;
24273 bool NeedsAVX512FCast = false;
24274 if (OpSize == 512 || NeedZExt) {
24275 if (Subtarget.hasBWI()) {
24276 VecVT = MVT::v64i8;
24277 CmpVT = MVT::v64i1;
24278 if (OpSize == 512)
24279 CastVT = VecVT;
24280 } else {
24281 VecVT = MVT::v16i32;
24282 CmpVT = MVT::v16i1;
24283 CastVT = OpSize == 512 ? VecVT
24284 : OpSize == 256 ? MVT::v8i32
24285 : MVT::v4i32;
24286 NeedsAVX512FCast = true;
24287 }
24288 }
24289
24290 auto ScalarToVector = [&](SDValue X) -> SDValue {
24291 bool TmpZext = false;
24292 EVT TmpCastVT = CastVT;
24293 if (X.getOpcode() == ISD::ZERO_EXTEND) {
24294 SDValue OrigX = X.getOperand(0);
24295 unsigned OrigSize = OrigX.getScalarValueSizeInBits();
24296 if (OrigSize < OpSize) {
24297 if (OrigSize == 128) {
24298 TmpCastVT = NeedsAVX512FCast ? MVT::v4i32 : MVT::v16i8;
24299 X = OrigX;
24300 TmpZext = true;
24301 } else if (OrigSize == 256) {
24302 TmpCastVT = NeedsAVX512FCast ? MVT::v8i32 : MVT::v32i8;
24303 X = OrigX;
24304 TmpZext = true;
24305 }
24306 }
24307 }
24308 X = DAG.getBitcast(TmpCastVT, X);
24309 if (!NeedZExt && !TmpZext)
24310 return X;
24311 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT,
24312 DAG.getConstant(0, DL, VecVT), X,
24313 DAG.getVectorIdxConstant(0, DL));
24314 };
24315
24316 SDValue Cmp;
24317 if (IsOrXorXorTreeCCZero) {
24318 // This is a bitwise-combined equality comparison of 2 pairs of vectors:
24319 // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
24320 // Use 2 vector equality compares and 'and' the results before doing a
24321 // MOVMSK.
24322 Cmp = emitOrXorXorTree(X, DL, DAG, VecVT, CmpVT, HasPT, ScalarToVector);
24323 } else {
24324 SDValue VecX = ScalarToVector(X);
24325 SDValue VecY = ScalarToVector(Y);
24326 if (VecVT != CmpVT) {
24327 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETNE);
24328 } else if (HasPT) {
24329 Cmp = DAG.getNode(ISD::XOR, DL, VecVT, VecX, VecY);
24330 } else {
24331 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);
24332 }
24333 }
24334 // AVX512 should emit a setcc that will lower to kortest.
24335 if (VecVT != CmpVT) {
24336 EVT KRegVT = CmpVT == MVT::v64i1 ? MVT::i64
24337 : CmpVT == MVT::v32i1 ? MVT::i32
24338 : MVT::i16;
24339 return DAG.getSetCC(DL, VT, DAG.getBitcast(KRegVT, Cmp),
24340 DAG.getConstant(0, DL, KRegVT), CC);
24341 }
24342 if (HasPT) {
24343 SDValue BCCmp =
24344 DAG.getBitcast(OpSize == 256 ? MVT::v4i64 : MVT::v2i64, Cmp);
24345 SDValue PT = DAG.getNode(X86ISD::PTEST, DL, MVT::i32, BCCmp, BCCmp);
24346 X86::CondCode X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
24347 SDValue X86SetCC = getSETCC(X86CC, PT, DL, DAG);
24348 return DAG.getNode(ISD::TRUNCATE, DL, VT, X86SetCC.getValue(0));
24349 }
24350 // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
24351 // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
24352 // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
24353    assert(Cmp.getValueType() == MVT::v16i8 &&
24354           "Non 128-bit vector on pre-SSE41 target");
24355 SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
24356 SDValue FFFFs = DAG.getConstant(0xFFFF, DL, MVT::i32);
24357 return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
24358 }
24359
24360 return SDValue();
24361}
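// Illustrative sketch of the memcmp-expansion shape this combine looks for,
// written as scalar 64-bit chunks (standalone; helper name is made up):
//   setcc i128 (or (xor A, B), (xor C, D)), 0, eq
#include <cstdint>
static bool equal128_ref(const uint64_t a[2], const uint64_t b[2]) {
  return ((a[0] ^ b[0]) | (a[1] ^ b[1])) == 0;
}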
24362
24363/// Helper for matching BINOP(EXTRACTELT(X,0),BINOP(EXTRACTELT(X,1),...))
24364/// style scalarized (associative) reduction patterns. Partial reductions
24365/// are supported when the pointer SrcMask is non-null.
24366/// TODO - move this to SelectionDAG?
24367static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp,
24368 SmallVectorImpl<SDValue> &SrcOps,
24369 SmallVectorImpl<APInt> *SrcMask = nullptr) {
24370 SmallVector<SDValue, 8> Opnds;
24371 DenseMap<SDValue, APInt> SrcOpMap;
24372 EVT VT = MVT::Other;
24373
24374  // Recognize a special case where a vector is cast into a wide integer to
24375 // test all 0s.
24376  assert(Op.getOpcode() == unsigned(BinOp) &&
24377         "Unexpected bit reduction opcode");
24378 Opnds.push_back(Op.getOperand(0));
24379 Opnds.push_back(Op.getOperand(1));
24380
24381 for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
24382 SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
24383 // BFS traverse all BinOp operands.
24384 if (I->getOpcode() == unsigned(BinOp)) {
24385 Opnds.push_back(I->getOperand(0));
24386 Opnds.push_back(I->getOperand(1));
24387 // Re-evaluate the number of nodes to be traversed.
24388 e += 2; // 2 more nodes (LHS and RHS) are pushed.
24389 continue;
24390 }
24391
24392    // Quit if this is not an EXTRACT_VECTOR_ELT.
24393 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
24394 return false;
24395
24396    // Quit if the index is not a constant.
24397 auto *Idx = dyn_cast<ConstantSDNode>(I->getOperand(1));
24398 if (!Idx)
24399 return false;
24400
24401 SDValue Src = I->getOperand(0);
24402 DenseMap<SDValue, APInt>::iterator M = SrcOpMap.find(Src);
24403 if (M == SrcOpMap.end()) {
24404 VT = Src.getValueType();
24405 // Quit if not the same type.
24406 if (!SrcOpMap.empty() && VT != SrcOpMap.begin()->first.getValueType())
24407 return false;
24408 unsigned NumElts = VT.getVectorNumElements();
24409 APInt EltCount = APInt::getZero(NumElts);
24410 M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first;
24411 SrcOps.push_back(Src);
24412 }
24413
24414 // Quit if element already used.
24415 unsigned CIdx = Idx->getZExtValue();
24416 if (M->second[CIdx])
24417 return false;
24418 M->second.setBit(CIdx);
24419 }
24420
24421 if (SrcMask) {
24422 // Collect the source partial masks.
24423 for (SDValue &SrcOp : SrcOps)
24424 SrcMask->push_back(SrcOpMap[SrcOp]);
24425 } else {
24426 // Quit if not all elements are used.
24427 for (const auto &I : SrcOpMap)
24428 if (!I.second.isAllOnes())
24429 return false;
24430 }
24431
24432 return true;
24433}
24434
24435// Helper function for comparing all bits of two vectors.
24436static SDValue LowerVectorAllEqual(const SDLoc &DL, SDValue LHS, SDValue RHS,
24437 ISD::CondCode CC, const APInt &OriginalMask,
24438 const X86Subtarget &Subtarget,
24439 SelectionDAG &DAG, X86::CondCode &X86CC) {
24440 EVT VT = LHS.getValueType();
24441 unsigned ScalarSize = VT.getScalarSizeInBits();
24442 if (OriginalMask.getBitWidth() != ScalarSize) {
24443    assert(ScalarSize == 1 && "Element Mask vs Vector bitwidth mismatch");
24444 return SDValue();
24445 }
24446
24447  // Quit if not convertible to a legal scalar or 128/256-bit vector.
24448 if (!llvm::has_single_bit<uint32_t>(VT.getSizeInBits()))
24449 return SDValue();
24450
24451 // FCMP may use ISD::SETNE when nnan - early out if we manage to get here.
24452 if (VT.isFloatingPoint())
24453 return SDValue();
24454
24455  assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
24456 X86CC = (CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE);
24457
24458 APInt Mask = OriginalMask;
24459
24460 auto MaskBits = [&](SDValue Src) {
24461 if (Mask.isAllOnes())
24462 return Src;
24463 EVT SrcVT = Src.getValueType();
24464 SDValue MaskValue = DAG.getConstant(Mask, DL, SrcVT);
24465 return DAG.getNode(ISD::AND, DL, SrcVT, Src, MaskValue);
24466 };
24467
24468 // For sub-128-bit vector, cast to (legal) integer and compare with zero.
24469 if (VT.getSizeInBits() < 128) {
24470 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
24471 if (!DAG.getTargetLoweringInfo().isTypeLegal(IntVT)) {
24472 if (IntVT != MVT::i64)
24473 return SDValue();
24474 auto SplitLHS = DAG.SplitScalar(DAG.getBitcast(IntVT, MaskBits(LHS)), DL,
24475 MVT::i32, MVT::i32);
24476 auto SplitRHS = DAG.SplitScalar(DAG.getBitcast(IntVT, MaskBits(RHS)), DL,
24477 MVT::i32, MVT::i32);
24478 SDValue Lo =
24479 DAG.getNode(ISD::XOR, DL, MVT::i32, SplitLHS.first, SplitRHS.first);
24480 SDValue Hi =
24481 DAG.getNode(ISD::XOR, DL, MVT::i32, SplitLHS.second, SplitRHS.second);
24482 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
24483 DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi),
24484 DAG.getConstant(0, DL, MVT::i32));
24485 }
24486 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
24487 DAG.getBitcast(IntVT, MaskBits(LHS)),
24488 DAG.getBitcast(IntVT, MaskBits(RHS)));
24489 }
24490
24491 // Without PTEST, a masked v2i64 or-reduction is not faster than
24492 // scalarization.
24493 bool UseKORTEST = Subtarget.useAVX512Regs();
24494 bool UsePTEST = Subtarget.hasSSE41();
24495 if (!UsePTEST && !Mask.isAllOnes() && ScalarSize > 32)
24496 return SDValue();
24497
24498 // Split down to 128/256/512-bit vector.
24499 unsigned TestSize = UseKORTEST ? 512 : (Subtarget.hasAVX() ? 256 : 128);
24500
24501 // If the input vector has vector elements wider than the target test size,
24502 // then cast to <X x i64> so it will safely split.
24503 if (ScalarSize > TestSize) {
24504 if (!Mask.isAllOnes())
24505 return SDValue();
24506 VT = EVT::getVectorVT(*DAG.getContext(), MVT::i64, VT.getSizeInBits() / 64);
24507 LHS = DAG.getBitcast(VT, LHS);
24508 RHS = DAG.getBitcast(VT, RHS);
24509 Mask = APInt::getAllOnes(64);
24510 }
24511
24512 if (VT.getSizeInBits() > TestSize) {
24513 KnownBits KnownRHS = DAG.computeKnownBits(RHS);
24514 if (KnownRHS.isConstant() && KnownRHS.getConstant() == Mask) {
24515 // If ICMP(AND(LHS,MASK),MASK) - reduce using AND splits.
24516 while (VT.getSizeInBits() > TestSize) {
24517 auto Split = DAG.SplitVector(LHS, DL);
24518 VT = Split.first.getValueType();
24519 LHS = DAG.getNode(ISD::AND, DL, VT, Split.first, Split.second);
24520 }
24521 RHS = DAG.getAllOnesConstant(DL, VT);
24522 } else if (!UsePTEST && !KnownRHS.isZero()) {
24523 // MOVMSK Special Case:
24524 // ALLOF(CMPEQ(X,Y)) -> AND(CMPEQ(X[0],Y[0]),CMPEQ(X[1],Y[1]),....)
24525 MVT SVT = ScalarSize >= 32 ? MVT::i32 : MVT::i8;
24526 VT = MVT::getVectorVT(SVT, VT.getSizeInBits() / SVT.getSizeInBits());
24527 LHS = DAG.getBitcast(VT, MaskBits(LHS));
24528 RHS = DAG.getBitcast(VT, MaskBits(RHS));
24529 EVT BoolVT = VT.changeVectorElementType(MVT::i1);
24530 SDValue V = DAG.getSetCC(DL, BoolVT, LHS, RHS, ISD::SETEQ);
24531 V = DAG.getSExtOrTrunc(V, DL, VT);
24532 while (VT.getSizeInBits() > TestSize) {
24533 auto Split = DAG.SplitVector(V, DL);
24534 VT = Split.first.getValueType();
24535 V = DAG.getNode(ISD::AND, DL, VT, Split.first, Split.second);
24536 }
24537 V = DAG.getNOT(DL, V, VT);
24538 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
24539 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
24540 DAG.getConstant(0, DL, MVT::i32));
24541 } else {
24542 // Convert to a ICMP_EQ(XOR(LHS,RHS),0) pattern.
24543 SDValue V = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
24544 while (VT.getSizeInBits() > TestSize) {
24545 auto Split = DAG.SplitVector(V, DL);
24546 VT = Split.first.getValueType();
24547 V = DAG.getNode(ISD::OR, DL, VT, Split.first, Split.second);
24548 }
24549 LHS = V;
24550 RHS = DAG.getConstant(0, DL, VT);
24551 }
24552 }
24553
24554 if (UseKORTEST && VT.is512BitVector()) {
24555 MVT TestVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
24556 MVT BoolVT = TestVT.changeVectorElementType(MVT::i1);
24557 LHS = DAG.getBitcast(TestVT, MaskBits(LHS));
24558 RHS = DAG.getBitcast(TestVT, MaskBits(RHS));
24559 SDValue V = DAG.getSetCC(DL, BoolVT, LHS, RHS, ISD::SETNE);
24560 return DAG.getNode(X86ISD::KORTEST, DL, MVT::i32, V, V);
24561 }
24562
24563 if (UsePTEST) {
24564 MVT TestVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
24565 LHS = DAG.getBitcast(TestVT, MaskBits(LHS));
24566 RHS = DAG.getBitcast(TestVT, MaskBits(RHS));
24567 SDValue V = DAG.getNode(ISD::XOR, DL, TestVT, LHS, RHS);
24568 return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, V, V);
24569 }
24570
24571  assert(VT.getSizeInBits() == 128 && "Failure to split to 128-bits");
24572 MVT MaskVT = ScalarSize >= 32 ? MVT::v4i32 : MVT::v16i8;
24573 LHS = DAG.getBitcast(MaskVT, MaskBits(LHS));
24574 RHS = DAG.getBitcast(MaskVT, MaskBits(RHS));
24575 SDValue V = DAG.getNode(X86ISD::PCMPEQ, DL, MaskVT, LHS, RHS);
24576 V = DAG.getNOT(DL, V, MaskVT);
24577 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
24578 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
24579 DAG.getConstant(0, DL, MVT::i32));
24580}
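// Illustrative sketch of the pre-SSE4.1 tail above (standalone, assumes SSE2;
// helper name is made up). It is the equivalent of the PCMPEQ/NOT/MOVMSK/CMP-0
// sequence: all bytes are equal iff the movemask of the equality compare is
// 0xFFFF.
#include <immintrin.h>
static bool all_bytes_equal_ref(__m128i x, __m128i y) {
  __m128i eq = _mm_cmpeq_epi8(x, y);        // PCMPEQB: 0xFF where bytes match
  return _mm_movemask_epi8(eq) == 0xFFFF;   // PMOVMSKB: all 16 lanes matched
}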
24581
24582// Check whether an AND/OR'd reduction tree is PTEST-able, or if we can fall back
24583// to CMP(MOVMSK(PCMPEQB(X,Y))).
24584static SDValue MatchVectorAllEqualTest(SDValue LHS, SDValue RHS,
24585 ISD::CondCode CC, const SDLoc &DL,
24586 const X86Subtarget &Subtarget,
24587 SelectionDAG &DAG,
24588 X86::CondCode &X86CC) {
24589  assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
24590
24591 bool CmpNull = isNullConstant(RHS);
24592 bool CmpAllOnes = isAllOnesConstant(RHS);
24593 if (!CmpNull && !CmpAllOnes)
24594 return SDValue();
24595
24596 SDValue Op = LHS;
24597 if (!Subtarget.hasSSE2() || !Op->hasOneUse())
24598 return SDValue();
24599
24600 // Check whether we're masking/truncating an OR-reduction result, in which
24601 // case track the masked bits.
24602 // TODO: Add CmpAllOnes support.
24603 APInt Mask = APInt::getAllOnes(Op.getScalarValueSizeInBits());
24604 if (CmpNull) {
24605 switch (Op.getOpcode()) {
24606 case ISD::TRUNCATE: {
24607 SDValue Src = Op.getOperand(0);
24608 Mask = APInt::getLowBitsSet(Src.getScalarValueSizeInBits(),
24609 Op.getScalarValueSizeInBits());
24610 Op = Src;
24611 break;
24612 }
24613 case ISD::AND: {
24614 if (auto *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
24615 Mask = Cst->getAPIntValue();
24616 Op = Op.getOperand(0);
24617 }
24618 break;
24619 }
24620 }
24621 }
24622
24623 ISD::NodeType LogicOp = CmpNull ? ISD::OR : ISD::AND;
24624
24625 // Match icmp(or(extract(X,0),extract(X,1)),0) anyof reduction patterns.
24626 // Match icmp(and(extract(X,0),extract(X,1)),-1) allof reduction patterns.
24627 SmallVector<SDValue, 8> VecIns;
24628 if (Op.getOpcode() == LogicOp && matchScalarReduction(Op, LogicOp, VecIns)) {
24629 EVT VT = VecIns[0].getValueType();
24630    assert(llvm::all_of(VecIns,
24631                        [VT](SDValue V) { return VT == V.getValueType(); }) &&
24632           "Reduction source vector mismatch");
24633
24634 // Quit if not splittable to scalar/128/256/512-bit vector.
24635 if (!llvm::has_single_bit<uint32_t>(VT.getSizeInBits()))
24636 return SDValue();
24637
24638 // If more than one full vector is evaluated, AND/OR them first before
24639 // PTEST.
24640 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1;
24641 Slot += 2, e += 1) {
24642 // Each iteration will AND/OR 2 nodes and append the result until there is
24643 // only 1 node left, i.e. the final value of all vectors.
24644 SDValue LHS = VecIns[Slot];
24645 SDValue RHS = VecIns[Slot + 1];
24646 VecIns.push_back(DAG.getNode(LogicOp, DL, VT, LHS, RHS));
24647 }
24648
24649 return LowerVectorAllEqual(DL, VecIns.back(),
24650 CmpNull ? DAG.getConstant(0, DL, VT)
24651 : DAG.getAllOnesConstant(DL, VT),
24652 CC, Mask, Subtarget, DAG, X86CC);
24653 }
24654
24655 // Match icmp(reduce_or(X),0) anyof reduction patterns.
24656 // Match icmp(reduce_and(X),-1) allof reduction patterns.
24657 if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
24658 ISD::NodeType BinOp;
24659 if (SDValue Match =
24660 DAG.matchBinOpReduction(Op.getNode(), BinOp, {LogicOp})) {
24661 EVT MatchVT = Match.getValueType();
24662 return LowerVectorAllEqual(DL, Match,
24663 CmpNull ? DAG.getConstant(0, DL, MatchVT)
24664 : DAG.getAllOnesConstant(DL, MatchVT),
24665 CC, Mask, Subtarget, DAG, X86CC);
24666 }
24667 }
24668
24669 if (Mask.isAllOnes()) {
24670 assert(!Op.getValueType().isVector() &&
24671 "Illegal vector type for reduction pattern");
24672 SDValue Src = peekThroughBitcasts(Op);
24673 if (Src.getValueType().isFixedLengthVector() &&
24674 Src.getValueType().getScalarType() == MVT::i1) {
24675 // Match icmp(bitcast(icmp_ne(X,Y)),0) reduction patterns.
24676 // Match icmp(bitcast(icmp_eq(X,Y)),-1) reduction patterns.
24677 if (Src.getOpcode() == ISD::SETCC) {
24678 SDValue LHS = Src.getOperand(0);
24679 SDValue RHS = Src.getOperand(1);
24680 EVT LHSVT = LHS.getValueType();
24681 ISD::CondCode SrcCC = cast<CondCodeSDNode>(Src.getOperand(2))->get();
24682 if (SrcCC == (CmpNull ? ISD::SETNE : ISD::SETEQ) &&
24683 llvm::has_single_bit<uint32_t>(LHSVT.getSizeInBits())) {
24684 APInt SrcMask = APInt::getAllOnes(LHSVT.getScalarSizeInBits());
24685 return LowerVectorAllEqual(DL, LHS, RHS, CC, SrcMask, Subtarget, DAG,
24686 X86CC);
24687 }
24688 }
24689 // Match icmp(bitcast(vXi1 trunc(Y)),0) reduction patterns.
24690 // Match icmp(bitcast(vXi1 trunc(Y)),-1) reduction patterns.
24691 // Peek through truncation, mask the LSB and compare against zero/LSB.
24692 if (Src.getOpcode() == ISD::TRUNCATE) {
24693 SDValue Inner = Src.getOperand(0);
24694 EVT InnerVT = Inner.getValueType();
24695 if (llvm::has_single_bit<uint32_t>(InnerVT.getSizeInBits())) {
24696 unsigned BW = InnerVT.getScalarSizeInBits();
24697 APInt SrcMask = APInt(BW, 1);
24698 APInt Cmp = CmpNull ? APInt::getZero(BW) : SrcMask;
24699 return LowerVectorAllEqual(DL, Inner,
24700 DAG.getConstant(Cmp, DL, InnerVT), CC,
24701 SrcMask, Subtarget, DAG, X86CC);
24702 }
24703 }
24704 }
24705 }
24706
24707 return SDValue();
24708}
24709
24710/// Return true if \c Op has a use that doesn't just read flags.
24711static bool hasNonFlagsUse(SDValue Op) {
24712 for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
24713 ++UI) {
24714 SDNode *User = *UI;
24715 unsigned UOpNo = UI.getOperandNo();
24716 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
24717 // Look past the truncate.
24718 UOpNo = User->use_begin().getOperandNo();
24719 User = *User->use_begin();
24720 }
24721
24722 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
24723 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
24724 return true;
24725 }
24726 return false;
24727}
24728
24729// Transform to an x86-specific ALU node with flags if there is a chance of
24730// using an RMW op or only the flags are used. Otherwise, leave
24731// the node alone and emit a 'cmp' or 'test' instruction.
24732static bool isProfitableToUseFlagOp(SDValue Op) {
24733 for (SDNode *U : Op->uses())
24734 if (U->getOpcode() != ISD::CopyToReg &&
24735 U->getOpcode() != ISD::SETCC &&
24736 U->getOpcode() != ISD::STORE)
24737 return false;
24738
24739 return true;
24740}
24741
24742/// Emit nodes that will be selected as "test Op0,Op0", or something
24743/// equivalent.
24744static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
24745 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
24746 // CF and OF aren't always set the way we want. Determine which
24747 // of these we need.
24748 bool NeedCF = false;
24749 bool NeedOF = false;
24750 switch (X86CC) {
24751 default: break;
24752 case X86::COND_A: case X86::COND_AE:
24753 case X86::COND_B: case X86::COND_BE:
24754 NeedCF = true;
24755 break;
24756 case X86::COND_G: case X86::COND_GE:
24757 case X86::COND_L: case X86::COND_LE:
24758 case X86::COND_O: case X86::COND_NO: {
24759 // Check whether we really need to set the
24760 // Overflow flag. If NoSignedWrap is present,
24761 // it is not actually needed.
24762 switch (Op->getOpcode()) {
24763 case ISD::ADD:
24764 case ISD::SUB:
24765 case ISD::MUL:
24766 case ISD::SHL:
24767 if (Op.getNode()->getFlags().hasNoSignedWrap())
24768 break;
24769 [[fallthrough]];
24770 default:
24771 NeedOF = true;
24772 break;
24773 }
24774 break;
24775 }
24776 }
24777 // See if we can use the EFLAGS value from the operand instead of
24778 // doing a separate TEST. TEST always sets OF and CF to 0, so unless
24779 // we prove that the arithmetic won't overflow, we can't use OF or CF.
24780 if (Op.getResNo() != 0 || NeedOF || NeedCF) {
24781 // Emit a CMP with 0, which is the TEST pattern.
24782 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
24783 DAG.getConstant(0, dl, Op.getValueType()));
24784 }
24785 unsigned Opcode = 0;
24786 unsigned NumOperands = 0;
24787
24788 SDValue ArithOp = Op;
24789
24790 // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
24791 // which may be the result of a CAST. We use the variable 'Op', which is the
24792 // non-casted variable when we check for possible users.
24793 switch (ArithOp.getOpcode()) {
24794 case ISD::AND:
24795 // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
24796 // because a TEST instruction will be better.
24797 if (!hasNonFlagsUse(Op))
24798 break;
24799
24800 [[fallthrough]];
24801 case ISD::ADD:
24802 case ISD::SUB:
24803 case ISD::OR:
24804 case ISD::XOR:
24805 if (!isProfitableToUseFlagOp(Op))
24806 break;
24807
24808 // Otherwise use a regular EFLAGS-setting instruction.
24809 switch (ArithOp.getOpcode()) {
24810 default: llvm_unreachable("unexpected operator!")::llvm::llvm_unreachable_internal("unexpected operator!", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 24810)
;
24811 case ISD::ADD: Opcode = X86ISD::ADD; break;
24812 case ISD::SUB: Opcode = X86ISD::SUB; break;
24813 case ISD::XOR: Opcode = X86ISD::XOR; break;
24814 case ISD::AND: Opcode = X86ISD::AND; break;
24815 case ISD::OR: Opcode = X86ISD::OR; break;
24816 }
24817
24818 NumOperands = 2;
24819 break;
24820 case X86ISD::ADD:
24821 case X86ISD::SUB:
24822 case X86ISD::OR:
24823 case X86ISD::XOR:
24824 case X86ISD::AND:
24825 return SDValue(Op.getNode(), 1);
24826 case ISD::SSUBO:
24827 case ISD::USUBO: {
24828 // USUBO/SSUBO will become an X86ISD::SUB and we can use its Z flag.
24829 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
24830 return DAG.getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0),
24831 Op->getOperand(1)).getValue(1);
24832 }
24833 default:
24834 break;
24835 }
24836
24837 if (Opcode == 0) {
24838 // Emit a CMP with 0, which is the TEST pattern.
24839 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
24840 DAG.getConstant(0, dl, Op.getValueType()));
24841 }
24842 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
24843 SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
24844
24845 SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
24846 DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New);
24847 return SDValue(New.getNode(), 1);
24848}
24849
24850/// Emit nodes that will be selected as "cmp Op0,Op1", or something
24851/// equivalent.
24852static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
24853 const SDLoc &dl, SelectionDAG &DAG,
24854 const X86Subtarget &Subtarget) {
24855 if (isNullConstant(Op1))
24856 return EmitTest(Op0, X86CC, dl, DAG, Subtarget);
24857
24858 EVT CmpVT = Op0.getValueType();
24859
24860 assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 ||
24861 CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!");
24862
24863 // Only promote the compare up to I32 if it is a 16 bit operation
24864 // with an immediate. 16 bit immediates are to be avoided.
24865 if (CmpVT == MVT::i16 && !Subtarget.isAtom() &&
24866 !DAG.getMachineFunction().getFunction().hasMinSize()) {
24867 ConstantSDNode *COp0 = dyn_cast<ConstantSDNode>(Op0);
24868 ConstantSDNode *COp1 = dyn_cast<ConstantSDNode>(Op1);
24869 // Don't do this if the immediate can fit in 8-bits.
24870 if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) ||
24871 (COp1 && !COp1->getAPIntValue().isSignedIntN(8))) {
24872 unsigned ExtendOp =
24873 isX86CCSigned(X86CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
24874 if (X86CC == X86::COND_E || X86CC == X86::COND_NE) {
24875 // For equality comparisons try to use SIGN_EXTEND if the input was
24876 // truncate from something with enough sign bits.
24877 if (Op0.getOpcode() == ISD::TRUNCATE) {
24878 if (DAG.ComputeMaxSignificantBits(Op0.getOperand(0)) <= 16)
24879 ExtendOp = ISD::SIGN_EXTEND;
24880 } else if (Op1.getOpcode() == ISD::TRUNCATE) {
24881 if (DAG.ComputeMaxSignificantBits(Op1.getOperand(0)) <= 16)
24882 ExtendOp = ISD::SIGN_EXTEND;
24883 }
24884 }
24885
24886 CmpVT = MVT::i32;
24887 Op0 = DAG.getNode(ExtendOp, dl, CmpVT, Op0);
24888 Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1);
24889 }
24890 }
24891
24892 // Try to shrink i64 compares if the input has enough zero bits.
24893 // FIXME: Do this for non-constant compares for constant on LHS?
24894 if (CmpVT == MVT::i64 && isa<ConstantSDNode>(Op1) && !isX86CCSigned(X86CC) &&
24895 Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
24896 cast<ConstantSDNode>(Op1)->getAPIntValue().getActiveBits() <= 32 &&
24897 DAG.MaskedValueIsZero(Op0, APInt::getHighBitsSet(64, 32))) {
24898 CmpVT = MVT::i32;
24899 Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
24900 Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
24901 }
24902
24903 // 0-x == y --> x+y == 0
24904 // 0-x != y --> x+y != 0
24905 if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op0.getOperand(0)) &&
24906 Op0.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
24907 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
24908 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(1), Op1);
24909 return Add.getValue(1);
24910 }
24911
24912 // x == 0-y --> x+y == 0
24913 // x != 0-y --> x+y != 0
24914 if (Op1.getOpcode() == ISD::SUB && isNullConstant(Op1.getOperand(0)) &&
24915 Op1.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
24916 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
24917 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0, Op1.getOperand(1));
24918 return Add.getValue(1);
24919 }
24920
24921 // Use SUB instead of CMP to enable CSE between SUB and CMP.
24922 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
24923 SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
24924 return Sub.getValue(1);
24925}
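// Minimal standalone sketch (illustrative; the helper name is made up) of the
// wrapping-arithmetic identity behind the two transforms above: for equality or
// inequality compares, (0 - X) == Y tests the same thing as (X + Y) == 0.
#include <cstdint>
constexpr bool NegCmpMatchesAdd(uint32_t X, uint32_t Y) {
  return ((0u - X) == Y) == ((X + Y) == 0u); // both sides wrap mod 2^32
}
static_assert(NegCmpMatchesAdd(0u, 0u) && NegCmpMatchesAdd(3u, 0xFFFFFFFDu) &&
              NegCmpMatchesAdd(0x80000000u, 0x80000000u) && NegCmpMatchesAdd(5u, 7u),
              "0-x == y is the same test as x+y == 0 under wraparound");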
24926
24927/// Check if replacement of SQRT with RSQRT should be disabled.
24928bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
24929 EVT VT = Op.getValueType();
24930
24931 // We don't need to replace SQRT with RSQRT for half type.
24932 if (VT.getScalarType() == MVT::f16)
24933 return true;
24934
24935 // We never want to use both SQRT and RSQRT instructions for the same input.
24936 if (DAG.doesNodeExist(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
24937 return false;
24938
24939 if (VT.isVector())
24940 return Subtarget.hasFastVectorFSQRT();
24941 return Subtarget.hasFastScalarFSQRT();
24942}
24943
24944/// The minimum architected relative accuracy is 2^-12. We need one
24945/// Newton-Raphson step to have a good float result (24 bits of precision).
24946SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
24947 SelectionDAG &DAG, int Enabled,
24948 int &RefinementSteps,
24949 bool &UseOneConstNR,
24950 bool Reciprocal) const {
24951 SDLoc DL(Op);
24952 EVT VT = Op.getValueType();
24953
24954 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
24955 // It is likely not profitable to do this for f64 because a double-precision
24956 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
24957 // instructions: convert to single, rsqrtss, convert back to double, refine
24958 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
24959 // along with FMA, this could be a throughput win.
24960 // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
24961 // after legalize types.
24962 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
24963 (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
24964 (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
24965 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
24966 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
24967 if (RefinementSteps == ReciprocalEstimate::Unspecified)
24968 RefinementSteps = 1;
24969
24970 UseOneConstNR = false;
24971 // There is no FRSQRT for 512-bits, but there is RSQRT14.
24972 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT;
24973 SDValue Estimate = DAG.getNode(Opcode, DL, VT, Op);
24974 if (RefinementSteps == 0 && !Reciprocal)
24975 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Op, Estimate);
24976 return Estimate;
24977 }
24978
24979 if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
24980 Subtarget.hasFP16()) {
24981 assert(Reciprocal && "Don't replace SQRT with RSQRT for half type");
24982 if (RefinementSteps == ReciprocalEstimate::Unspecified)
24983 RefinementSteps = 0;
24984
24985 if (VT == MVT::f16) {
24986 SDValue Zero = DAG.getIntPtrConstant(0, DL);
24987 SDValue Undef = DAG.getUNDEF(MVT::v8f16);
24988 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
24989 Op = DAG.getNode(X86ISD::RSQRT14S, DL, MVT::v8f16, Undef, Op);
24990 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
24991 }
24992
24993 return DAG.getNode(X86ISD::RSQRT14, DL, VT, Op);
24994 }
24995 return SDValue();
24996}
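// Scalar intrinsics sketch (illustrative only) of the single Newton-Raphson
// refinement step that RefinementSteps == 1 requests above: starting from the
// ~12-bit RSQRTSS estimate E, E' = E * (1.5 - 0.5 * A * E * E) roughly doubles
// the number of correct bits, which is enough for float.
#include <xmmintrin.h>
static inline float RefinedRsqrt(float A) {
  float E = _mm_cvtss_f32(_mm_rsqrt_ss(_mm_set_ss(A))); // RSQRTSS estimate
  return E * (1.5f - 0.5f * A * E * E);                 // one NR step
}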
24997
24998/// The minimum architected relative accuracy is 2^-12. We need one
24999/// Newton-Raphson step to have a good float result (24 bits of precision).
25000SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
25001 int Enabled,
25002 int &RefinementSteps) const {
25003 SDLoc DL(Op);
25004 EVT VT = Op.getValueType();
25005
25006 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
25007 // It is likely not profitable to do this for f64 because a double-precision
25008 // reciprocal estimate with refinement on x86 prior to FMA requires
25009 // 15 instructions: convert to single, rcpss, convert back to double, refine
25010 // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
25011 // along with FMA, this could be a throughput win.
25012
25013 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
25014 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
25015 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
25016 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
25017 // Enable estimate codegen with 1 refinement step for vector division.
25018 // Scalar division estimates are disabled because they break too much
25019 // real-world code. These defaults are intended to match GCC behavior.
25020 if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
25021 return SDValue();
25022
25023 if (RefinementSteps == ReciprocalEstimate::Unspecified)
25024 RefinementSteps = 1;
25025
25026 // There is no FRCP for 512-bits, but there is RCP14.
25027 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP;
25028 return DAG.getNode(Opcode, DL, VT, Op);
25029 }
25030
25031 if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
25032 Subtarget.hasFP16()) {
25033 if (RefinementSteps == ReciprocalEstimate::Unspecified)
25034 RefinementSteps = 0;
25035
25036 if (VT == MVT::f16) {
25037 SDValue Zero = DAG.getIntPtrConstant(0, DL);
25038 SDValue Undef = DAG.getUNDEF(MVT::v8f16);
25039 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
25040 Op = DAG.getNode(X86ISD::RCP14S, DL, MVT::v8f16, Undef, Op);
25041 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
25042 }
25043
25044 return DAG.getNode(X86ISD::RCP14, DL, VT, Op);
25045 }
25046 return SDValue();
25047}
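// Companion sketch (illustrative only) for the reciprocal path: one
// Newton-Raphson step E' = E * (2 - A * E) on top of the ~12-bit RCPSS estimate.
#include <xmmintrin.h>
static inline float RefinedRecip(float A) {
  float E = _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ss(A))); // RCPSS estimate
  return E * (2.0f - A * E);                          // one NR step
}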
25048
25049/// If we have at least two divisions that use the same divisor, convert to
25050/// multiplication by a reciprocal. This may need to be adjusted for a given
25051/// CPU if a division's cost is not at least twice the cost of a multiplication.
25052/// This is because we still need one division to calculate the reciprocal and
25053/// then we need two multiplies by that reciprocal as replacements for the
25054/// original divisions.
25055unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
25056 return 2;
25057}
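// Sketch (illustrative, fast-math style rewrite) of the transform this threshold
// gates: with at least two divisions by the same value, one real division
// produces the reciprocal and each original division becomes a multiply.
static inline void DivideBothBy(float A, float B, float D, float &X, float &Y) {
  float R = 1.0f / D; // the single remaining division
  X = A * R;          // was A / D
  Y = B * R;          // was B / D
}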
25058
25059SDValue
25060X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
25061 SelectionDAG &DAG,
25062 SmallVectorImpl<SDNode *> &Created) const {
25063 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
25064 if (isIntDivCheap(N->getValueType(0), Attr))
25065 return SDValue(N,0); // Lower SDIV as SDIV
25066
25067 assert((Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()) &&
25068 "Unexpected divisor!");
25069
25070 // Only perform this transform if CMOV is supported otherwise the select
25071 // below will become a branch.
25072 if (!Subtarget.canUseCMOV())
25073 return SDValue();
25074
25075 // fold (sdiv X, pow2)
25076 EVT VT = N->getValueType(0);
25077 // FIXME: Support i8.
25078 if (VT != MVT::i16 && VT != MVT::i32 &&
25079 !(Subtarget.is64Bit() && VT == MVT::i64))
25080 return SDValue();
25081
25082 unsigned Lg2 = Divisor.countr_zero();
25083
25084 // If the divisor is 2 or -2, the default expansion is better.
25085 if (Lg2 == 1)
25086 return SDValue();
25087
25088 SDLoc DL(N);
25089 SDValue N0 = N->getOperand(0);
25090 SDValue Zero = DAG.getConstant(0, DL, VT);
25091 APInt Lg2Mask = APInt::getLowBitsSet(VT.getSizeInBits(), Lg2);
25092 SDValue Pow2MinusOne = DAG.getConstant(Lg2Mask, DL, VT);
25093
25094 // If N0 is negative, we need to add (Pow2 - 1) to it before shifting right.
25095 SDValue Cmp = DAG.getSetCC(DL, MVT::i8, N0, Zero, ISD::SETLT);
25096 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
25097 SDValue CMov = DAG.getNode(ISD::SELECT, DL, VT, Cmp, Add, N0);
25098
25099 Created.push_back(Cmp.getNode());
25100 Created.push_back(Add.getNode());
25101 Created.push_back(CMov.getNode());
25102
25103 // Divide by pow2.
25104 SDValue SRA =
25105 DAG.getNode(ISD::SRA, DL, VT, CMov, DAG.getConstant(Lg2, DL, MVT::i8));
25106
25107 // If we're dividing by a positive value, we're done. Otherwise, we must
25108 // negate the result.
25109 if (Divisor.isNonNegative())
25110 return SRA;
25111
25112 Created.push_back(SRA.getNode());
25113 return DAG.getNode(ISD::SUB, DL, VT, Zero, SRA);
25114}
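// Standalone scalar sketch (illustrative; assumes an arithmetic right shift for
// negative values, as on the targets handled here) of the expansion built above:
// for a negative dividend, add (2^Lg2 - 1) before the arithmetic shift, then
// negate the result if the divisor itself was negative.
#include <cstdint>
constexpr int32_t SDivByPow2(int32_t N0, unsigned Lg2, bool DivisorIsNegative) {
  int32_t Biased = N0 < 0 ? N0 + ((int32_t(1) << Lg2) - 1) : N0; // the CMOV above
  int32_t Quot = Biased >> Lg2;                                  // the SRA above
  return DivisorIsNegative ? -Quot : Quot;                       // the final SUB above
}
static_assert(SDivByPow2(-7, 2, false) == -7 / 4, "rounds toward zero like SDIV");
static_assert(SDivByPow2(7, 3, true) == 7 / -8, "negated-power-of-2 case");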
25115
25116/// Result of 'and' is compared against zero. Change to a BT node if possible.
25117/// Returns the BT node and the condition code needed to use it.
25118static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl,
25119 SelectionDAG &DAG, X86::CondCode &X86CC) {
25120 assert(And.getOpcode() == ISD::AND && "Expected AND node!");
25121 SDValue Op0 = And.getOperand(0);
25122 SDValue Op1 = And.getOperand(1);
25123 if (Op0.getOpcode() == ISD::TRUNCATE)
25124 Op0 = Op0.getOperand(0);
25125 if (Op1.getOpcode() == ISD::TRUNCATE)
25126 Op1 = Op1.getOperand(0);
25127
25128 SDValue Src, BitNo;
25129 if (Op1.getOpcode() == ISD::SHL)
25130 std::swap(Op0, Op1);
25131 if (Op0.getOpcode() == ISD::SHL) {
25132 if (isOneConstant(Op0.getOperand(0))) {
25133 // If we looked past a truncate, check that it's only truncating away
25134 // known zeros.
25135 unsigned BitWidth = Op0.getValueSizeInBits();
25136 unsigned AndBitWidth = And.getValueSizeInBits();
25137 if (BitWidth > AndBitWidth) {
25138 KnownBits Known = DAG.computeKnownBits(Op0);
25139 if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
25140 return SDValue();
25141 }
25142 Src = Op1;
25143 BitNo = Op0.getOperand(1);
25144 }
25145 } else if (Op1.getOpcode() == ISD::Constant) {
25146 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
25147 uint64_t AndRHSVal = AndRHS->getZExtValue();
25148 SDValue AndLHS = Op0;
25149
25150 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
25151 Src = AndLHS.getOperand(0);
25152 BitNo = AndLHS.getOperand(1);
25153 } else {
25154 // Use BT if the immediate can't be encoded in a TEST instruction or we
25155 // are optimizing for size and the immediate won't fit in a byte.
25156 bool OptForSize = DAG.shouldOptForSize();
25157 if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
25158 isPowerOf2_64(AndRHSVal)) {
25159 Src = AndLHS;
25160 BitNo = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl,
25161 Src.getValueType());
25162 }
25163 }
25164 }
25165
25166 // No patterns found, give up.
25167 if (!Src.getNode())
25168 return SDValue();
25169
25170 // Remove any bit flip.
25171 if (isBitwiseNot(Src)) {
25172 Src = Src.getOperand(0);
25173 CC = CC == ISD::SETEQ ? ISD::SETNE : ISD::SETEQ;
25174 }
25175
25176 // Attempt to create the X86ISD::BT node.
25177 if (SDValue BT = getBT(Src, BitNo, dl, DAG)) {
25178 X86CC = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
25179 return BT;
25180 }
25181
25182 return SDValue();
25183}
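// Minimal sketch (illustrative helpers, not from this file) of the equivalence
// the BT lowering above relies on: (X & (1 << N)) == 0 and ((X >> N) & 1) == 0
// both just ask whether bit N of X is clear, which is what BT reports in CF.
#include <cstdint>
constexpr bool BitIsSet(uint64_t X, unsigned N) { return (X >> N) & 1; }
static_assert(((0b1010u & (1u << 3)) != 0) == BitIsSet(0b1010u, 3), "shl form");
static_assert((((0b1010u >> 1) & 1u) != 0) == BitIsSet(0b1010u, 1), "srl form");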
25184
25185// Check if pre-AVX condcode can be performed by a single FCMP op.
25186static bool cheapX86FSETCC_SSE(ISD::CondCode SetCCOpcode) {
25187 return (SetCCOpcode != ISD::SETONE) && (SetCCOpcode != ISD::SETUEQ);
25188}
25189
25190/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
25191/// CMPs.
25192static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
25193 SDValue &Op1, bool &IsAlwaysSignaling) {
25194 unsigned SSECC;
25195 bool Swap = false;
25196
25197 // SSE Condition code mapping:
25198 // 0 - EQ
25199 // 1 - LT
25200 // 2 - LE
25201 // 3 - UNORD
25202 // 4 - NEQ
25203 // 5 - NLT
25204 // 6 - NLE
25205 // 7 - ORD
25206 switch (SetCCOpcode) {
25207 default: llvm_unreachable("Unexpected SETCC condition")::llvm::llvm_unreachable_internal("Unexpected SETCC condition"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 25207)
;
25208 case ISD::SETOEQ:
25209 case ISD::SETEQ: SSECC = 0; break;
25210 case ISD::SETOGT:
25211 case ISD::SETGT: Swap = true; [[fallthrough]];
25212 case ISD::SETLT:
25213 case ISD::SETOLT: SSECC = 1; break;
25214 case ISD::SETOGE:
25215 case ISD::SETGE: Swap = true; [[fallthrough]];
25216 case ISD::SETLE:
25217 case ISD::SETOLE: SSECC = 2; break;
25218 case ISD::SETUO: SSECC = 3; break;
25219 case ISD::SETUNE:
25220 case ISD::SETNE: SSECC = 4; break;
25221 case ISD::SETULE: Swap = true; [[fallthrough]];
25222 case ISD::SETUGE: SSECC = 5; break;
25223 case ISD::SETULT: Swap = true; [[fallthrough]];
25224 case ISD::SETUGT: SSECC = 6; break;
25225 case ISD::SETO: SSECC = 7; break;
25226 case ISD::SETUEQ: SSECC = 8; break;
25227 case ISD::SETONE: SSECC = 12; break;
25228 }
25229 if (Swap)
25230 std::swap(Op0, Op1);
25231
25232 switch (SetCCOpcode) {
25233 default:
25234 IsAlwaysSignaling = true;
25235 break;
25236 case ISD::SETEQ:
25237 case ISD::SETOEQ:
25238 case ISD::SETUEQ:
25239 case ISD::SETNE:
25240 case ISD::SETONE:
25241 case ISD::SETUNE:
25242 case ISD::SETO:
25243 case ISD::SETUO:
25244 IsAlwaysSignaling = false;
25245 break;
25246 }
25247
25248 return SSECC;
25249}
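// Intrinsics-level sketch (illustrative) of the Swap handling above: SSE has no
// greater-than predicate, so SETGT is emitted as the LT predicate (SSECC == 1)
// with the operands exchanged.
#include <xmmintrin.h>
static inline __m128 CmpGTps(__m128 A, __m128 B) {
  return _mm_cmplt_ps(B, A); // CMPPS imm8 = 1 (LT) on swapped operands == A > B
}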
25250
25251 /// Break a 256-bit integer VSETCC into two new 128-bit ones and then
25252/// concatenate the result back.
25253static SDValue splitIntVSETCC(EVT VT, SDValue LHS, SDValue RHS,
25254 ISD::CondCode Cond, SelectionDAG &DAG,
25255 const SDLoc &dl) {
25256 assert(VT.isInteger() && VT == LHS.getValueType() &&
25257 VT == RHS.getValueType() && "Unsupported VTs!");
25258
25259 SDValue CC = DAG.getCondCode(Cond);
25260
25261 // Extract the LHS Lo/Hi vectors
25262 SDValue LHS1, LHS2;
25263 std::tie(LHS1, LHS2) = splitVector(LHS, DAG, dl);
25264
25265 // Extract the RHS Lo/Hi vectors
25266 SDValue RHS1, RHS2;
25267 std::tie(RHS1, RHS2) = splitVector(RHS, DAG, dl);
25268
25269 // Issue the operation on the smaller types and concatenate the result back
25270 EVT LoVT, HiVT;
25271 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
25272 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
25273 DAG.getNode(ISD::SETCC, dl, LoVT, LHS1, RHS1, CC),
25274 DAG.getNode(ISD::SETCC, dl, HiVT, LHS2, RHS2, CC));
25275}
25276
25277static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
25278
25279 SDValue Op0 = Op.getOperand(0);
25280 SDValue Op1 = Op.getOperand(1);
25281 SDValue CC = Op.getOperand(2);
25282 MVT VT = Op.getSimpleValueType();
25283 SDLoc dl(Op);
25284
25285 assert(VT.getVectorElementType() == MVT::i1 &&
25286 "Cannot set masked compare for this operation");
25287
25288 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
25289
25290 // Prefer SETGT over SETLT.
25291 if (SetCCOpcode == ISD::SETLT) {
25292 SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode);
25293 std::swap(Op0, Op1);
25294 }
25295
25296 return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
25297}
25298
25299/// Given a buildvector constant, return a new vector constant with each element
25300/// incremented or decremented. If incrementing or decrementing would result in
25301/// unsigned overflow or underflow or this is not a simple vector constant,
25302/// return an empty value.
25303static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc,
25304 bool NSW) {
25305 auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());
25306 if (!BV || !V.getValueType().isSimple())
25307 return SDValue();
25308
25309 MVT VT = V.getSimpleValueType();
25310 MVT EltVT = VT.getVectorElementType();
25311 unsigned NumElts = VT.getVectorNumElements();
25312 SmallVector<SDValue, 8> NewVecC;
25313 SDLoc DL(V);
25314 for (unsigned i = 0; i < NumElts; ++i) {
25315 auto *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
25316 if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT)
25317 return SDValue();
25318
25319 // Avoid overflow/underflow.
25320 const APInt &EltC = Elt->getAPIntValue();
25321 if ((IsInc && EltC.isMaxValue()) || (!IsInc && EltC.isZero()))
25322 return SDValue();
25323 if (NSW && ((IsInc && EltC.isMaxSignedValue()) ||
25324 (!IsInc && EltC.isMinSignedValue())))
25325 return SDValue();
25326
25327 NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 1 : -1), DL, EltVT));
25328 }
25329
25330 return DAG.getBuildVector(VT, DL, NewVecC);
25331}
25332
25333/// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
25334/// Op0 u<= Op1:
25335/// t = psubus Op0, Op1
25336/// pcmpeq t, <0..0>
25337static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
25338 ISD::CondCode Cond, const SDLoc &dl,
25339 const X86Subtarget &Subtarget,
25340 SelectionDAG &DAG) {
25341 if (!Subtarget.hasSSE2())
25342 return SDValue();
25343
25344 MVT VET = VT.getVectorElementType();
25345 if (VET != MVT::i8 && VET != MVT::i16)
25346 return SDValue();
25347
25348 switch (Cond) {
25349 default:
25350 return SDValue();
25351 case ISD::SETULT: {
25352 // If the comparison is against a constant we can turn this into a
25353 // setule. With psubus, setule does not require a swap. This is
25354 // beneficial because the constant in the register is no longer
25355 // clobbered as the destination, so it can be hoisted out of a loop.
25356 // Only do this pre-AVX since vpcmp* is no longer destructive.
25357 if (Subtarget.hasAVX())
25358 return SDValue();
25359 SDValue ULEOp1 =
25360 incDecVectorConstant(Op1, DAG, /*IsInc*/ false, /*NSW*/ false);
25361 if (!ULEOp1)
25362 return SDValue();
25363 Op1 = ULEOp1;
25364 break;
25365 }
25366 case ISD::SETUGT: {
25367 // If the comparison is against a constant, we can turn this into a setuge.
25368 // This is beneficial because materializing a constant 0 for the PCMPEQ is
25369 // probably cheaper than XOR+PCMPGT using 2 different vector constants:
25370 // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0
25371 SDValue UGEOp1 =
25372 incDecVectorConstant(Op1, DAG, /*IsInc*/ true, /*NSW*/ false);
25373 if (!UGEOp1)
25374 return SDValue();
25375 Op1 = Op0;
25376 Op0 = UGEOp1;
25377 break;
25378 }
25379 // Psubus is better than flip-sign because it requires no inversion.
25380 case ISD::SETUGE:
25381 std::swap(Op0, Op1);
25382 break;
25383 case ISD::SETULE:
25384 break;
25385 }
25386
25387 SDValue Result = DAG.getNode(ISD::USUBSAT, dl, VT, Op0, Op1);
25388 return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
25389 DAG.getConstant(0, dl, VT));
25390}
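// Intrinsics sketch (illustrative, unsigned-byte case) of the PSUBUS + PCMPEQ
// idiom built above: the saturating difference A - B is zero exactly when
// A u<= B lane-wise.
#include <emmintrin.h>
static inline __m128i CmpULE_epu8(__m128i A, __m128i B) {
  __m128i Diff = _mm_subs_epu8(A, B);                // PSUBUSB
  return _mm_cmpeq_epi8(Diff, _mm_setzero_si128());  // PCMPEQB against zero
}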
25391
25392static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
25393 SelectionDAG &DAG) {
25394 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
25395 Op.getOpcode() == ISD::STRICT_FSETCCS;
25396 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
25397 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
25398 SDValue CC = Op.getOperand(IsStrict ? 3 : 2);
25399 MVT VT = Op->getSimpleValueType(0);
25400 ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
25401 bool isFP = Op1.getSimpleValueType().isFloatingPoint();
25402 SDLoc dl(Op);
25403
25404 if (isFP) {
25405 MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
25406 assert(EltVT == MVT::f16 || EltVT == MVT::f32 || EltVT == MVT::f64);
25407 if (isSoftFP16(EltVT, Subtarget))
25408 return SDValue();
25409
25410 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
25411 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
25412
25413 // If we have a strict compare with a vXi1 result and the input is 128/256
25414 // bits we can't use a masked compare unless we have VLX. If we use a wider
25415 // compare like we do for non-strict, we might trigger spurious exceptions
25416 // from the upper elements. Instead emit a AVX compare and convert to mask.
25417 unsigned Opc;
25418 if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1 &&
25419 (!IsStrict || Subtarget.hasVLX() ||
25420 Op0.getSimpleValueType().is512BitVector())) {
25421#ifndef NDEBUG
25422 unsigned Num = VT.getVectorNumElements();
25423 assert(Num <= 16 || (Num == 32 && EltVT == MVT::f16));
25424#endif
25425 Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM;
25426 } else {
25427 Opc = IsStrict ? X86ISD::STRICT_CMPP : X86ISD::CMPP;
25428 // The SSE/AVX packed FP comparison nodes are defined with a
25429 // floating-point vector result that matches the operand type. This allows
25430 // them to work with an SSE1 target (integer vector types are not legal).
25431 VT = Op0.getSimpleValueType();
25432 }
25433
25434 SDValue Cmp;
25435 bool IsAlwaysSignaling;
25436 unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1, IsAlwaysSignaling);
25437 if (!Subtarget.hasAVX()) {
25438 // TODO: We could use following steps to handle a quiet compare with
25439 // signaling encodings.
25440 // 1. Get ordered masks from a quiet ISD::SETO
25441 // 2. Use the masks to mask potential unordered elements in operand A, B
25442 // 3. Get the compare results of masked A, B
25443 // 4. Calculating final result using the mask and result from 3
25444 // But currently, we just fall back to scalar operations.
25445 if (IsStrict && IsAlwaysSignaling && !IsSignaling)
25446 return SDValue();
25447
25448 // Insert an extra signaling instruction to raise exception.
25449 if (IsStrict && !IsAlwaysSignaling && IsSignaling) {
25450 SDValue SignalCmp = DAG.getNode(
25451 Opc, dl, {VT, MVT::Other},
25452 {Chain, Op0, Op1, DAG.getTargetConstant(1, dl, MVT::i8)}); // LT_OS
25453 // FIXME: It seems we need to update the flags of all new strict nodes.
25454 // Otherwise, mayRaiseFPException in MI will return false due to
25455 // NoFPExcept = false by default. However, I didn't find it in other
25456 // patches.
25457 SignalCmp->setFlags(Op->getFlags());
25458 Chain = SignalCmp.getValue(1);
25459 }
25460
25461 // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
25462 // emit two comparisons and a logic op to tie them together.
25463 if (!cheapX86FSETCC_SSE(Cond)) {
25464 // LLVM predicate is SETUEQ or SETONE.
25465 unsigned CC0, CC1;
25466 unsigned CombineOpc;
25467 if (Cond == ISD::SETUEQ) {
25468 CC0 = 3; // UNORD
25469 CC1 = 0; // EQ
25470 CombineOpc = X86ISD::FOR;
25471 } else {
25472 assert(Cond == ISD::SETONE);
25473 CC0 = 7; // ORD
25474 CC1 = 4; // NEQ
25475 CombineOpc = X86ISD::FAND;
25476 }
25477
25478 SDValue Cmp0, Cmp1;
25479 if (IsStrict) {
25480 Cmp0 = DAG.getNode(
25481 Opc, dl, {VT, MVT::Other},
25482 {Chain, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8)});
25483 Cmp1 = DAG.getNode(
25484 Opc, dl, {VT, MVT::Other},
25485 {Chain, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8)});
25486 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Cmp0.getValue(1),
25487 Cmp1.getValue(1));
25488 } else {
25489 Cmp0 = DAG.getNode(
25490 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8));
25491 Cmp1 = DAG.getNode(
25492 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8));
25493 }
25494 Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
25495 } else {
25496 if (IsStrict) {
25497 Cmp = DAG.getNode(
25498 Opc, dl, {VT, MVT::Other},
25499 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
25500 Chain = Cmp.getValue(1);
25501 } else
25502 Cmp = DAG.getNode(
25503 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
25504 }
25505 } else {
25506 // Handle all other FP comparisons here.
25507 if (IsStrict) {
25508 // Make a flip on already signaling CCs before setting bit 4 of AVX CC.
25509 SSECC |= (IsAlwaysSignaling ^ IsSignaling) << 4;
25510 Cmp = DAG.getNode(
25511 Opc, dl, {VT, MVT::Other},
25512 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
25513 Chain = Cmp.getValue(1);
25514 } else
25515 Cmp = DAG.getNode(
25516 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
25517 }
25518
25519 if (VT.getFixedSizeInBits() >
25520 Op.getSimpleValueType().getFixedSizeInBits()) {
25521 // We emitted a compare with an XMM/YMM result. Finish converting to a
25522 // mask register using a vptestm.
25523 EVT CastVT = EVT(VT).changeVectorElementTypeToInteger();
25524 Cmp = DAG.getBitcast(CastVT, Cmp);
25525 Cmp = DAG.getSetCC(dl, Op.getSimpleValueType(), Cmp,
25526 DAG.getConstant(0, dl, CastVT), ISD::SETNE);
25527 } else {
25528 // If this is SSE/AVX CMPP, bitcast the result back to integer to match
25529 // the result type of SETCC. The bitcast is expected to be optimized
25530 // away during combining/isel.
25531 Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
25532 }
25533
25534 if (IsStrict)
25535 return DAG.getMergeValues({Cmp, Chain}, dl);
25536
25537 return Cmp;
25538 }
25539
25540 assert(!IsStrict && "Strict SETCC only handles FP operands.");
25541
25542 MVT VTOp0 = Op0.getSimpleValueType();
25543 (void)VTOp0;
25544 assert(VTOp0 == Op1.getSimpleValueType() &&
25545 "Expected operands with same type!");
25546 assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
25547 "Invalid number of packed elements for source and destination!");
25548
25549 // The non-AVX512 code below works under the assumption that source and
25550 // destination types are the same.
25551 assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
25552 "Value types for source and destination must be the same!");
25553
25554 // The result is boolean, but operands are int/float
25555 if (VT.getVectorElementType() == MVT::i1) {
25556 // In AVX-512 architecture setcc returns mask with i1 elements,
25557 // But there is no compare instruction for i8 and i16 elements in KNL.
25558 assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
25559 "Unexpected operand type");
25560 return LowerIntVSETCC_AVX512(Op, DAG);
25561 }
25562
25563 // Lower using XOP integer comparisons.
25564 if (VT.is128BitVector() && Subtarget.hasXOP()) {
25565 // Translate compare code to XOP PCOM compare mode.
25566 unsigned CmpMode = 0;
25567 switch (Cond) {
25568 default: llvm_unreachable("Unexpected SETCC condition");
25569 case ISD::SETULT:
25570 case ISD::SETLT: CmpMode = 0x00; break;
25571 case ISD::SETULE:
25572 case ISD::SETLE: CmpMode = 0x01; break;
25573 case ISD::SETUGT:
25574 case ISD::SETGT: CmpMode = 0x02; break;
25575 case ISD::SETUGE:
25576 case ISD::SETGE: CmpMode = 0x03; break;
25577 case ISD::SETEQ: CmpMode = 0x04; break;
25578 case ISD::SETNE: CmpMode = 0x05; break;
25579 }
25580
25581 // Are we comparing unsigned or signed integers?
25582 unsigned Opc =
25583 ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
25584
25585 return DAG.getNode(Opc, dl, VT, Op0, Op1,
25586 DAG.getTargetConstant(CmpMode, dl, MVT::i8));
25587 }
25588
25589 // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
25590 // Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
25591 if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
25592 SDValue BC0 = peekThroughBitcasts(Op0);
25593 if (BC0.getOpcode() == ISD::AND) {
25594 APInt UndefElts;
25595 SmallVector<APInt, 64> EltBits;
25596 if (getTargetConstantBitsFromNode(BC0.getOperand(1),
25597 VT.getScalarSizeInBits(), UndefElts,
25598 EltBits, false, false)) {
25599 if (llvm::all_of(EltBits, [](APInt &V) { return V.isPowerOf2(); })) {
25600 Cond = ISD::SETEQ;
25601 Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
25602 }
25603 }
25604 }
25605 }
25606
25607 // ICMP_EQ(AND(X,C),C) -> SRA(SHL(X,LOG2(C)),BW-1) iff C is power-of-2.
25608 if (Cond == ISD::SETEQ && Op0.getOpcode() == ISD::AND &&
25609 Op0.getOperand(1) == Op1 && Op0.hasOneUse()) {
25610 ConstantSDNode *C1 = isConstOrConstSplat(Op1);
25611 if (C1 && C1->getAPIntValue().isPowerOf2()) {
25612 unsigned BitWidth = VT.getScalarSizeInBits();
25613 unsigned ShiftAmt = BitWidth - C1->getAPIntValue().logBase2() - 1;
25614
25615 SDValue Result = Op0.getOperand(0);
25616 Result = DAG.getNode(ISD::SHL, dl, VT, Result,
25617 DAG.getConstant(ShiftAmt, dl, VT));
25618 Result = DAG.getNode(ISD::SRA, dl, VT, Result,
25619 DAG.getConstant(BitWidth - 1, dl, VT));
25620 return Result;
25621 }
25622 }
25623
25624 // Break 256-bit integer vector compare into smaller ones.
25625 if (VT.is256BitVector() && !Subtarget.hasInt256())
25626 return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl);
25627
25628 // Break 512-bit integer vector compare into smaller ones.
25629 // TODO: Try harder to use VPCMPx + VPMOV2x?
25630 if (VT.is512BitVector())
25631 return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl);
25632
25633 // If we have a limit constant, try to form PCMPGT (signed cmp) to avoid
25634 // not-of-PCMPEQ:
25635 // X != INT_MIN --> X >s INT_MIN
25636 // X != INT_MAX --> X <s INT_MAX --> INT_MAX >s X
25637 // +X != 0 --> +X >s 0
25638 APInt ConstValue;
25639 if (Cond == ISD::SETNE &&
25640 ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) {
25641 if (ConstValue.isMinSignedValue())
25642 Cond = ISD::SETGT;
25643 else if (ConstValue.isMaxSignedValue())
25644 Cond = ISD::SETLT;
25645 else if (ConstValue.isZero() && DAG.SignBitIsZero(Op0))
25646 Cond = ISD::SETGT;
25647 }
25648
25649 // If both operands are known non-negative, then an unsigned compare is the
25650 // same as a signed compare and there's no need to flip signbits.
25651 // TODO: We could check for more general simplifications here since we're
25652 // computing known bits.
25653 bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
25654 !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
25655
25656 // Special case: Use min/max operations for unsigned compares.
25657 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25658 if (ISD::isUnsignedIntSetCC(Cond) &&
25659 (FlipSigns || ISD::isTrueWhenEqual(Cond)) &&
25660 TLI.isOperationLegal(ISD::UMIN, VT)) {
25661 // If we have a constant operand, increment/decrement it and change the
25662 // condition to avoid an invert.
25663 if (Cond == ISD::SETUGT) {
25664 // X > C --> X >= (C+1) --> X == umax(X, C+1)
25665 if (SDValue UGTOp1 =
25666 incDecVectorConstant(Op1, DAG, /*IsInc*/ true, /*NSW*/ false)) {
25667 Op1 = UGTOp1;
25668 Cond = ISD::SETUGE;
25669 }
25670 }
25671 if (Cond == ISD::SETULT) {
25672 // X < C --> X <= (C-1) --> X == umin(X, C-1)
25673 if (SDValue ULTOp1 =
25674 incDecVectorConstant(Op1, DAG, /*IsInc*/ false, /*NSW*/ false)) {
25675 Op1 = ULTOp1;
25676 Cond = ISD::SETULE;
25677 }
25678 }
25679 bool Invert = false;
25680 unsigned Opc;
25681 switch (Cond) {
25682 default: llvm_unreachable("Unexpected condition code")::llvm::llvm_unreachable_internal("Unexpected condition code"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 25682)
;
25683 case ISD::SETUGT: Invert = true; [[fallthrough]];
25684 case ISD::SETULE: Opc = ISD::UMIN; break;
25685 case ISD::SETULT: Invert = true; [[fallthrough]];
25686 case ISD::SETUGE: Opc = ISD::UMAX; break;
25687 }
25688
25689 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
25690 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
25691
25692 // If the logical-not of the result is required, perform that now.
25693 if (Invert)
25694 Result = DAG.getNOT(dl, Result, VT);
25695
25696 return Result;
25697 }
25698
25699 // Try to use SUBUS and PCMPEQ.
25700 if (FlipSigns)
25701 if (SDValue V =
25702 LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
25703 return V;
25704
25705 // We are handling one of the integer comparisons here. Since SSE only has
25706 // GT and EQ comparisons for integer, swapping operands and multiple
25707 // operations may be required for some comparisons.
25708 unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
25709 : X86ISD::PCMPGT;
25710 bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
25711 Cond == ISD::SETGE || Cond == ISD::SETUGE;
25712 bool Invert = Cond == ISD::SETNE ||
25713 (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
25714
25715 if (Swap)
25716 std::swap(Op0, Op1);
25717
25718 // Check that the operation in question is available (most are plain SSE2,
25719 // but PCMPGTQ and PCMPEQQ have different requirements).
25720 if (VT == MVT::v2i64) {
25721 if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
25722 assert(Subtarget.hasSSE2() && "Don't know how to lower!")(static_cast <bool> (Subtarget.hasSSE2() && "Don't know how to lower!"
) ? void (0) : __assert_fail ("Subtarget.hasSSE2() && \"Don't know how to lower!\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 25722, __extension__
__PRETTY_FUNCTION__))
;
25723
25724 // Special case for sign bit test. We can use a v4i32 PCMPGT and shuffle
25725 // the odd elements over the even elements.
25726 if (!FlipSigns && !Invert && ISD::isBuildVectorAllZeros(Op0.getNode())) {
25727 Op0 = DAG.getConstant(0, dl, MVT::v4i32);
25728 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
25729
25730 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
25731 static const int MaskHi[] = { 1, 1, 3, 3 };
25732 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
25733
25734 return DAG.getBitcast(VT, Result);
25735 }
25736
25737 if (!FlipSigns && !Invert && ISD::isBuildVectorAllOnes(Op1.getNode())) {
25738 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
25739 Op1 = DAG.getConstant(-1, dl, MVT::v4i32);
25740
25741 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
25742 static const int MaskHi[] = { 1, 1, 3, 3 };
25743 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
25744
25745 return DAG.getBitcast(VT, Result);
25746 }
25747
25748 // Since SSE has no unsigned integer comparisons, we need to flip the sign
25749 // bits of the inputs before performing those operations. The lower
25750 // compare is always unsigned.
25751 SDValue SB = DAG.getConstant(FlipSigns ? 0x8000000080000000ULL
25752 : 0x0000000080000000ULL,
25753 dl, MVT::v2i64);
25754
25755 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op0, SB);
25756 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op1, SB);
25757
25758 // Cast everything to the right type.
25759 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
25760 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
25761
25762 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
25763 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
25764 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
25765
25766 // Create masks for only the low parts/high parts of the 64 bit integers.
25767 static const int MaskHi[] = { 1, 1, 3, 3 };
25768 static const int MaskLo[] = { 0, 0, 2, 2 };
25769 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
25770 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
25771 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
25772
25773 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
25774 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
25775
25776 if (Invert)
25777 Result = DAG.getNOT(dl, Result, MVT::v4i32);
25778
25779 return DAG.getBitcast(VT, Result);
25780 }
25781
25782 if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
25783 // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
25784 // pcmpeqd + pshufd + pand.
25785 assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
25786
25787 // First cast everything to the right type.
25788 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
25789 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
25790
25791 // Do the compare.
25792 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
25793
25794 // Make sure the lower and upper halves are both all-ones.
25795 static const int Mask[] = { 1, 0, 3, 2 };
25796 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
25797 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
25798
25799 if (Invert)
25800 Result = DAG.getNOT(dl, Result, MVT::v4i32);
25801
25802 return DAG.getBitcast(VT, Result);
25803 }
25804 }
25805
25806 // Since SSE has no unsigned integer comparisons, we need to flip the sign
25807 // bits of the inputs before performing those operations.
25808 if (FlipSigns) {
25809 MVT EltVT = VT.getVectorElementType();
25810 SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
25811 VT);
25812 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
25813 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
25814 }
25815
25816 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
25817
25818 // If the logical-not of the result is required, perform that now.
25819 if (Invert)
25820 Result = DAG.getNOT(dl, Result, VT);
25821
25822 return Result;
25823}
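// A few standalone scalar sketches (illustrative helpers, assuming two's
// complement int32_t/int64_t) of the identities this lowering leans on.
#include <cstdint>
#include <algorithm>
// PCMPGTQ emulation above: signed 64-bit A > B decided from the dword halves,
// (hi_a > hi_b) | ((hi_a == hi_b) & (lo_a >u lo_b)).
constexpr bool SGT64ViaHalves(int64_t A, int64_t B) {
  uint32_t ALo = uint32_t(uint64_t(A)), BLo = uint32_t(uint64_t(B));
  int32_t AHi = int32_t(uint64_t(A) >> 32), BHi = int32_t(uint64_t(B) >> 32);
  return (AHi > BHi) || ((AHi == BHi) && (ALo > BLo));
}
static_assert(SGT64ViaHalves(1, -1) && !SGT64ViaHalves(-2, 5), "pcmpgtq sketch");
static_assert(SGT64ViaHalves(int64_t(1) << 33, int64_t(1) << 32), "high-half case");
// FlipSigns: an unsigned compare becomes a signed one after XORing the sign bit.
constexpr bool UGTViaSignFlip(uint32_t A, uint32_t B) {
  return int32_t(A ^ 0x80000000u) > int32_t(B ^ 0x80000000u);
}
static_assert(UGTViaSignFlip(0xFFFFFFFFu, 1u) && !UGTViaSignFlip(0u, 1u), "");
// Min/max special case: A u<= B exactly when A == umin(A, B).
constexpr bool ULEViaUMin(uint32_t A, uint32_t B) { return std::min(A, B) == A; }
static_assert(ULEViaUMin(3u, 7u) && ULEViaUMin(7u, 7u) && !ULEViaUMin(9u, 7u), "");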
25824
25825// Try to select this as a KORTEST+SETCC or KTEST+SETCC if possible.
25826static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC,
25827 const SDLoc &dl, SelectionDAG &DAG,
25828 const X86Subtarget &Subtarget,
25829 SDValue &X86CC) {
25830 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
25831
25832 // Must be a bitcast from vXi1.
25833 if (Op0.getOpcode() != ISD::BITCAST)
25834 return SDValue();
25835
25836 Op0 = Op0.getOperand(0);
25837 MVT VT = Op0.getSimpleValueType();
25838 if (!(Subtarget.hasAVX512() && VT == MVT::v16i1) &&
25839 !(Subtarget.hasDQI() && VT == MVT::v8i1) &&
25840 !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
25841 return SDValue();
25842
25843 X86::CondCode X86Cond;
25844 if (isNullConstant(Op1)) {
25845 X86Cond = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
25846 } else if (isAllOnesConstant(Op1)) {
25847 // C flag is set for all ones.
25848 X86Cond = CC == ISD::SETEQ ? X86::COND_B : X86::COND_AE;
25849 } else
25850 return SDValue();
25851
25852 // If the input is an AND, we can combine its operands into the KTEST.
25853 bool KTestable = false;
25854 if (Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1))
25855 KTestable = true;
25856 if (Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1))
25857 KTestable = true;
25858 if (!isNullConstant(Op1))
25859 KTestable = false;
25860 if (KTestable && Op0.getOpcode() == ISD::AND && Op0.hasOneUse()) {
25861 SDValue LHS = Op0.getOperand(0);
25862 SDValue RHS = Op0.getOperand(1);
25863 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25864 return DAG.getNode(X86ISD::KTEST, dl, MVT::i32, LHS, RHS);
25865 }
25866
25867 // If the input is an OR, we can combine its operands into the KORTEST.
25868 SDValue LHS = Op0;
25869 SDValue RHS = Op0;
25870 if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) {
25871 LHS = Op0.getOperand(0);
25872 RHS = Op0.getOperand(1);
25873 }
25874
25875 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25876 return DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
25877}
25878
25879/// Emit flags for the given setcc condition and operands. Also returns the
25880/// corresponding X86 condition code constant in X86CC.
25881SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
25882 ISD::CondCode CC, const SDLoc &dl,
25883 SelectionDAG &DAG,
25884 SDValue &X86CC) const {
25885 // Equality Combines.
25886 if (CC == ISD::SETEQ || CC == ISD::SETNE) {
25887 X86::CondCode X86CondCode;
25888
25889 // Optimize to BT if possible.
25890 // Lower (X & (1 << N)) == 0 to BT(X, N).
25891 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
25892 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
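    // Illustrative example (editorial, not taken from a testcase):
    //   (X & (1 << 5)) == 0  -->  BT X, 5  followed by SETAE,
    // i.e. the tested bit lands in CF and the setcc reads CF directly instead
    // of going through a full TEST/CMP.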
25893 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1)) {
25894 if (SDValue BT = LowerAndToBT(Op0, CC, dl, DAG, X86CondCode)) {
25895 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
25896 return BT;
25897 }
25898 }
25899
25900 // Try to use PTEST/PMOVMSKB for a tree AND/ORs equality compared with -1/0.
25901 if (SDValue CmpZ = MatchVectorAllEqualTest(Op0, Op1, CC, dl, Subtarget, DAG,
25902 X86CondCode)) {
25903 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
25904 return CmpZ;
25905 }
25906
25907 // Try to lower using KORTEST or KTEST.
25908 if (SDValue Test = EmitAVX512Test(Op0, Op1, CC, dl, DAG, Subtarget, X86CC))
25909 return Test;
25910
25911 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms
25912 // of these.
25913 if (isOneConstant(Op1) || isNullConstant(Op1)) {
25914 // If the input is a setcc, then reuse the input setcc or use a new one
25915 // with the inverted condition.
25916 if (Op0.getOpcode() == X86ISD::SETCC) {
25917 bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
25918
25919 X86CC = Op0.getOperand(0);
25920 if (Invert) {
25921 X86CondCode = (X86::CondCode)Op0.getConstantOperandVal(0);
25922 X86CondCode = X86::GetOppositeBranchCondition(X86CondCode);
25923 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
25924 }
25925
25926 return Op0.getOperand(1);
25927 }
25928 }
25929
25930 // Try to use the carry flag from the add in place of a separate CMP for:
25931 // (seteq (add X, -1), -1). Similar for setne.
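    // Editorial example of the pattern: (seteq (add X, -1), -1) holds only
    // when X == 0, and ADD X, -1 sets CF exactly when X != 0, so COND_AE /
    // COND_B read the answer straight from the ADD's flags.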
25932 if (isAllOnesConstant(Op1) && Op0.getOpcode() == ISD::ADD &&
25933 Op0.getOperand(1) == Op1) {
25934 if (isProfitableToUseFlagOp(Op0)) {
25935 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
25936
25937 SDValue New = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(0),
25938 Op0.getOperand(1));
25939 DAG.ReplaceAllUsesOfValueWith(SDValue(Op0.getNode(), 0), New);
25940 X86CondCode = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
25941 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
25942 return SDValue(New.getNode(), 1);
25943 }
25944 }
25945 }
25946
25947 X86::CondCode CondCode =
25948 TranslateX86CC(CC, dl, /*IsFP*/ false, Op0, Op1, DAG);
25949 assert(CondCode != X86::COND_INVALID && "Unexpected condition code!");
25950
25951 SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget);
25952 X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
25953 return EFLAGS;
25954}
25955
25956SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
25957
25958 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
25959 Op.getOpcode() == ISD::STRICT_FSETCCS;
25960 MVT VT = Op->getSimpleValueType(0);
25961
25962 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
25963
25964 assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
25965 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
25966 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
25967 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
25968 SDLoc dl(Op);
25969 ISD::CondCode CC =
25970 cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get();
25971
25972 if (isSoftFP16(Op0.getValueType()))
25973 return SDValue();
25974
25975 // Handle f128 first, since one possible outcome is a normal integer
25976 // comparison which gets handled by emitFlagsForSetcc.
25977 if (Op0.getValueType() == MVT::f128) {
25978 softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1, Chain,
25979 Op.getOpcode() == ISD::STRICT_FSETCCS);
25980
25981 // If softenSetCCOperands returned a scalar, use it.
25982 if (!Op1.getNode()) {
25983 assert(Op0.getValueType() == Op.getValueType() &&
25984        "Unexpected setcc expansion!");
25985 if (IsStrict)
25986 return DAG.getMergeValues({Op0, Chain}, dl);
25987 return Op0;
25988 }
25989 }
25990
25991 if (Op0.getSimpleValueType().isInteger()) {
25992 // Attempt to canonicalize SGT/UGT -> SGE/UGE compares with constant which
25993 // reduces the number of EFLAGS bit reads (the GE conditions don't read ZF),
25994 // this may translate to fewer uops depending on uarch implementation. The
25995 // equivalent for SLE/ULE -> SLT/ULT isn't likely to happen as we already
25996 // canonicalize to that CondCode.
25997 // NOTE: Only do this if incrementing the constant doesn't increase the bit
25998 // encoding size - so it must either already be an i8 or i32 immediate, or it
25999 // shrinks down to that. We don't do this for any i64's to avoid additional
26000 // constant materializations.
26001 // TODO: Can we move this to TranslateX86CC to handle jumps/branches too?
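    // Illustrative example (editorial, assuming i32 operands): (x >s 9)
    // becomes (x >=s 10), i.e. a CMP against 10 followed by a GE-based setcc,
    // which avoids the ZF read the strict greater-than form would require.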
26002 if (auto *Op1C = dyn_cast<ConstantSDNode>(Op1)) {
26003 const APInt &Op1Val = Op1C->getAPIntValue();
26004 if (!Op1Val.isZero()) {
26005 // Ensure the constant+1 doesn't overflow.
26006 if ((CC == ISD::CondCode::SETGT && !Op1Val.isMaxSignedValue()) ||
26007 (CC == ISD::CondCode::SETUGT && !Op1Val.isMaxValue())) {
26008 APInt Op1ValPlusOne = Op1Val + 1;
26009 if (Op1ValPlusOne.isSignedIntN(32) &&
26010 (!Op1Val.isSignedIntN(8) || Op1ValPlusOne.isSignedIntN(8))) {
26011 Op1 = DAG.getConstant(Op1ValPlusOne, dl, Op0.getValueType());
26012 CC = CC == ISD::CondCode::SETGT ? ISD::CondCode::SETGE
26013 : ISD::CondCode::SETUGE;
26014 }
26015 }
26016 }
26017 }
26018
26019 SDValue X86CC;
26020 SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC);
26021 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
26022 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
26023 }
26024
26025 // Handle floating point.
26026 X86::CondCode CondCode = TranslateX86CC(CC, dl, /*IsFP*/ true, Op0, Op1, DAG);
26027 if (CondCode == X86::COND_INVALID)
26028 return SDValue();
26029
26030 SDValue EFLAGS;
26031 if (IsStrict) {
26032 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
26033 EFLAGS =
26034 DAG.getNode(IsSignaling ? X86ISD::STRICT_FCMPS : X86ISD::STRICT_FCMP,
26035 dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1});
26036 Chain = EFLAGS.getValue(1);
26037 } else {
26038 EFLAGS = DAG.getNode(X86ISD::FCMP, dl, MVT::i32, Op0, Op1);
26039 }
26040
26041 SDValue X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
26042 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
26043 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
26044}
26045
26046SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
26047 SDValue LHS = Op.getOperand(0);
26048 SDValue RHS = Op.getOperand(1);
26049 SDValue Carry = Op.getOperand(2);
26050 SDValue Cond = Op.getOperand(3);
26051 SDLoc DL(Op);
26052
26053 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
26054 X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
26055
26056 // Recreate the carry if needed.
26057 EVT CarryVT = Carry.getValueType();
26058 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
26059 Carry, DAG.getAllOnesConstant(DL, CarryVT));
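  // Editorial note: Carry is a 0/1 value in a register here; adding all-ones
  // to it produces a carry-out (CF = 1) exactly when Carry was 1, which
  // re-materializes the flag so the SBB below can consume it.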
26060
26061 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
26062 SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
26063 return getSETCC(CC, Cmp.getValue(1), DL, DAG);
26064}
26065
26066// This function returns three things: the arithmetic computation itself
26067// (Value), an EFLAGS result (Overflow), and a condition code (Cond). The
26068// flag and the condition code define the case in which the arithmetic
26069// computation overflows.
26070static std::pair<SDValue, SDValue>
26071getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG) {
26072 assert(Op.getResNo() == 0 && "Unexpected result number!");
26073 SDValue Value, Overflow;
26074 SDValue LHS = Op.getOperand(0);
26075 SDValue RHS = Op.getOperand(1);
26076 unsigned BaseOp = 0;
26077 SDLoc DL(Op);
26078 switch (Op.getOpcode()) {
26079 default: llvm_unreachable("Unknown ovf instruction!");
26080 case ISD::SADDO:
26081 BaseOp = X86ISD::ADD;
26082 Cond = X86::COND_O;
26083 break;
26084 case ISD::UADDO:
26085 BaseOp = X86ISD::ADD;
26086 Cond = isOneConstant(RHS) ? X86::COND_E : X86::COND_B;
26087 break;
26088 case ISD::SSUBO:
26089 BaseOp = X86ISD::SUB;
26090 Cond = X86::COND_O;
26091 break;
26092 case ISD::USUBO:
26093 BaseOp = X86ISD::SUB;
26094 Cond = X86::COND_B;
26095 break;
26096 case ISD::SMULO:
26097 BaseOp = X86ISD::SMUL;
26098 Cond = X86::COND_O;
26099 break;
26100 case ISD::UMULO:
26101 BaseOp = X86ISD::UMUL;
26102 Cond = X86::COND_O;
26103 break;
26104 }
26105
26106 if (BaseOp) {
26107 // Also sets EFLAGS.
26108 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
26109 Value = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
26110 Overflow = Value.getValue(1);
26111 }
26112
26113 return std::make_pair(Value, Overflow);
26114}
26115
26116static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
26117 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
26118 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
26119 // looks for this combo and may remove the "setcc" instruction if the "setcc"
26120 // has only one use.
26121 SDLoc DL(Op);
26122 X86::CondCode Cond;
26123 SDValue Value, Overflow;
26124 std::tie(Value, Overflow) = getX86XALUOOp(Cond, Op, DAG);
26125
26126 SDValue SetCC = getSETCC(Cond, Overflow, DL, DAG);
26127 assert(Op->getValueType(1) == MVT::i8 && "Unexpected VT!");
26128 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Value, SetCC);
26129}
26130
26131/// Return true if opcode is a X86 logical comparison.
26132static bool isX86LogicalCmp(SDValue Op) {
26133 unsigned Opc = Op.getOpcode();
26134 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
26135 Opc == X86ISD::FCMP)
26136 return true;
26137 if (Op.getResNo() == 1 &&
26138 (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
26139 Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
26140 Opc == X86ISD::OR || Opc == X86ISD::XOR || Opc == X86ISD::AND))
26141 return true;
26142
26143 return false;
26144}
26145
26146static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
26147 if (V.getOpcode() != ISD::TRUNCATE)
26148 return false;
26149
26150 SDValue VOp0 = V.getOperand(0);
26151 unsigned InBits = VOp0.getValueSizeInBits();
26152 unsigned Bits = V.getValueSizeInBits();
26153 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
26154}
26155
26156SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
26157 bool AddTest = true;
26158 SDValue Cond = Op.getOperand(0);
26159 SDValue Op1 = Op.getOperand(1);
26160 SDValue Op2 = Op.getOperand(2);
26161 SDLoc DL(Op);
26162 MVT VT = Op1.getSimpleValueType();
26163 SDValue CC;
26164
26165 if (isSoftFP16(VT)) {
26166 MVT NVT = VT.changeTypeToInteger();
26167 return DAG.getBitcast(VT, DAG.getNode(ISD::SELECT, DL, NVT, Cond,
26168 DAG.getBitcast(NVT, Op1),
26169 DAG.getBitcast(NVT, Op2)));
26170 }
26171
26172 // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
26173 // are available or VBLENDV if AVX is available.
26174 // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
26175 if (Cond.getOpcode() == ISD::SETCC && isScalarFPTypeInSSEReg(VT) &&
26176 VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
26177 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
26178 bool IsAlwaysSignaling;
26179 unsigned SSECC =
26180 translateX86FSETCC(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
26181 CondOp0, CondOp1, IsAlwaysSignaling);
26182
26183 if (Subtarget.hasAVX512()) {
26184 SDValue Cmp =
26185 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, CondOp1,
26186 DAG.getTargetConstant(SSECC, DL, MVT::i8));
26187 assert(!VT.isVector() && "Not a scalar type?");
26188 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
26189 }
26190
26191 if (SSECC < 8 || Subtarget.hasAVX()) {
26192 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
26193 DAG.getTargetConstant(SSECC, DL, MVT::i8));
26194
26195 // If we have AVX, we can use a variable vector select (VBLENDV) instead
26196 // of 3 logic instructions for size savings and potentially speed.
26197 // Unfortunately, there is no scalar form of VBLENDV.
26198
26199 // If either operand is a +0.0 constant, don't try this. We can expect to
26200 // optimize away at least one of the logic instructions later in that
26201 // case, so that sequence would be faster than a variable blend.
26202
26203 // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
26204 // uses XMM0 as the selection register. That may need just as many
26205 // instructions as the AND/ANDN/OR sequence due to register moves, so
26206 // don't bother.
26207 if (Subtarget.hasAVX() && !isNullFPConstant(Op1) &&
26208 !isNullFPConstant(Op2)) {
26209 // Convert to vectors, do a VSELECT, and convert back to scalar.
26210 // All of the conversions should be optimized away.
26211 MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
26212 SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
26213 SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
26214 SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
26215
26216 MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
26217 VCmp = DAG.getBitcast(VCmpVT, VCmp);
26218
26219 SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
26220
26221 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
26222 VSel, DAG.getIntPtrConstant(0, DL));
26223 }
26224 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
26225 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
26226 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
26227 }
26228 }
26229
26230 // AVX512 fallback is to lower selects of scalar floats to masked moves.
26231 if (isScalarFPTypeInSSEReg(VT) && Subtarget.hasAVX512()) {
26232 SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
26233 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
26234 }
26235
26236 if (Cond.getOpcode() == ISD::SETCC &&
26237 !isSoftFP16(Cond.getOperand(0).getSimpleValueType())) {
26238 if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
26239 Cond = NewCond;
26240 // If the condition was updated, it's possible that the operands of the
26241 // select were also updated (for example, EmitTest has a RAUW). Refresh
26242 // the local references to the select operands in case they got stale.
26243 Op1 = Op.getOperand(1);
26244 Op2 = Op.getOperand(2);
26245 }
26246 }
26247
26248 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
26249 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
26250 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
26251 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
26252 // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
26253 // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
26254 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
26255 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
26256 if (Cond.getOpcode() == X86ISD::SETCC &&
26257 Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
26258 isNullConstant(Cond.getOperand(1).getOperand(1))) {
26259 SDValue Cmp = Cond.getOperand(1);
26260 SDValue CmpOp0 = Cmp.getOperand(0);
26261 unsigned CondCode = Cond.getConstantOperandVal(0);
26262
26263 // Special handling for __builtin_ffs(X) - 1 pattern which looks like
26264 // (select (seteq X, 0), -1, (cttz_zero_undef X)). Disable the special
25891 // handling to keep the CMP with 0. This should be removed by
26266 // optimizeCompareInst by using the flags from the BSR/TZCNT used for the
26267 // cttz_zero_undef.
26268 auto MatchFFSMinus1 = [&](SDValue Op1, SDValue Op2) {
26269 return (Op1.getOpcode() == ISD::CTTZ_ZERO_UNDEF && Op1.hasOneUse() &&
26270 Op1.getOperand(0) == CmpOp0 && isAllOnesConstant(Op2));
26271 };
26272 if (Subtarget.canUseCMOV() && (VT == MVT::i32 || VT == MVT::i64) &&
26273 ((CondCode == X86::COND_NE && MatchFFSMinus1(Op1, Op2)) ||
26274 (CondCode == X86::COND_E && MatchFFSMinus1(Op2, Op1)))) {
26275 // Keep Cmp.
26276 } else if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
26277 (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
26278 SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
26279 SDVTList CmpVTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
26280
26281 // 'X - 1' sets the carry flag if X == 0.
26282 // '0 - X' sets the carry flag if X != 0.
26283 // Convert the carry flag to a -1/0 mask with sbb:
26284 // select (X != 0), -1, Y --> 0 - X; or (sbb), Y
26285 // select (X == 0), Y, -1 --> 0 - X; or (sbb), Y
26286 // select (X != 0), Y, -1 --> X - 1; or (sbb), Y
26287 // select (X == 0), -1, Y --> X - 1; or (sbb), Y
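      // Editorial sketch (for, say, i32): the SUB chosen above leaves CF = 1
      // exactly in the cases where the select must produce -1, SETCC_CARRY
      // (an SBB-style node) expands that carry into an all-ones/zero mask, and
      // the trailing OR then yields -1 in those cases and Y otherwise.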
26288 SDValue Sub;
26289 if (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE)) {
26290 SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
26291 Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, Zero, CmpOp0);
26292 } else {
26293 SDValue One = DAG.getConstant(1, DL, CmpOp0.getValueType());
26294 Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, CmpOp0, One);
26295 }
26296 SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
26297 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
26298 Sub.getValue(1));
26299 return DAG.getNode(ISD::OR, DL, VT, SBB, Y);
26300 } else if (!Subtarget.canUseCMOV() && CondCode == X86::COND_E &&
26301 CmpOp0.getOpcode() == ISD::AND &&
26302 isOneConstant(CmpOp0.getOperand(1))) {
26303 SDValue Src1, Src2;
26304 // True if Op2 is an XOR or OR operator and one of its operands
26305 // is equal to Op1.
26306 // ( a , a op b) || ( b , a op b)
26307 auto isOrXorPattern = [&]() {
26308 if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) &&
26309 (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
26310 Src1 =
26311 Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
26312 Src2 = Op1;
26313 return true;
26314 }
26315 return false;
26316 };
26317
26318 if (isOrXorPattern()) {
26319 SDValue Neg;
26320 unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
26321 // We need a mask of all zeros or all ones with the same size as the other
26322 // operands.
26323 if (CmpSz > VT.getSizeInBits())
26324 Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
26325 else if (CmpSz < VT.getSizeInBits())
26326 Neg = DAG.getNode(ISD::AND, DL, VT,
26327 DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
26328 DAG.getConstant(1, DL, VT));
26329 else
26330 Neg = CmpOp0;
26331 SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
26332 Neg); // -(and (x, 0x1))
26333 SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
26334 return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y
26335 }
26336 } else if ((VT == MVT::i32 || VT == MVT::i64) && isNullConstant(Op2) &&
26337 Cmp.getNode()->hasOneUse() && (CmpOp0 == Op1) &&
26338 ((CondCode == X86::COND_S) || // smin(x, 0)
26339 (CondCode == X86::COND_G && hasAndNot(Op1)))) { // smax(x, 0)
26340 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
26341 //
26342 // If the comparison is testing for a positive value, we have to invert
26343 // the sign bit mask, so only do that transform if the target has a
26344 // bitwise 'and not' instruction (the invert is free).
26345 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
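      // Editorial example for i32: SAR x, 31 broadcasts the sign bit, giving
      // all-ones when x < 0 and zero otherwise; AND-ing with x implements
      // smin(x, 0), and the extra NOT (which wants ANDN) flips it into
      // smax(x, 0).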
26346 unsigned ShCt = VT.getSizeInBits() - 1;
26347 SDValue ShiftAmt = DAG.getConstant(ShCt, DL, VT);
26348 SDValue Shift = DAG.getNode(ISD::SRA, DL, VT, Op1, ShiftAmt);
26349 if (CondCode == X86::COND_G)
26350 Shift = DAG.getNOT(DL, Shift, VT);
26351 return DAG.getNode(ISD::AND, DL, VT, Shift, Op1);
26352 }
26353 }
26354
26355 // Look past (and (setcc_carry (cmp ...)), 1).
26356 if (Cond.getOpcode() == ISD::AND &&
26357 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
26358 isOneConstant(Cond.getOperand(1)))
26359 Cond = Cond.getOperand(0);
26360
26361 // If condition flag is set by a X86ISD::CMP, then use it as the condition
26362 // setting operand in place of the X86ISD::SETCC.
26363 unsigned CondOpcode = Cond.getOpcode();
26364 if (CondOpcode == X86ISD::SETCC ||
26365 CondOpcode == X86ISD::SETCC_CARRY) {
26366 CC = Cond.getOperand(0);
26367
26368 SDValue Cmp = Cond.getOperand(1);
26369 bool IllegalFPCMov = false;
26370 if (VT.isFloatingPoint() && !VT.isVector() &&
26371 !isScalarFPTypeInSSEReg(VT) && Subtarget.canUseCMOV()) // FPStack?
26372 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
26373
26374 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
26375 Cmp.getOpcode() == X86ISD::BT) { // FIXME
26376 Cond = Cmp;
26377 AddTest = false;
26378 }
26379 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
26380 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
26381 CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
26382 SDValue Value;
26383 X86::CondCode X86Cond;
26384 std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
26385
26386 CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8);
26387 AddTest = false;
26388 }
26389
26390 if (AddTest) {
26391 // Look past the truncate if the high bits are known zero.
26392 if (isTruncWithZeroHighBitsInput(Cond, DAG))
26393 Cond = Cond.getOperand(0);
26394
26395 // We know the result of AND is compared against zero. Try to match
26396 // it to BT.
26397 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
26398 X86::CondCode X86CondCode;
26399 if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, X86CondCode)) {
26400 CC = DAG.getTargetConstant(X86CondCode, DL, MVT::i8);
26401 Cond = BT;
26402 AddTest = false;
26403 }
26404 }
26405 }
26406
26407 if (AddTest) {
26408 CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
26409 Cond = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget);
26410 }
26411
26412 // a < b ? -1 : 0 -> RES = ~setcc_carry
26413 // a < b ? 0 : -1 -> RES = setcc_carry
26414 // a >= b ? -1 : 0 -> RES = setcc_carry
26415 // a >= b ? 0 : -1 -> RES = ~setcc_carry
26416 if (Cond.getOpcode() == X86ISD::SUB) {
26417 unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
26418
26419 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
26420 (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
26421 (isNullConstant(Op1) || isNullConstant(Op2))) {
26422 SDValue Res =
26423 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
26424 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cond);
26425 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
26426 return DAG.getNOT(DL, Res, Res.getValueType());
26427 return Res;
26428 }
26429 }
26430
26431 // X86 doesn't have an i8 cmov. If both operands are the result of a truncate,
26432 // widen the cmov and push the truncate through. This avoids introducing a new
26433 // branch during isel and doesn't add any extensions.
26434 if (Op.getValueType() == MVT::i8 &&
26435 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
26436 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
26437 if (T1.getValueType() == T2.getValueType() &&
26438 // Exclude CopyFromReg to avoid partial register stalls.
26439 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
26440 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
26441 CC, Cond);
26442 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
26443 }
26444 }
26445
26446 // Or finally, promote i8 cmovs if we have CMOV,
26447 // or i16 cmovs if it won't prevent folding a load.
26448 // FIXME: we should not limit promotion of i8 case to only when the CMOV is
26449 // legal, but EmitLoweredSelect() can not deal with these extensions
26450 // being inserted between two CMOV's. (in i16 case too TBN)
26451 // https://bugs.llvm.org/show_bug.cgi?id=40974
26452 if ((Op.getValueType() == MVT::i8 && Subtarget.canUseCMOV()) ||
26453 (Op.getValueType() == MVT::i16 && !X86::mayFoldLoad(Op1, Subtarget) &&
26454 !X86::mayFoldLoad(Op2, Subtarget))) {
26455 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
26456 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
26457 SDValue Ops[] = { Op2, Op1, CC, Cond };
26458 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops);
26459 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
26460 }
26461
26462 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
26463 // condition is true.
26464 SDValue Ops[] = { Op2, Op1, CC, Cond };
26465 return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops);
26466}
26467
26468static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
26469 const X86Subtarget &Subtarget,
26470 SelectionDAG &DAG) {
26471 MVT VT = Op->getSimpleValueType(0);
26472 SDValue In = Op->getOperand(0);
26473 MVT InVT = In.getSimpleValueType();
26474 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
26475 MVT VTElt = VT.getVectorElementType();
26476 SDLoc dl(Op);
26477
26478 unsigned NumElts = VT.getVectorNumElements();
26479
26480 // Extend VT if the scalar type is i8/i16 and BWI is not supported.
26481 MVT ExtVT = VT;
26482 if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) {
26483 // If v16i32 is to be avoided, we'll need to split and concatenate.
26484 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
26485 return SplitAndExtendv16i1(Op.getOpcode(), VT, In, dl, DAG);
26486
26487 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
26488 }
26489
26490 // Widen to 512-bits if VLX is not supported.
26491 MVT WideVT = ExtVT;
26492 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
26493 NumElts *= 512 / ExtVT.getSizeInBits();
26494 InVT = MVT::getVectorVT(MVT::i1, NumElts);
26495 In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT),
26496 In, DAG.getIntPtrConstant(0, dl));
26497 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
26498 }
26499
26500 SDValue V;
26501 MVT WideEltVT = WideVT.getVectorElementType();
26502 if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
26503 (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
26504 V = DAG.getNode(Op.getOpcode(), dl, WideVT, In);
26505 } else {
26506 SDValue NegOne = DAG.getConstant(-1, dl, WideVT);
26507 SDValue Zero = DAG.getConstant(0, dl, WideVT);
26508 V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
26509 }
26510
26511 // Truncate if we had to extend i16/i8 above.
26512 if (VT != ExtVT) {
26513 WideVT = MVT::getVectorVT(VTElt, NumElts);
26514 V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V);
26515 }
26516
26517 // Extract back to 128/256-bit if we widened.
26518 if (WideVT != VT)
26519 V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
26520 DAG.getIntPtrConstant(0, dl));
26521
26522 return V;
26523}
26524
26525static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
26526 SelectionDAG &DAG) {
26527 SDValue In = Op->getOperand(0);
26528 MVT InVT = In.getSimpleValueType();
26529
26530 if (InVT.getVectorElementType() == MVT::i1)
26531 return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
26532
26533 assert(Subtarget.hasAVX() && "Expected AVX support");
26534 return LowerAVXExtend(Op, DAG, Subtarget);
26535}
26536
26537// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
26538// For sign extend this needs to handle all vector sizes and SSE4.1 and
26539// non-SSE4.1 targets. For zero extend this should only handle inputs of
26540// MVT::v64i8 when BWI is not supported, but AVX512 is.
26541static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
26542 const X86Subtarget &Subtarget,
26543 SelectionDAG &DAG) {
26544 SDValue In = Op->getOperand(0);
26545 MVT VT = Op->getSimpleValueType(0);
26546 MVT InVT = In.getSimpleValueType();
26547
26548 MVT SVT = VT.getVectorElementType();
26549 MVT InSVT = InVT.getVectorElementType();
26550 assert(SVT.getFixedSizeInBits() > InSVT.getFixedSizeInBits());
26551
26552 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
26553 return SDValue();
26554 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
26555 return SDValue();
26556 if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
26557 !(VT.is256BitVector() && Subtarget.hasAVX()) &&
26558 !(VT.is512BitVector() && Subtarget.hasAVX512()))
26559 return SDValue();
26560
26561 SDLoc dl(Op);
26562 unsigned Opc = Op.getOpcode();
26563 unsigned NumElts = VT.getVectorNumElements();
26564
26565 // For 256-bit vectors, we only need the lower (128-bit) half of the input.
26566 // For 512-bit vectors, we need 128-bits or 256-bits.
26567 if (InVT.getSizeInBits() > 128) {
26568 // Input needs to be at least the same number of elements as output, and
26569 // at least 128-bits.
26570 int InSize = InSVT.getSizeInBits() * NumElts;
26571 In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
26572 InVT = In.getSimpleValueType();
26573 }
26574
26575 // SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit results,
26576 // so those cases are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still
26577 // need to be handled here for 256/512-bit results.
26578 if (Subtarget.hasInt256()) {
26579 assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
26580
26581 if (InVT.getVectorNumElements() != NumElts)
26582 return DAG.getNode(Op.getOpcode(), dl, VT, In);
26583
26584 // FIXME: Apparently we create inreg operations that could be regular
26585 // extends.
26586 unsigned ExtOpc =
26587 Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SIGN_EXTEND
26588 : ISD::ZERO_EXTEND;
26589 return DAG.getNode(ExtOpc, dl, VT, In);
26590 }
26591
26592 // pre-AVX2 256-bit extensions need to be split into 128-bit instructions.
26593 if (Subtarget.hasAVX()) {
26594 assert(VT.is256BitVector() && "256-bit vector expected");
26595 MVT HalfVT = VT.getHalfNumVectorElementsVT();
26596 int HalfNumElts = HalfVT.getVectorNumElements();
26597
26598 unsigned NumSrcElts = InVT.getVectorNumElements();
26599 SmallVector<int, 16> HiMask(NumSrcElts, SM_SentinelUndef);
26600 for (int i = 0; i != HalfNumElts; ++i)
26601 HiMask[i] = HalfNumElts + i;
26602
26603 SDValue Lo = DAG.getNode(Opc, dl, HalfVT, In);
26604 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, DAG.getUNDEF(InVT), HiMask);
26605 Hi = DAG.getNode(Opc, dl, HalfVT, Hi);
26606 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
26607 }
26608
26609 // We should only get here for sign extend.
26610 assert(Opc == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!");
26611 assert(VT.is128BitVector() && InVT.is128BitVector() && "Unexpected VTs");
26612
26613 // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
26614 SDValue Curr = In;
26615 SDValue SignExt = Curr;
26616
26617 // As SRAI is only available on i16/i32 types, we expand only up to i32
26618 // and handle i64 separately.
26619 if (InVT != MVT::v4i32) {
26620 MVT DestVT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
26621
26622 unsigned DestWidth = DestVT.getScalarSizeInBits();
26623 unsigned Scale = DestWidth / InSVT.getSizeInBits();
26624
26625 unsigned InNumElts = InVT.getVectorNumElements();
26626 unsigned DestElts = DestVT.getVectorNumElements();
26627
26628 // Build a shuffle mask that takes each input element and places it in the
26629 // MSBs of the new element size.
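    // Editorial example (hedged): for v16i8 -> v4i32 the mask ends up as
    //   { -1,-1,-1,0,  -1,-1,-1,1,  -1,-1,-1,2,  -1,-1,-1,3 }
    // so input byte i lands in the top byte of dword i, and the VSRAI below
    // both moves it into place and replicates its sign across the element.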
26630 SmallVector<int, 16> Mask(InNumElts, SM_SentinelUndef);
26631 for (unsigned i = 0; i != DestElts; ++i)
26632 Mask[i * Scale + (Scale - 1)] = i;
26633
26634 Curr = DAG.getVectorShuffle(InVT, dl, In, In, Mask);
26635 Curr = DAG.getBitcast(DestVT, Curr);
26636
26637 unsigned SignExtShift = DestWidth - InSVT.getSizeInBits();
26638 SignExt = DAG.getNode(X86ISD::VSRAI, dl, DestVT, Curr,
26639 DAG.getTargetConstant(SignExtShift, dl, MVT::i8));
26640 }
26641
26642 if (VT == MVT::v2i64) {
26643 assert(Curr.getValueType() == MVT::v4i32 && "Unexpected input VT");
26644 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
26645 SDValue Sign = DAG.getSetCC(dl, MVT::v4i32, Zero, Curr, ISD::SETGT);
26646 SignExt = DAG.getVectorShuffle(MVT::v4i32, dl, SignExt, Sign, {0, 4, 1, 5});
26647 SignExt = DAG.getBitcast(VT, SignExt);
26648 }
26649
26650 return SignExt;
26651}
26652
26653static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
26654 SelectionDAG &DAG) {
26655 MVT VT = Op->getSimpleValueType(0);
26656 SDValue In = Op->getOperand(0);
26657 MVT InVT = In.getSimpleValueType();
26658 SDLoc dl(Op);
26659
26660 if (InVT.getVectorElementType() == MVT::i1)
26661 return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
26662
26663 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
26664 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
26665        "Expected same number of elements");
26666 assert((VT.getVectorElementType() == MVT::i16 ||
26667         VT.getVectorElementType() == MVT::i32 ||
26668         VT.getVectorElementType() == MVT::i64) &&
26669        "Unexpected element type");
26670 assert((InVT.getVectorElementType() == MVT::i8 ||
26671         InVT.getVectorElementType() == MVT::i16 ||
26672         InVT.getVectorElementType() == MVT::i32) &&
26673        "Unexpected element type");
26674
26675 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
26676 assert(InVT == MVT::v32i8 && "Unexpected VT!");
26677 return splitVectorIntUnary(Op, DAG);
26678 }
26679
26680 if (Subtarget.hasInt256())
26681 return Op;
26682
26683 // Optimize vectors in AVX mode
26684 // Sign extend v8i16 to v8i32 and
26685 // v4i32 to v4i64
26686 //
26687 // Divide input vector into two parts
26688 // for v4i32 the high shuffle mask will be {2, 3, -1, -1}
26689 // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
26690 // concat the vectors to original VT
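  // Editorial sketch for v8i16 -> v8i32 on AVX1 (instruction names are the
  // expected selection, not a guarantee): sign-extend the low half with
  // vpmovsxwd, shuffle {4,5,6,7,...} to bring the high half down, vpmovsxwd
  // that as well, then concatenate the two 128-bit halves back into one YMM.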
26691 MVT HalfVT = VT.getHalfNumVectorElementsVT();
26692 SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In);
26693
26694 unsigned NumElems = InVT.getVectorNumElements();
26695 SmallVector<int,8> ShufMask(NumElems, -1);
26696 for (unsigned i = 0; i != NumElems/2; ++i)
26697 ShufMask[i] = i + NumElems/2;
26698
26699 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
26700 OpHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, OpHi);
26701
26702 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
26703}
26704
26705/// Change a vector store into a pair of half-size vector stores.
26706static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) {
26707 SDValue StoredVal = Store->getValue();
26708 assert((StoredVal.getValueType().is256BitVector() ||
26709         StoredVal.getValueType().is512BitVector()) &&
26710        "Expecting 256/512-bit op");
26711
26712 // Splitting volatile memory ops is not allowed unless the operation was not
26713 // legal to begin with. Assume the input store is legal (this transform is
26714 // only used for targets with AVX). Note: It is possible that we have an
26715 // illegal type like v2i128, and so we could allow splitting a volatile store
26716 // in that case if that is important.
26717 if (!Store->isSimple())
26718 return SDValue();
26719
26720 SDLoc DL(Store);
26721 SDValue Value0, Value1;
26722 std::tie(Value0, Value1) = splitVector(StoredVal, DAG, DL);
26723 unsigned HalfOffset = Value0.getValueType().getStoreSize();
26724 SDValue Ptr0 = Store->getBasePtr();
26725 SDValue Ptr1 =
26726 DAG.getMemBasePlusOffset(Ptr0, TypeSize::Fixed(HalfOffset), DL);
26727 SDValue Ch0 =
26728 DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(),
26729 Store->getOriginalAlign(),
26730 Store->getMemOperand()->getFlags());
26731 SDValue Ch1 = DAG.getStore(Store->getChain(), DL, Value1, Ptr1,
26732 Store->getPointerInfo().getWithOffset(HalfOffset),
26733 Store->getOriginalAlign(),
26734 Store->getMemOperand()->getFlags());
26735 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1);
26736}
26737
26738/// Scalarize a vector store, bitcasting to TargetVT to determine the scalar
26739/// type.
26740static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,
26741 SelectionDAG &DAG) {
26742 SDValue StoredVal = Store->getValue();
26743 assert(StoreVT.is128BitVector() &&
26744        StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op");
26745 StoredVal = DAG.getBitcast(StoreVT, StoredVal);
26746
26747 // Splitting volatile memory ops is not allowed unless the operation was not
26748 // legal to begin with. We are assuming the input op is legal (this transform
26749 // is only used for targets with AVX).
26750 if (!Store->isSimple())
26751 return SDValue();
26752
26753 MVT StoreSVT = StoreVT.getScalarType();
26754 unsigned NumElems = StoreVT.getVectorNumElements();
26755 unsigned ScalarSize = StoreSVT.getStoreSize();
26756
26757 SDLoc DL(Store);
26758 SmallVector<SDValue, 4> Stores;
26759 for (unsigned i = 0; i != NumElems; ++i) {
26760 unsigned Offset = i * ScalarSize;
26761 SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(),
26762 TypeSize::Fixed(Offset), DL);
26763 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal,
26764 DAG.getIntPtrConstant(i, DL));
26765 SDValue Ch = DAG.getStore(Store->getChain(), DL, Scl, Ptr,
26766 Store->getPointerInfo().getWithOffset(Offset),
26767 Store->getOriginalAlign(),
26768 Store->getMemOperand()->getFlags());
26769 Stores.push_back(Ch);
26770 }
26771 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
26772}
26773
26774static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
26775 SelectionDAG &DAG) {
26776 StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
26777 SDLoc dl(St);
26778 SDValue StoredVal = St->getValue();
26779
26780 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
26781 if (StoredVal.getValueType().isVector() &&
26782 StoredVal.getValueType().getVectorElementType() == MVT::i1) {
26783 unsigned NumElts = StoredVal.getValueType().getVectorNumElements();
26784 assert(NumElts <= 8 && "Unexpected VT");
26785 assert(!St->isTruncatingStore() && "Expected non-truncating store");
26786 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
26787        "Expected AVX512F without AVX512DQI");
26788
26789 // We must pad with zeros to ensure we store zeroes to any unused bits.
26790 StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
26791 DAG.getUNDEF(MVT::v16i1), StoredVal,
26792 DAG.getIntPtrConstant(0, dl));
26793 StoredVal = DAG.getBitcast(MVT::i16, StoredVal);
26794 StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal);
26795 // Make sure we store zeros in the extra bits.
26796 if (NumElts < 8)
26797 StoredVal = DAG.getZeroExtendInReg(
26798 StoredVal, dl, EVT::getIntegerVT(*DAG.getContext(), NumElts));
26799
26800 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
26801 St->getPointerInfo(), St->getOriginalAlign(),
26802 St->getMemOperand()->getFlags());
26803 }
26804
26805 if (St->isTruncatingStore())
26806 return SDValue();
26807
26808 // If this is a 256-bit store of concatenated ops, we are better off splitting
26809 // that store into two 128-bit stores. This avoids spurious use of 256-bit ops
26810 // and each half can execute independently. Some cores would split the op into
26811 // halves anyway, so the concat (vinsertf128) is purely an extra op.
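  // Editorial sketch of the transform: one 256-bit store of a concat becomes
  // two independent 128-bit stores at offsets 0 and 16 from the base pointer,
  // joined by a TokenFactor (see splitVectorStore above).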
26812 MVT StoreVT = StoredVal.getSimpleValueType();
26813 if (StoreVT.is256BitVector() ||
26814 ((StoreVT == MVT::v32i16 || StoreVT == MVT::v64i8) &&
26815 !Subtarget.hasBWI())) {
26816 SmallVector<SDValue, 4> CatOps;
26817 if (StoredVal.hasOneUse() &&
26818 collectConcatOps(StoredVal.getNode(), CatOps, DAG))
26819 return splitVectorStore(St, DAG);
26820 return SDValue();
26821 }
26822
26823 if (StoreVT.is32BitVector())
26824 return SDValue();
26825
26826 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26827 assert(StoreVT.is64BitVector() && "Unexpected VT");
26828 assert(TLI.getTypeAction(*DAG.getContext(), StoreVT) ==
26829            TargetLowering::TypeWidenVector &&
26830        "Unexpected type action!");
26831
26832 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StoreVT);
26833 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal,
26834 DAG.getUNDEF(StoreVT));
26835
26836 if (Subtarget.hasSSE2()) {
26837 // Widen the vector, cast to a v2x64 type, extract the single 64-bit element
26838 // and store it.
26839 MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
26840 MVT CastVT = MVT::getVectorVT(StVT, 2);
26841 StoredVal = DAG.getBitcast(CastVT, StoredVal);
26842 StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
26843 DAG.getIntPtrConstant(0, dl));
26844
26845 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
26846 St->getPointerInfo(), St->getOriginalAlign(),
26847 St->getMemOperand()->getFlags());
26848 }
26849 assert(Subtarget.hasSSE1() && "Expected SSE");
26850 SDVTList Tys = DAG.getVTList(MVT::Other);
26851 SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()};
26852 return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, MVT::i64,
26853 St->getMemOperand());
26854}
26855
26856// Lower vector extended loads using a shuffle. If SSSE3 is not available we
26857// may emit an illegal shuffle but the expansion is still better than scalar
26858// code. We generate sext/sext_invec for SEXTLOADs if it's available, otherwise
26859 // we'll emit a shuffle and an arithmetic shift.
26860// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
26861// TODO: It is possible to support ZExt by zeroing the undef values during
26862// the shuffle phase or after the shuffle.
26863static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
26864 SelectionDAG &DAG) {
26865 MVT RegVT = Op.getSimpleValueType();
26866 assert(RegVT.isVector() && "We only custom lower vector loads.");
26867 assert(RegVT.isInteger() &&
26868        "We only custom lower integer vector loads.");
26869
26870 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
26871 SDLoc dl(Ld);
26872
26873 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
26874 if (RegVT.getVectorElementType() == MVT::i1) {
26875 assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending load");
26876 assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
26877 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
26878 "Expected AVX512F without AVX512DQI");
26879
26880 SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
26881 Ld->getPointerInfo(), Ld->getOriginalAlign(),
26882 Ld->getMemOperand()->getFlags());
26883
26884 // Replace chain users with the new chain.
26885 assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!");
26886
26887 SDValue Val = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, NewLd);
26888 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT,
26889 DAG.getBitcast(MVT::v16i1, Val),
26890 DAG.getIntPtrConstant(0, dl));
26891 return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl);
26892 }
26893
26894 return SDValue();
26895}
26896
26897/// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
26898/// each of which has no other use apart from the AND / OR.
26899static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
26900 Opc = Op.getOpcode();
26901 if (Opc != ISD::OR && Opc != ISD::AND)
26902 return false;
26903 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
26904 Op.getOperand(0).hasOneUse() &&
26905 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
26906 Op.getOperand(1).hasOneUse());
26907}
26908
26909SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
26910 SDValue Chain = Op.getOperand(0);
26911 SDValue Cond = Op.getOperand(1);
26912 SDValue Dest = Op.getOperand(2);
26913 SDLoc dl(Op);
26914
26915 // Bail out when we don't have native compare instructions.
26916 if (Cond.getOpcode() == ISD::SETCC &&
26917 Cond.getOperand(0).getValueType() != MVT::f128 &&
26918 !isSoftFP16(Cond.getOperand(0).getValueType())) {
26919 SDValue LHS = Cond.getOperand(0);
26920 SDValue RHS = Cond.getOperand(1);
26921 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
26922
26923 // Special case for
26924 // setcc([su]{add,sub,mul}o == 0)
26925 // setcc([su]{add,sub,mul}o != 1)
26926 if (ISD::isOverflowIntrOpRes(LHS) &&
26927 (CC == ISD::SETEQ || CC == ISD::SETNE) &&
26928 (isNullConstant(RHS) || isOneConstant(RHS))) {
26929 SDValue Value, Overflow;
26930 X86::CondCode X86Cond;
26931 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, LHS.getValue(0), DAG);
26932
26933 if ((CC == ISD::SETEQ) == isNullConstant(RHS))
26934 X86Cond = X86::GetOppositeBranchCondition(X86Cond);
26935
26936 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
26937 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
26938 Overflow);
26939 }
26940
26941 if (LHS.getSimpleValueType().isInteger()) {
26942 SDValue CCVal;
26943 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, CC, SDLoc(Cond), DAG, CCVal);
26944 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
26945 EFLAGS);
26946 }
26947
26948 if (CC == ISD::SETOEQ) {
26949 // For FCMP_OEQ, we can emit
26950 // two branches instead of an explicit AND instruction with a
26951 // separate test. However, we only do this if this block doesn't
26952 // have a fall-through edge, because this requires an explicit
26953 // jmp when the condition is false.
26954 if (Op.getNode()->hasOneUse()) {
26955 SDNode *User = *Op.getNode()->use_begin();
26956 // Look for an unconditional branch following this conditional branch.
26957 // We need this because we need to reverse the successors in order
26958 // to implement FCMP_OEQ.
26959 if (User->getOpcode() == ISD::BR) {
26960 SDValue FalseBB = User->getOperand(1);
26961 SDNode *NewBR =
26962 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
26963 assert(NewBR == User);
26964 (void)NewBR;
26965 Dest = FalseBB;
26966
26967 SDValue Cmp =
26968 DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
26969 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
26970 Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest,
26971 CCVal, Cmp);
26972 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
26973 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
26974 Cmp);
26975 }
26976 }
26977 } else if (CC == ISD::SETUNE) {
26978 // For FCMP_UNE, we can emit
26979 // two branches instead of an explicit OR instruction with a
26980 // separate test.
26981 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
26982 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
26983 Chain =
26984 DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, Cmp);
26985 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
26986 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
26987 Cmp);
26988 } else {
26989 X86::CondCode X86Cond =
26990 TranslateX86CC(CC, dl, /*IsFP*/ true, LHS, RHS, DAG);
26991 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
26992 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
26993 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
26994 Cmp);
26995 }
26996 }
26997
26998 if (ISD::isOverflowIntrOpRes(Cond)) {
26999 SDValue Value, Overflow;
27000 X86::CondCode X86Cond;
27001 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
27002
27003 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
27004 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
27005 Overflow);
27006 }
27007
27008 // Look past the truncate if the high bits are known zero.
27009 if (isTruncWithZeroHighBitsInput(Cond, DAG))
27010 Cond = Cond.getOperand(0);
27011
27012 EVT CondVT = Cond.getValueType();
27013
27014 // Add an AND with 1 if we don't already have one.
27015 if (!(Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1))))
27016 Cond =
27017 DAG.getNode(ISD::AND, dl, CondVT, Cond, DAG.getConstant(1, dl, CondVT));
27018
27019 SDValue LHS = Cond;
27020 SDValue RHS = DAG.getConstant(0, dl, CondVT);
27021
27022 SDValue CCVal;
27023 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, ISD::SETNE, dl, DAG, CCVal);
27024 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
27025 EFLAGS);
27026}
27027
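// A minimal standalone model (not from this file) of why FCMP_OEQ above needs
// two branches: UCOMISS/UCOMISD report "unordered" as ZF=1,PF=1 and "equal" as
// ZF=1,PF=0, so branching to the false block on COND_NE and then on COND_P
// covers both ways an ordered-equal compare can fail. The helper is hypothetical.
#include <cmath>
static bool branchOnOEQ(double LHS, double RHS) {
  bool ZF, PF;
  if (std::isnan(LHS) || std::isnan(RHS)) {
    ZF = true; PF = true;              // unordered compare result
  } else {
    ZF = (LHS == RHS); PF = false;     // ordered compare result
  }
  if (!ZF) return false;               // jne FalseBB (X86::COND_NE)
  if (PF)  return false;               // jp  FalseBB (X86::COND_P)
  return true;                         // fall through: ordered and equal
}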
27028// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
27029// Calls to _alloca are needed to probe the stack when allocating more than 4k
27030// bytes in one go. Touching the stack at 4K increments is necessary to ensure
27031// that the guard pages used by the OS virtual memory manager are allocated in
27032// correct sequence.
27033SDValue
27034X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
27035 SelectionDAG &DAG) const {
27036 MachineFunction &MF = DAG.getMachineFunction();
27037 bool SplitStack = MF.shouldSplitStack();
27038 bool EmitStackProbeCall = hasStackProbeSymbol(MF);
27039 bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
27040 SplitStack || EmitStackProbeCall;
27041 SDLoc dl(Op);
27042
27043 // Get the inputs.
27044 SDNode *Node = Op.getNode();
27045 SDValue Chain = Op.getOperand(0);
27046 SDValue Size = Op.getOperand(1);
27047 MaybeAlign Alignment(Op.getConstantOperandVal(2));
27048 EVT VT = Node->getValueType(0);
27049
27050 // Chain the dynamic stack allocation so that it doesn't modify the stack
27051 // pointer when other instructions are using the stack.
27052 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
27053
27054 bool Is64Bit = Subtarget.is64Bit();
27055 MVT SPTy = getPointerTy(DAG.getDataLayout());
27056
27057 SDValue Result;
27058 if (!Lower) {
27059 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27060 Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
27061 assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
27062 " not tell us which reg is the stack pointer!");
27063
27064 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
27065 const Align StackAlign = TFI.getStackAlign();
27066 if (hasInlineStackProbe(MF)) {
27067 MachineRegisterInfo &MRI = MF.getRegInfo();
27068
27069 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
27070 Register Vreg = MRI.createVirtualRegister(AddrRegClass);
27071 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
27072 Result = DAG.getNode(X86ISD::PROBED_ALLOCA, dl, SPTy, Chain,
27073 DAG.getRegister(Vreg, SPTy));
27074 } else {
27075 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
27076 Chain = SP.getValue(1);
27077 Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
27078 }
27079 if (Alignment && *Alignment > StackAlign)
27080 Result =
27081 DAG.getNode(ISD::AND, dl, VT, Result,
27082 DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
27083 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
27084 } else if (SplitStack) {
27085 MachineRegisterInfo &MRI = MF.getRegInfo();
27086
27087 if (Is64Bit) {
27088 // The 64-bit implementation of segmented stacks needs to clobber both r10
27089 // and r11. This makes it impossible to use it along with nested parameters.
27090 const Function &F = MF.getFunction();
27091 for (const auto &A : F.args()) {
27092 if (A.hasNestAttr())
27093 report_fatal_error("Cannot use segmented stacks with functions that "
27094 "have nested arguments.");
27095 }
27096 }
27097
27098 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
27099 Register Vreg = MRI.createVirtualRegister(AddrRegClass);
27100 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
27101 Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
27102 DAG.getRegister(Vreg, SPTy));
27103 } else {
27104 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
27105 Chain = DAG.getNode(X86ISD::DYN_ALLOCA, dl, NodeTys, Chain, Size);
27106 MF.getInfo<X86MachineFunctionInfo>()->setHasDynAlloca(true);
27107
27108 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27109 Register SPReg = RegInfo->getStackRegister();
27110 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
27111 Chain = SP.getValue(1);
27112
27113 if (Alignment) {
27114 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
27115 DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
27116 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
27117 }
27118
27119 Result = SP;
27120 }
27121
27122 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
27123
27124 SDValue Ops[2] = {Result, Chain};
27125 return DAG.getMergeValues(Ops, dl);
27126}
27127
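// A minimal standalone sketch (not from this file) of the over-alignment step
// above: when the requested alignment exceeds the stack alignment, the new
// stack pointer is rounded down with an AND of ~(Alignment - 1), which clears
// the low log2(Alignment) bits. The helper is hypothetical and assumes a
// power-of-two alignment.
#include <cstdint>
static uint64_t alignStackPointerDown(uint64_t SP, uint64_t Alignment) {
  return SP & ~(Alignment - 1ULL); // mirrors the ISD::AND emitted above
}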
27128SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
27129 MachineFunction &MF = DAG.getMachineFunction();
27130 auto PtrVT = getPointerTy(MF.getDataLayout());
27131 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
27132
27133 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
27134 SDLoc DL(Op);
27135
27136 if (!Subtarget.is64Bit() ||
27137 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
27138 // vastart just stores the address of the VarArgsFrameIndex slot into the
27139 // memory location argument.
27140 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
27141 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
27142 MachinePointerInfo(SV));
27143 }
27144
27145 // __va_list_tag:
27146 // gp_offset (0 - 6 * 8)
27147 // fp_offset (48 - 48 + 8 * 16)
27148 // overflow_arg_area (point to parameters coming in memory).
27149 // reg_save_area
27150 SmallVector<SDValue, 8> MemOps;
27151 SDValue FIN = Op.getOperand(1);
27152 // Store gp_offset
27153 SDValue Store = DAG.getStore(
27154 Op.getOperand(0), DL,
27155 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
27156 MachinePointerInfo(SV));
27157 MemOps.push_back(Store);
27158
27159 // Store fp_offset
27160 FIN = DAG.getMemBasePlusOffset(FIN, TypeSize::Fixed(4), DL);
27161 Store = DAG.getStore(
27162 Op.getOperand(0), DL,
27163 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
27164 MachinePointerInfo(SV, 4));
27165 MemOps.push_back(Store);
27166
27167 // Store ptr to overflow_arg_area
27168 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
27169 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
27170 Store =
27171 DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
27172 MemOps.push_back(Store);
27173
27174 // Store ptr to reg_save_area.
27175 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
27176 Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
27177 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
27178 Store = DAG.getStore(
27179 Op.getOperand(0), DL, RSFIN, FIN,
27180 MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
27181 MemOps.push_back(Store);
27182 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
27183}
27184
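// A standalone sketch (not from this file) of the SysV x86-64 __va_list_tag
// element that the four stores above populate; on LP64 the field offsets are
// 0, 4, 8 and 16, matching the MachinePointerInfo offsets used above.
#include <cstdint>
struct VaListTagSketch {
  uint32_t gp_offset;         // offset 0:  0 .. 6 * 8
  uint32_t fp_offset;         // offset 4:  48 .. 48 + 8 * 16
  void    *overflow_arg_area; // offset 8:  parameters passed in memory
  void    *reg_save_area;     // offset 16: spilled register save area
};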
27185SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
27186 assert(Subtarget.is64Bit() &&
27187 "LowerVAARG only handles 64-bit va_arg!");
27188 assert(Op.getNumOperands() == 4);
27189
27190 MachineFunction &MF = DAG.getMachineFunction();
27191 if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
27192 // The Win64 ABI uses char* instead of a structure.
27193 return DAG.expandVAArg(Op.getNode());
27194
27195 SDValue Chain = Op.getOperand(0);
27196 SDValue SrcPtr = Op.getOperand(1);
27197 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
27198 unsigned Align = Op.getConstantOperandVal(3);
27199 SDLoc dl(Op);
27200
27201 EVT ArgVT = Op.getNode()->getValueType(0);
27202 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
27203 uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
27204 uint8_t ArgMode;
27205
27206 // Decide which area this value should be read from.
27207 // TODO: Implement the AMD64 ABI in its entirety. This simple
27208 // selection mechanism works only for the basic types.
27209 assert(ArgVT != MVT::f80 && "va_arg for f80 not yet implemented");
27210 if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
27211 ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
27212 } else {
27213 assert(ArgVT.isInteger() && ArgSize <= 32 /*bytes*/ &&
27214 "Unhandled argument type in LowerVAARG");
27215 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
27216 }
27217
27218 if (ArgMode == 2) {
27219 // Make sure using fp_offset makes sense.
27220 assert(!Subtarget.useSoftFloat() &&
27221 !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
27222 Subtarget.hasSSE1());
27223 }
27224
27225 // Insert VAARG node into the DAG
27226 // VAARG returns two values: Variable Argument Address, Chain
27227 SDValue InstOps[] = {Chain, SrcPtr,
27228 DAG.getTargetConstant(ArgSize, dl, MVT::i32),
27229 DAG.getTargetConstant(ArgMode, dl, MVT::i8),
27230 DAG.getTargetConstant(Align, dl, MVT::i32)};
27231 SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
27232 SDValue VAARG = DAG.getMemIntrinsicNode(
27233 Subtarget.isTarget64BitLP64() ? X86ISD::VAARG_64 : X86ISD::VAARG_X32, dl,
27234 VTs, InstOps, MVT::i64, MachinePointerInfo(SV),
27235 /*Alignment=*/std::nullopt,
27236 MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
27237 Chain = VAARG.getValue(1);
27238
27239 // Load the next argument and return it
27240 return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
27241}
27242
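// A minimal standalone sketch (not from this file) of the ArgMode selection
// above: SSE-class values up to 16 bytes are read via fp_offset (mode 2),
// while the remaining integer-class values up to 32 bytes are read via
// gp_offset (mode 1). The helper is hypothetical and ignores the unsupported
// cases the asserts above rule out (e.g. f80).
#include <cstdint>
static uint8_t pickVaArgMode(bool IsFloatingPoint, uint32_t ArgSizeBytes) {
  return (IsFloatingPoint && ArgSizeBytes <= 16) ? 2 : 1;
}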
27243static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
27244 SelectionDAG &DAG) {
27245 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
27246 // where a va_list is still an i8*.
27247 assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
27248 if (Subtarget.isCallingConvWin64(
27249 DAG.getMachineFunction().getFunction().getCallingConv()))
27250 // Probably a Win64 va_copy.
27251 return DAG.expandVACopy(Op.getNode());
27252
27253 SDValue Chain = Op.getOperand(0);
27254 SDValue DstPtr = Op.getOperand(1);
27255 SDValue SrcPtr = Op.getOperand(2);
27256 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
27257 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
27258 SDLoc DL(Op);
27259
27260 return DAG.getMemcpy(
27261 Chain, DL, DstPtr, SrcPtr,
27262 DAG.getIntPtrConstant(Subtarget.isTarget64BitLP64() ? 24 : 16, DL),
27263 Align(Subtarget.isTarget64BitLP64() ? 8 : 4), /*isVolatile*/ false, false,
27264 false, MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
27265}
27266
27267// Helper to get immediate/variable SSE shift opcode from other shift opcodes.
27268static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable) {
27269 switch (Opc) {
27270 case ISD::SHL:
27271 case X86ISD::VSHL:
27272 case X86ISD::VSHLI:
27273 return IsVariable ? X86ISD::VSHL : X86ISD::VSHLI;
27274 case ISD::SRL:
27275 case X86ISD::VSRL:
27276 case X86ISD::VSRLI:
27277 return IsVariable ? X86ISD::VSRL : X86ISD::VSRLI;
27278 case ISD::SRA:
27279 case X86ISD::VSRA:
27280 case X86ISD::VSRAI:
27281 return IsVariable ? X86ISD::VSRA : X86ISD::VSRAI;
27282 }
27283 llvm_unreachable("Unknown target vector shift node")::llvm::llvm_unreachable_internal("Unknown target vector shift node"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 27283)
;
27284}
27285
27286/// Handle vector element shifts where the shift amount is a constant.
27287/// Takes immediate version of shift as input.
27288static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
27289 SDValue SrcOp, uint64_t ShiftAmt,
27290 SelectionDAG &DAG) {
27291 MVT ElementType = VT.getVectorElementType();
27292
27293 // Bitcast the source vector to the output type, this is mainly necessary for
27294 // vXi8/vXi64 shifts.
27295 if (VT != SrcOp.getSimpleValueType())
27296 SrcOp = DAG.getBitcast(VT, SrcOp);
27297
27298 // Fold this packed shift into its first operand if ShiftAmt is 0.
27299 if (ShiftAmt == 0)
27300 return SrcOp;
27301
27302 // Check for ShiftAmt >= element width
27303 if (ShiftAmt >= ElementType.getSizeInBits()) {
27304 if (Opc == X86ISD::VSRAI)
27305 ShiftAmt = ElementType.getSizeInBits() - 1;
27306 else
27307 return DAG.getConstant(0, dl, VT);
27308 }
27309
27310 assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
27311 && "Unknown target vector shift-by-constant node");
27312
27313 // Fold this packed vector shift into a build vector if SrcOp is a
27314 // vector of Constants or UNDEFs.
27315 if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
27316 unsigned ShiftOpc;
27317 switch (Opc) {
27318 default: llvm_unreachable("Unknown opcode!");
27319 case X86ISD::VSHLI:
27320 ShiftOpc = ISD::SHL;
27321 break;
27322 case X86ISD::VSRLI:
27323 ShiftOpc = ISD::SRL;
27324 break;
27325 case X86ISD::VSRAI:
27326 ShiftOpc = ISD::SRA;
27327 break;
27328 }
27329
27330 SDValue Amt = DAG.getConstant(ShiftAmt, dl, VT);
27331 if (SDValue C = DAG.FoldConstantArithmetic(ShiftOpc, dl, VT, {SrcOp, Amt}))
27332 return C;
27333 }
27334
27335 return DAG.getNode(Opc, dl, VT, SrcOp,
27336 DAG.getTargetConstant(ShiftAmt, dl, MVT::i8));
27337}
27338
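// A standalone scalar model (not from this file) of the out-of-range handling
// above: x86 packed shifts with an amount >= the element width yield 0 for
// logical shifts and replicate the sign bit (amount clamped to width - 1) for
// arithmetic shifts, whereas a plain C shift by >= the bit width is undefined.
// The helpers model 32-bit lanes and are hypothetical.
#include <cstdint>
static int32_t psradModel(int32_t X, uint64_t Amt) {
  return X >> (Amt >= 32 ? 31 : Amt);  // VPSRAD: clamp to width - 1
}
static uint32_t psrldModel(uint32_t X, uint64_t Amt) {
  return Amt >= 32 ? 0u : X >> Amt;    // VPSRLD: out-of-range gives 0
}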
27339/// Handle vector element shifts by a splat shift amount
27340static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
27341 SDValue SrcOp, SDValue ShAmt, int ShAmtIdx,
27342 const X86Subtarget &Subtarget,
27343 SelectionDAG &DAG) {
27344 MVT AmtVT = ShAmt.getSimpleValueType();
27345 assert(AmtVT.isVector() && "Vector shift type mismatch");
27346 assert(0 <= ShAmtIdx && ShAmtIdx < (int)AmtVT.getVectorNumElements() &&
27347 "Illegal vector splat index");
27348
27349 // Move the splat element to the bottom element.
27350 if (ShAmtIdx != 0) {
27351 SmallVector<int> Mask(AmtVT.getVectorNumElements(), -1);
27352 Mask[0] = ShAmtIdx;
27353 ShAmt = DAG.getVectorShuffle(AmtVT, dl, ShAmt, DAG.getUNDEF(AmtVT), Mask);
27354 }
27355
27356 // Peek through any zext node if we can get back to a 128-bit source.
27357 if (AmtVT.getScalarSizeInBits() == 64 &&
27358 (ShAmt.getOpcode() == ISD::ZERO_EXTEND ||
27359 ShAmt.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG) &&
27360 ShAmt.getOperand(0).getValueType().isSimple() &&
27361 ShAmt.getOperand(0).getValueType().is128BitVector()) {
27362 ShAmt = ShAmt.getOperand(0);
27363 AmtVT = ShAmt.getSimpleValueType();
27364 }
27365
27366 // See if we can mask off the upper elements using the existing source node.
27367 // The shift uses the entire lower 64-bits of the amount vector, so no need to
27368 // do this for vXi64 types.
27369 bool IsMasked = false;
27370 if (AmtVT.getScalarSizeInBits() < 64) {
27371 if (ShAmt.getOpcode() == ISD::BUILD_VECTOR ||
27372 ShAmt.getOpcode() == ISD::SCALAR_TO_VECTOR) {
27373 // If the shift amount has come from a scalar, then zero-extend the scalar
27374 // before moving to the vector.
27375 ShAmt = DAG.getZExtOrTrunc(ShAmt.getOperand(0), dl, MVT::i32);
27376 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
27377 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, ShAmt);
27378 AmtVT = MVT::v4i32;
27379 IsMasked = true;
27380 } else if (ShAmt.getOpcode() == ISD::AND) {
27381 // See if the shift amount is already masked (e.g. for rotation modulo),
27382 // then we can zero-extend it by setting all the other mask elements to
27383 // zero.
27384 SmallVector<SDValue> MaskElts(
27385 AmtVT.getVectorNumElements(),
27386 DAG.getConstant(0, dl, AmtVT.getScalarType()));
27387 MaskElts[0] = DAG.getAllOnesConstant(dl, AmtVT.getScalarType());
27388 SDValue Mask = DAG.getBuildVector(AmtVT, dl, MaskElts);
27389 if ((Mask = DAG.FoldConstantArithmetic(ISD::AND, dl, AmtVT,
27390 {ShAmt.getOperand(1), Mask}))) {
27391 ShAmt = DAG.getNode(ISD::AND, dl, AmtVT, ShAmt.getOperand(0), Mask);
27392 IsMasked = true;
27393 }
27394 }
27395 }
27396
27397 // Extract if the shift amount vector is larger than 128-bits.
27398 if (AmtVT.getSizeInBits() > 128) {
27399 ShAmt = extract128BitVector(ShAmt, 0, DAG, dl);
27400 AmtVT = ShAmt.getSimpleValueType();
27401 }
27402
27403 // Zero-extend bottom element to v2i64 vector type, either by extension or
27404 // shuffle masking.
27405 if (!IsMasked && AmtVT.getScalarSizeInBits() < 64) {
27406 if (AmtVT == MVT::v4i32 && (ShAmt.getOpcode() == X86ISD::VBROADCAST ||
27407 ShAmt.getOpcode() == X86ISD::VBROADCAST_LOAD)) {
27408 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, SDLoc(ShAmt), MVT::v4i32, ShAmt);
27409 } else if (Subtarget.hasSSE41()) {
27410 ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
27411 MVT::v2i64, ShAmt);
27412 } else {
27413 SDValue ByteShift = DAG.getTargetConstant(
27414 (128 - AmtVT.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
27415 ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt);
27416 ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
27417 ByteShift);
27418 ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
27419 ByteShift);
27420 }
27421 }
27422
27423 // Change opcode to non-immediate version.
27424 Opc = getTargetVShiftUniformOpcode(Opc, true);
27425
27426 // The return type has to be a 128-bit type with the same element
27427 // type as the input type.
27428 MVT EltVT = VT.getVectorElementType();
27429 MVT ShVT = MVT::getVectorVT(EltVT, 128 / EltVT.getSizeInBits());
27430
27431 ShAmt = DAG.getBitcast(ShVT, ShAmt);
27432 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
27433}
27434
27435/// Return Mask with the necessary casting or extending
27436/// for \p Mask according to \p MaskVT when lowering masking intrinsics
27437static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
27438 const X86Subtarget &Subtarget, SelectionDAG &DAG,
27439 const SDLoc &dl) {
27440
27441 if (isAllOnesConstant(Mask))
27442 return DAG.getConstant(1, dl, MaskVT);
27443 if (X86::isZeroNode(Mask))
27444 return DAG.getConstant(0, dl, MaskVT);
27445
27446 assert(MaskVT.bitsLE(Mask.getSimpleValueType()) && "Unexpected mask size!");
27447
27448 if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
27449 assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!");
27450 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
27451 // In 32-bit mode, bitcasting an i64 is illegal; extend/split it instead.
27452 SDValue Lo, Hi;
27453 std::tie(Lo, Hi) = DAG.SplitScalar(Mask, dl, MVT::i32, MVT::i32);
27454 Lo = DAG.getBitcast(MVT::v32i1, Lo);
27455 Hi = DAG.getBitcast(MVT::v32i1, Hi);
27456 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
27457 } else {
27458 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
27459 Mask.getSimpleValueType().getSizeInBits());
27460 // When MaskVT equals v2i1 or v4i1, the low 2 or 4 elements
27461 // are extracted by EXTRACT_SUBVECTOR.
27462 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
27463 DAG.getBitcast(BitcastVT, Mask),
27464 DAG.getIntPtrConstant(0, dl));
27465 }
27466}
27467
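// A minimal standalone sketch (not from this file) of the 32-bit-mode path
// above: an i64 mask cannot be bitcast directly, so it is split into two i32
// halves that become the low and high 32 lanes of the v64i1 result. The
// helper is hypothetical.
#include <cstdint>
static void splitMask64(uint64_t Mask, uint32_t &Lo, uint32_t &Hi) {
  Lo = static_cast<uint32_t>(Mask);       // lanes 0..31
  Hi = static_cast<uint32_t>(Mask >> 32); // lanes 32..63
}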
27468/// Return (and \p Op, \p Mask) for compare instructions or
27469/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
27470/// necessary casting or extending for \p Mask when lowering masking intrinsics
27471static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
27472 SDValue PreservedSrc,
27473 const X86Subtarget &Subtarget,
27474 SelectionDAG &DAG) {
27475 MVT VT = Op.getSimpleValueType();
27476 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
27477 unsigned OpcodeSelect = ISD::VSELECT;
27478 SDLoc dl(Op);
27479
27480 if (isAllOnesConstant(Mask))
27481 return Op;
27482
27483 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27484
27485 if (PreservedSrc.isUndef())
27486 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
27487 return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
27488}
27489
27490/// Creates an SDNode for a predicated scalar operation.
27491/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
27492/// The mask comes in as MVT::i8 and should be transformed
27493/// to MVT::v1i1 while lowering masking intrinsics.
27494/// The main difference between ScalarMaskingNode and VectorMaskingNode is using
27495/// "X86select" instead of "vselect". We just can't create the "vselect" node
27496/// for a scalar instruction.
27497static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
27498 SDValue PreservedSrc,
27499 const X86Subtarget &Subtarget,
27500 SelectionDAG &DAG) {
27501
27502 if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
27503 if (MaskConst->getZExtValue() & 0x1)
27504 return Op;
27505
27506 MVT VT = Op.getSimpleValueType();
27507 SDLoc dl(Op);
27508
27509 assert(Mask.getValueType() == MVT::i8 && "Unexpect type");
27510 SDValue IMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i1,
27511 DAG.getBitcast(MVT::v8i1, Mask),
27512 DAG.getIntPtrConstant(0, dl));
27513 if (Op.getOpcode() == X86ISD::FSETCCM ||
27514 Op.getOpcode() == X86ISD::FSETCCM_SAE ||
27515 Op.getOpcode() == X86ISD::VFPCLASSS)
27516 return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
27517
27518 if (PreservedSrc.isUndef())
27519 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
27520 return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
27521}
27522
27523static int getSEHRegistrationNodeSize(const Function *Fn) {
27524 if (!Fn->hasPersonalityFn())
27525 report_fatal_error(
27526 "querying registration node size for function without personality");
27527 // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
27528 // WinEHStatePass for the full struct definition.
27529 switch (classifyEHPersonality(Fn->getPersonalityFn())) {
27530 case EHPersonality::MSVC_X86SEH: return 24;
27531 case EHPersonality::MSVC_CXX: return 16;
27532 default: break;
27533 }
27534 report_fatal_error(
27535 "can only recover FP for 32-bit MSVC EH personality functions");
27536}
27537
27538/// When the MSVC runtime transfers control to us, either to an outlined
27539/// function or when returning to a parent frame after catching an exception, we
27540/// recover the parent frame pointer by doing arithmetic on the incoming EBP.
27541/// Here's the math:
27542/// RegNodeBase = EntryEBP - RegNodeSize
27543/// ParentFP = RegNodeBase - ParentFrameOffset
27544/// Subtracting RegNodeSize takes us to the offset of the registration node, and
27545/// subtracting the offset (negative on x86) takes us back to the parent FP.
27546static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
27547 SDValue EntryEBP) {
27548 MachineFunction &MF = DAG.getMachineFunction();
27549 SDLoc dl;
27550
27551 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27552 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
27553
27554 // It's possible that the parent function no longer has a personality function
27555 // if the exceptional code was optimized away, in which case we just return
27556 // the incoming EBP.
27557 if (!Fn->hasPersonalityFn())
27558 return EntryEBP;
27559
27560 // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
27561 // registration, or the .set_setframe offset.
27562 MCSymbol *OffsetSym =
27563 MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
27564 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
27565 SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
27566 SDValue ParentFrameOffset =
27567 DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
27568
27569 // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
27570 // prologue to RBP in the parent function.
27571 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
27572 if (Subtarget.is64Bit())
27573 return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
27574
27575 int RegNodeSize = getSEHRegistrationNodeSize(Fn);
27576 // RegNodeBase = EntryEBP - RegNodeSize
27577 // ParentFP = RegNodeBase - ParentFrameOffset
27578 SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
27579 DAG.getConstant(RegNodeSize, dl, PtrVT));
27580 return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
27581}
27582
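// A standalone sketch (not from this file) of the 32-bit recovery arithmetic
// described in the comment above: ParentFP = (EntryEBP - RegNodeSize) -
// ParentFrameOffset, where ParentFrameOffset is negative on x86. The helper
// is hypothetical.
#include <cstdint>
static uintptr_t recoverParentFP(uintptr_t EntryEBP, int RegNodeSize,
                                 intptr_t ParentFrameOffset) {
  uintptr_t RegNodeBase = EntryEBP - RegNodeSize;
  return RegNodeBase - ParentFrameOffset;
}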
27583SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
27584 SelectionDAG &DAG) const {
27585 // Helper to detect if the operand is CUR_DIRECTION rounding mode.
27586 auto isRoundModeCurDirection = [](SDValue Rnd) {
27587 if (auto *C = dyn_cast<ConstantSDNode>(Rnd))
27588 return C->getAPIntValue() == X86::STATIC_ROUNDING::CUR_DIRECTION;
27589
27590 return false;
27591 };
27592 auto isRoundModeSAE = [](SDValue Rnd) {
27593 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
27594 unsigned RC = C->getZExtValue();
27595 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
27596 // Clear the NO_EXC bit and check remaining bits.
27597 RC ^= X86::STATIC_ROUNDING::NO_EXC;
27598 // As a convenience we allow no other bits or explicitly
27599 // current direction.
27600 return RC == 0 || RC == X86::STATIC_ROUNDING::CUR_DIRECTION;
27601 }
27602 }
27603
27604 return false;
27605 };
27606 auto isRoundModeSAEToX = [](SDValue Rnd, unsigned &RC) {
27607 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
27608 RC = C->getZExtValue();
27609 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
27610 // Clear the NO_EXC bit and check remaining bits.
27611 RC ^= X86::STATIC_ROUNDING::NO_EXC;
27612 return RC == X86::STATIC_ROUNDING::TO_NEAREST_INT ||
27613 RC == X86::STATIC_ROUNDING::TO_NEG_INF ||
27614 RC == X86::STATIC_ROUNDING::TO_POS_INF ||
27615 RC == X86::STATIC_ROUNDING::TO_ZERO;
27616 }
27617 }
27618
27619 return false;
27620 };
27621
27622 SDLoc dl(Op);
27623 unsigned IntNo = Op.getConstantOperandVal(0);
27624 MVT VT = Op.getSimpleValueType();
27625 const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
27626
27627 // Propagate flags from original node to transformed node(s).
27628 SelectionDAG::FlagInserter FlagsInserter(DAG, Op->getFlags());
27629
27630 if (IntrData) {
27631 switch(IntrData->Type) {
27632 case INTR_TYPE_1OP: {
27633 // We specify 2 possible opcodes for intrinsics with rounding modes.
27634 // First, we check if the intrinsic may have non-default rounding mode,
27635 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
27636 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
27637 if (IntrWithRoundingModeOpcode != 0) {
27638 SDValue Rnd = Op.getOperand(2);
27639 unsigned RC = 0;
27640 if (isRoundModeSAEToX(Rnd, RC))
27641 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
27642 Op.getOperand(1),
27643 DAG.getTargetConstant(RC, dl, MVT::i32));
27644 if (!isRoundModeCurDirection(Rnd))
27645 return SDValue();
27646 }
27647 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
27648 Op.getOperand(1));
27649 }
27650 case INTR_TYPE_1OP_SAE: {
27651 SDValue Sae = Op.getOperand(2);
27652
27653 unsigned Opc;
27654 if (isRoundModeCurDirection(Sae))
27655 Opc = IntrData->Opc0;
27656 else if (isRoundModeSAE(Sae))
27657 Opc = IntrData->Opc1;
27658 else
27659 return SDValue();
27660
27661 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1));
27662 }
27663 case INTR_TYPE_2OP: {
27664 SDValue Src2 = Op.getOperand(2);
27665
27666 // We specify 2 possible opcodes for intrinsics with rounding modes.
27667 // First, we check if the intrinsic may have non-default rounding mode,
27668 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
27669 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
27670 if (IntrWithRoundingModeOpcode != 0) {
27671 SDValue Rnd = Op.getOperand(3);
27672 unsigned RC = 0;
27673 if (isRoundModeSAEToX(Rnd, RC))
27674 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
27675 Op.getOperand(1), Src2,
27676 DAG.getTargetConstant(RC, dl, MVT::i32));
27677 if (!isRoundModeCurDirection(Rnd))
27678 return SDValue();
27679 }
27680
27681 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
27682 Op.getOperand(1), Src2);
27683 }
27684 case INTR_TYPE_2OP_SAE: {
27685 SDValue Sae = Op.getOperand(3);
27686
27687 unsigned Opc;
27688 if (isRoundModeCurDirection(Sae))
27689 Opc = IntrData->Opc0;
27690 else if (isRoundModeSAE(Sae))
27691 Opc = IntrData->Opc1;
27692 else
27693 return SDValue();
27694
27695 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
27696 Op.getOperand(2));
27697 }
27698 case INTR_TYPE_3OP:
27699 case INTR_TYPE_3OP_IMM8: {
27700 SDValue Src1 = Op.getOperand(1);
27701 SDValue Src2 = Op.getOperand(2);
27702 SDValue Src3 = Op.getOperand(3);
27703
27704 if (IntrData->Type == INTR_TYPE_3OP_IMM8 &&
27705 Src3.getValueType() != MVT::i8) {
27706 Src3 = DAG.getTargetConstant(
27707 cast<ConstantSDNode>(Src3)->getZExtValue() & 0xff, dl, MVT::i8);
27708 }
27709
27710 // We specify 2 possible opcodes for intrinsics with rounding modes.
27711 // First, we check if the intrinsic may have non-default rounding mode,
27712 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
27713 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
27714 if (IntrWithRoundingModeOpcode != 0) {
27715 SDValue Rnd = Op.getOperand(4);
27716 unsigned RC = 0;
27717 if (isRoundModeSAEToX(Rnd, RC))
27718 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
27719 Src1, Src2, Src3,
27720 DAG.getTargetConstant(RC, dl, MVT::i32));
27721 if (!isRoundModeCurDirection(Rnd))
27722 return SDValue();
27723 }
27724
27725 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
27726 {Src1, Src2, Src3});
27727 }
27728 case INTR_TYPE_4OP_IMM8: {
27729 assert(Op.getOperand(4)->getOpcode() == ISD::TargetConstant);
27730 SDValue Src4 = Op.getOperand(4);
27731 if (Src4.getValueType() != MVT::i8) {
27732 Src4 = DAG.getTargetConstant(
27733 cast<ConstantSDNode>(Src4)->getZExtValue() & 0xff, dl, MVT::i8);
27734 }
27735
27736 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
27737 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
27738 Src4);
27739 }
27740 case INTR_TYPE_1OP_MASK: {
27741 SDValue Src = Op.getOperand(1);
27742 SDValue PassThru = Op.getOperand(2);
27743 SDValue Mask = Op.getOperand(3);
27744 // We add rounding mode to the Node when
27745 // - RC Opcode is specified and
27746 // - RC is not "current direction".
27747 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
27748 if (IntrWithRoundingModeOpcode != 0) {
27749 SDValue Rnd = Op.getOperand(4);
27750 unsigned RC = 0;
27751 if (isRoundModeSAEToX(Rnd, RC))
27752 return getVectorMaskingNode(
27753 DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
27754 Src, DAG.getTargetConstant(RC, dl, MVT::i32)),
27755 Mask, PassThru, Subtarget, DAG);
27756 if (!isRoundModeCurDirection(Rnd))
27757 return SDValue();
27758 }
27759 return getVectorMaskingNode(
27760 DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru,
27761 Subtarget, DAG);
27762 }
27763 case INTR_TYPE_1OP_MASK_SAE: {
27764 SDValue Src = Op.getOperand(1);
27765 SDValue PassThru = Op.getOperand(2);
27766 SDValue Mask = Op.getOperand(3);
27767 SDValue Rnd = Op.getOperand(4);
27768
27769 unsigned Opc;
27770 if (isRoundModeCurDirection(Rnd))
27771 Opc = IntrData->Opc0;
27772 else if (isRoundModeSAE(Rnd))
27773 Opc = IntrData->Opc1;
27774 else
27775 return SDValue();
27776
27777 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src), Mask, PassThru,
27778 Subtarget, DAG);
27779 }
27780 case INTR_TYPE_SCALAR_MASK: {
27781 SDValue Src1 = Op.getOperand(1);
27782 SDValue Src2 = Op.getOperand(2);
27783 SDValue passThru = Op.getOperand(3);
27784 SDValue Mask = Op.getOperand(4);
27785 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
27786 // There are 2 kinds of intrinsics in this group:
27787 // (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands
27788 // (2) With rounding mode and sae - 7 operands.
27789 bool HasRounding = IntrWithRoundingModeOpcode != 0;
27790 if (Op.getNumOperands() == (5U + HasRounding)) {
27791 if (HasRounding) {
27792 SDValue Rnd = Op.getOperand(5);
27793 unsigned RC = 0;
27794 if (isRoundModeSAEToX(Rnd, RC))
27795 return getScalarMaskingNode(
27796 DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2,
27797 DAG.getTargetConstant(RC, dl, MVT::i32)),
27798 Mask, passThru, Subtarget, DAG);
27799 if (!isRoundModeCurDirection(Rnd))
27800 return SDValue();
27801 }
27802 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
27803 Src2),
27804 Mask, passThru, Subtarget, DAG);
27805 }
27806
27807 assert(Op.getNumOperands() == (6U + HasRounding) &&
27808 "Unexpected intrinsic form");
27809 SDValue RoundingMode = Op.getOperand(5);
27810 unsigned Opc = IntrData->Opc0;
27811 if (HasRounding) {
27812 SDValue Sae = Op.getOperand(6);
27813 if (isRoundModeSAE(Sae))
27814 Opc = IntrWithRoundingModeOpcode;
27815 else if (!isRoundModeCurDirection(Sae))
27816 return SDValue();
27817 }
27818 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1,
27819 Src2, RoundingMode),
27820 Mask, passThru, Subtarget, DAG);
27821 }
27822 case INTR_TYPE_SCALAR_MASK_RND: {
27823 SDValue Src1 = Op.getOperand(1);
27824 SDValue Src2 = Op.getOperand(2);
27825 SDValue passThru = Op.getOperand(3);
27826 SDValue Mask = Op.getOperand(4);
27827 SDValue Rnd = Op.getOperand(5);
27828
27829 SDValue NewOp;
27830 unsigned RC = 0;
27831 if (isRoundModeCurDirection(Rnd))
27832 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
27833 else if (isRoundModeSAEToX(Rnd, RC))
27834 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
27835 DAG.getTargetConstant(RC, dl, MVT::i32));
27836 else
27837 return SDValue();
27838
27839 return getScalarMaskingNode(NewOp, Mask, passThru, Subtarget, DAG);
27840 }
27841 case INTR_TYPE_SCALAR_MASK_SAE: {
27842 SDValue Src1 = Op.getOperand(1);
27843 SDValue Src2 = Op.getOperand(2);
27844 SDValue passThru = Op.getOperand(3);
27845 SDValue Mask = Op.getOperand(4);
27846 SDValue Sae = Op.getOperand(5);
27847 unsigned Opc;
27848 if (isRoundModeCurDirection(Sae))
27849 Opc = IntrData->Opc0;
27850 else if (isRoundModeSAE(Sae))
27851 Opc = IntrData->Opc1;
27852 else
27853 return SDValue();
27854
27855 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
27856 Mask, passThru, Subtarget, DAG);
27857 }
27858 case INTR_TYPE_2OP_MASK: {
27859 SDValue Src1 = Op.getOperand(1);
27860 SDValue Src2 = Op.getOperand(2);
27861 SDValue PassThru = Op.getOperand(3);
27862 SDValue Mask = Op.getOperand(4);
27863 SDValue NewOp;
27864 if (IntrData->Opc1 != 0) {
27865 SDValue Rnd = Op.getOperand(5);
27866 unsigned RC = 0;
27867 if (isRoundModeSAEToX(Rnd, RC))
27868 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
27869 DAG.getTargetConstant(RC, dl, MVT::i32));
27870 else if (!isRoundModeCurDirection(Rnd))
27871 return SDValue();
27872 }
27873 if (!NewOp)
27874 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
27875 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
27876 }
27877 case INTR_TYPE_2OP_MASK_SAE: {
27878 SDValue Src1 = Op.getOperand(1);
27879 SDValue Src2 = Op.getOperand(2);
27880 SDValue PassThru = Op.getOperand(3);
27881 SDValue Mask = Op.getOperand(4);
27882
27883 unsigned Opc = IntrData->Opc0;
27884 if (IntrData->Opc1 != 0) {
27885 SDValue Sae = Op.getOperand(5);
27886 if (isRoundModeSAE(Sae))
27887 Opc = IntrData->Opc1;
27888 else if (!isRoundModeCurDirection(Sae))
27889 return SDValue();
27890 }
27891
27892 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
27893 Mask, PassThru, Subtarget, DAG);
27894 }
27895 case INTR_TYPE_3OP_SCALAR_MASK_SAE: {
27896 SDValue Src1 = Op.getOperand(1);
27897 SDValue Src2 = Op.getOperand(2);
27898 SDValue Src3 = Op.getOperand(3);
27899 SDValue PassThru = Op.getOperand(4);
27900 SDValue Mask = Op.getOperand(5);
27901 SDValue Sae = Op.getOperand(6);
27902 unsigned Opc;
27903 if (isRoundModeCurDirection(Sae))
27904 Opc = IntrData->Opc0;
27905 else if (isRoundModeSAE(Sae))
27906 Opc = IntrData->Opc1;
27907 else
27908 return SDValue();
27909
27910 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
27911 Mask, PassThru, Subtarget, DAG);
27912 }
27913 case INTR_TYPE_3OP_MASK_SAE: {
27914 SDValue Src1 = Op.getOperand(1);
27915 SDValue Src2 = Op.getOperand(2);
27916 SDValue Src3 = Op.getOperand(3);
27917 SDValue PassThru = Op.getOperand(4);
27918 SDValue Mask = Op.getOperand(5);
27919
27920 unsigned Opc = IntrData->Opc0;
27921 if (IntrData->Opc1 != 0) {
27922 SDValue Sae = Op.getOperand(6);
27923 if (isRoundModeSAE(Sae))
27924 Opc = IntrData->Opc1;
27925 else if (!isRoundModeCurDirection(Sae))
27926 return SDValue();
27927 }
27928 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
27929 Mask, PassThru, Subtarget, DAG);
27930 }
27931 case BLENDV: {
27932 SDValue Src1 = Op.getOperand(1);
27933 SDValue Src2 = Op.getOperand(2);
27934 SDValue Src3 = Op.getOperand(3);
27935
27936 EVT MaskVT = Src3.getValueType().changeVectorElementTypeToInteger();
27937 Src3 = DAG.getBitcast(MaskVT, Src3);
27938
27939 // Reverse the operands to match VSELECT order.
27940 return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1);
27941 }
27942 case VPERM_2OP : {
27943 SDValue Src1 = Op.getOperand(1);
27944 SDValue Src2 = Op.getOperand(2);
27945
27946 // Swap Src1 and Src2 in the node creation
27947 return DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1);
27948 }
27949 case CFMA_OP_MASKZ:
27950 case CFMA_OP_MASK: {
27951 SDValue Src1 = Op.getOperand(1);
27952 SDValue Src2 = Op.getOperand(2);
27953 SDValue Src3 = Op.getOperand(3);
27954 SDValue Mask = Op.getOperand(4);
27955 MVT VT = Op.getSimpleValueType();
27956
27957 SDValue PassThru = Src3;
27958 if (IntrData->Type == CFMA_OP_MASKZ)
27959 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
27960
27961 // We add rounding mode to the Node when
27962 // - RC Opcode is specified and
27963 // - RC is not "current direction".
27964 SDValue NewOp;
27965 if (IntrData->Opc1 != 0) {
27966 SDValue Rnd = Op.getOperand(5);
27967 unsigned RC = 0;
27968 if (isRoundModeSAEToX(Rnd, RC))
27969 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2, Src3,
27970 DAG.getTargetConstant(RC, dl, MVT::i32));
27971 else if (!isRoundModeCurDirection(Rnd))
27972 return SDValue();
27973 }
27974 if (!NewOp)
27975 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Src3);
27976 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
27977 }
27978 case IFMA_OP:
27979 // NOTE: We need to swizzle the operands to pass the multiply operands
27980 // first.
27981 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
27982 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
27983 case FPCLASSS: {
27984 SDValue Src1 = Op.getOperand(1);
27985 SDValue Imm = Op.getOperand(2);
27986 SDValue Mask = Op.getOperand(3);
27987 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
27988 SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
27989 Subtarget, DAG);
27990 // Need to fill with zeros to ensure the bitcast will produce zeroes
27991 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
27992 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
27993 DAG.getConstant(0, dl, MVT::v8i1),
27994 FPclassMask, DAG.getIntPtrConstant(0, dl));
27995 return DAG.getBitcast(MVT::i8, Ins);
27996 }
27997
27998 case CMP_MASK_CC: {
27999 MVT MaskVT = Op.getSimpleValueType();
28000 SDValue CC = Op.getOperand(3);
28001 SDValue Mask = Op.getOperand(4);
28002 // We specify 2 possible opcodes for intrinsics with rounding modes.
28003 // First, we check if the intrinsic may have non-default rounding mode,
28004 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
28005 if (IntrData->Opc1 != 0) {
28006 SDValue Sae = Op.getOperand(5);
28007 if (isRoundModeSAE(Sae))
28008 return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
28009 Op.getOperand(2), CC, Mask, Sae);
28010 if (!isRoundModeCurDirection(Sae))
28011 return SDValue();
28012 }
28013 //default rounding mode
28014 return DAG.getNode(IntrData->Opc0, dl, MaskVT,
28015 {Op.getOperand(1), Op.getOperand(2), CC, Mask});
28016 }
28017 case CMP_MASK_SCALAR_CC: {
28018 SDValue Src1 = Op.getOperand(1);
28019 SDValue Src2 = Op.getOperand(2);
28020 SDValue CC = Op.getOperand(3);
28021 SDValue Mask = Op.getOperand(4);
28022
28023 SDValue Cmp;
28024 if (IntrData->Opc1 != 0) {
28025 SDValue Sae = Op.getOperand(5);
28026 if (isRoundModeSAE(Sae))
28027 Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Sae);
28028 else if (!isRoundModeCurDirection(Sae))
28029 return SDValue();
28030 }
28031 //default rounding mode
28032 if (!Cmp.getNode())
28033 Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
28034
28035 SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
28036 Subtarget, DAG);
28037 // Need to fill with zeros to ensure the bitcast will produce zeroes
28038 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
28039 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
28040 DAG.getConstant(0, dl, MVT::v8i1),
28041 CmpMask, DAG.getIntPtrConstant(0, dl));
28042 return DAG.getBitcast(MVT::i8, Ins);
28043 }
28044 case COMI: { // Comparison intrinsics
28045 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
28046 SDValue LHS = Op.getOperand(1);
28047 SDValue RHS = Op.getOperand(2);
28048 // Some conditions require the operands to be swapped.
28049 if (CC == ISD::SETLT || CC == ISD::SETLE)
28050 std::swap(LHS, RHS);
28051
28052 SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
28053 SDValue SetCC;
28054 switch (CC) {
28055 case ISD::SETEQ: { // (ZF = 0 and PF = 0)
28056 SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
28057 SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
28058 SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
28059 break;
28060 }
28061 case ISD::SETNE: { // (ZF = 1 or PF = 1)
28062 SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
28063 SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
28064 SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
28065 break;
28066 }
28067 case ISD::SETGT: // (CF = 0 and ZF = 0)
28068 case ISD::SETLT: { // Condition opposite to GT. Operands swapped above.
28069 SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
28070 break;
28071 }
28072 case ISD::SETGE: // CF = 0
28073 case ISD::SETLE: // Condition opposite to GE. Operands swapped above.
28074 SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
28075 break;
28076 default:
28077 llvm_unreachable("Unexpected illegal condition!")::llvm::llvm_unreachable_internal("Unexpected illegal condition!"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 28077)
;
28078 }
28079 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
28080 }
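// A rough user-level sketch of the EQ/NE handling above (the intrinsic name is
// an assumed builtin mapping, not taken from this file):
//   int r = _mm_comieq_sd(a, b);
// COMISD reports "equal" as ZF=1 with PF=0 and "unordered" as PF=1, so the
// lowering ANDs setcc(COND_E) with setcc(COND_NP) before zero-extending to i32.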
28081 case COMI_RM: { // Comparison intrinsics with Sae
28082 SDValue LHS = Op.getOperand(1);
28083 SDValue RHS = Op.getOperand(2);
28084 unsigned CondVal = Op.getConstantOperandVal(3);
28085 SDValue Sae = Op.getOperand(4);
28086
28087 SDValue FCmp;
28088 if (isRoundModeCurDirection(Sae))
28089 FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
28090 DAG.getTargetConstant(CondVal, dl, MVT::i8));
28091 else if (isRoundModeSAE(Sae))
28092 FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS,
28093 DAG.getTargetConstant(CondVal, dl, MVT::i8), Sae);
28094 else
28095 return SDValue();
28096 // Need to fill with zeros to ensure the bitcast will produce zeroes
28097 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
28098 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
28099 DAG.getConstant(0, dl, MVT::v16i1),
28100 FCmp, DAG.getIntPtrConstant(0, dl));
28101 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32,
28102 DAG.getBitcast(MVT::i16, Ins));
28103 }
28104 case VSHIFT: {
28105 SDValue SrcOp = Op.getOperand(1);
28106 SDValue ShAmt = Op.getOperand(2);
28107 assert(ShAmt.getValueType() == MVT::i32 &&
28108 "Unexpected VSHIFT amount type");
28109
28110 // Catch shift-by-constant.
28111 if (auto *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
28112 return getTargetVShiftByConstNode(IntrData->Opc0, dl,
28113 Op.getSimpleValueType(), SrcOp,
28114 CShAmt->getZExtValue(), DAG);
28115
28116 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
28117 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
28118 SrcOp, ShAmt, 0, Subtarget, DAG);
28119 }
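// Sketch of the two VSHIFT paths above, assuming the usual mapping of an
// immediate-shift builtin such as _mm_slli_epi32 onto this entry:
//   __m128i a = _mm_slli_epi32(v, 5);  // constant 5 -> getTargetVShiftByConstNode
//   __m128i b = _mm_slli_epi32(v, n);  // variable n -> SCALAR_TO_VECTOR to v4i32,
//                                      // then getTargetVShiftNode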
28120 case COMPRESS_EXPAND_IN_REG: {
28121 SDValue Mask = Op.getOperand(3);
28122 SDValue DataToCompress = Op.getOperand(1);
28123 SDValue PassThru = Op.getOperand(2);
28124 if (ISD::isBuildVectorAllOnes(Mask.getNode())) // return data as is
28125 return Op.getOperand(1);
28126
28127 // Avoid false dependency.
28128 if (PassThru.isUndef())
28129 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
28130
28131 return DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress, PassThru,
28132 Mask);
28133 }
28134 case FIXUPIMM:
28135 case FIXUPIMM_MASKZ: {
28136 SDValue Src1 = Op.getOperand(1);
28137 SDValue Src2 = Op.getOperand(2);
28138 SDValue Src3 = Op.getOperand(3);
28139 SDValue Imm = Op.getOperand(4);
28140 SDValue Mask = Op.getOperand(5);
28141 SDValue Passthru = (IntrData->Type == FIXUPIMM)
28142 ? Src1
28143 : getZeroVector(VT, Subtarget, DAG, dl);
28144
28145 unsigned Opc = IntrData->Opc0;
28146 if (IntrData->Opc1 != 0) {
28147 SDValue Sae = Op.getOperand(6);
28148 if (isRoundModeSAE(Sae))
28149 Opc = IntrData->Opc1;
28150 else if (!isRoundModeCurDirection(Sae))
28151 return SDValue();
28152 }
28153
28154 SDValue FixupImm = DAG.getNode(Opc, dl, VT, Src1, Src2, Src3, Imm);
28155
28156 if (Opc == X86ISD::VFIXUPIMM || Opc == X86ISD::VFIXUPIMM_SAE)
28157 return getVectorMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
28158
28159 return getScalarMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
28160 }
28161 case ROUNDP: {
28162 assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
28163 // Clear the upper bits of the rounding immediate so that the legacy
28164 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
28165 auto Round = cast<ConstantSDNode>(Op.getOperand(2));
28166 SDValue RoundingMode =
28167 DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);
28168 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
28169 Op.getOperand(1), RoundingMode);
28170 }
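// Example of the masking above (hypothetical immediate): a legacy rounding
// immediate of 0x4A becomes 0x4A & 0xf == 0xA, so only the low 4 rounding-mode
// bits reach VRNDSCALE and the upper bits that would trigger its scaling
// behavior stay zero.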
28171 case ROUNDS: {
28172 assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
28173 // Clear the upper bits of the rounding immediate so that the legacy
28174 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
28175 auto Round = cast<ConstantSDNode>(Op.getOperand(3));
28176 SDValue RoundingMode =
28177 DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);
28178 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
28179 Op.getOperand(1), Op.getOperand(2), RoundingMode);
28180 }
28181 case BEXTRI: {
28182 assert(IntrData->Opc0 == X86ISD::BEXTRI && "Unexpected opcode");
28183
28184 uint64_t Imm = Op.getConstantOperandVal(2);
28185 SDValue Control = DAG.getTargetConstant(Imm & 0xffff, dl,
28186 Op.getValueType());
28187 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
28188 Op.getOperand(1), Control);
28189 }
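// Sketch of the control masking above, assuming the usual __bextri_u32 builtin
// mapping (not taken from this file):
//   unsigned r = __bextri_u32(x, 0x0408); // start = 8 (bits 7:0), length = 4 (bits 15:8)
// Only the low 16 bits of the immediate are meaningful, so the lowering masks
// the control with 0xffff before building the BEXTRI node.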
28190 // ADC/ADCX/SBB
28191 case ADX: {
28192 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
28193 SDVTList VTs = DAG.getVTList(Op.getOperand(2).getValueType(), MVT::i32);
28194
28195 SDValue Res;
28196 // If the carry in is zero, then we should just use ADD/SUB instead of
28197 // ADC/SBB.
28198 if (isNullConstant(Op.getOperand(1))) {
28199 Res = DAG.getNode(IntrData->Opc1, dl, VTs, Op.getOperand(2),
28200 Op.getOperand(3));
28201 } else {
28202 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(1),
28203 DAG.getConstant(-1, dl, MVT::i8));
28204 Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(2),
28205 Op.getOperand(3), GenCF.getValue(1));
28206 }
28207 SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
28208 SDValue Results[] = { SetCC, Res };
28209 return DAG.getMergeValues(Results, dl);
28210 }
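// User-level sketch of the carry-in special case above (intrinsic name is an
// assumed mapping, not taken from this file):
//   unsigned out;
//   _addcarry_u32(0, x, y, &out);  // constant 0 carry-in -> plain ADD
//   _addcarry_u32(c, x, y, &out);  // otherwise ADD(c, -1) recreates CF, then ADC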
28211 case CVTPD2PS_MASK:
28212 case CVTPD2DQ_MASK:
28213 case CVTQQ2PS_MASK:
28214 case TRUNCATE_TO_REG: {
28215 SDValue Src = Op.getOperand(1);
28216 SDValue PassThru = Op.getOperand(2);
28217 SDValue Mask = Op.getOperand(3);
28218
28219 if (isAllOnesConstant(Mask))
28220 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
28221
28222 MVT SrcVT = Src.getSimpleValueType();
28223 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
28224 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28225 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
28226 {Src, PassThru, Mask});
28227 }
28228 case CVTPS2PH_MASK: {
28229 SDValue Src = Op.getOperand(1);
28230 SDValue Rnd = Op.getOperand(2);
28231 SDValue PassThru = Op.getOperand(3);
28232 SDValue Mask = Op.getOperand(4);
28233
28234 unsigned RC = 0;
28235 unsigned Opc = IntrData->Opc0;
28236 bool SAE = Src.getValueType().is512BitVector() &&
28237 (isRoundModeSAEToX(Rnd, RC) || isRoundModeSAE(Rnd));
28238 if (SAE) {
28239 Opc = X86ISD::CVTPS2PH_SAE;
28240 Rnd = DAG.getTargetConstant(RC, dl, MVT::i32);
28241 }
28242
28243 if (isAllOnesConstant(Mask))
28244 return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd);
28245
28246 if (SAE)
28247 Opc = X86ISD::MCVTPS2PH_SAE;
28248 else
28249 Opc = IntrData->Opc1;
28250 MVT SrcVT = Src.getSimpleValueType();
28251 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
28252 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28253 return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd, PassThru, Mask);
28254 }
28255 case CVTNEPS2BF16_MASK: {
28256 SDValue Src = Op.getOperand(1);
28257 SDValue PassThru = Op.getOperand(2);
28258 SDValue Mask = Op.getOperand(3);
28259
28260 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
28261 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
28262
28263 // Break false dependency.
28264 if (PassThru.isUndef())
28265 PassThru = DAG.getConstant(0, dl, PassThru.getValueType());
28266
28267 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
28268 Mask);
28269 }
28270 default:
28271 break;
28272 }
28273 }
28274
28275 switch (IntNo) {
28276 default: return SDValue(); // Don't custom lower most intrinsics.
28277
28278 // ptest and testp intrinsics. The intrinsics these come from are designed to
28279 // return an integer value rather than just an instruction, so lower them to
28280 // the ptest or testp pattern and a setcc for the result.
28281 case Intrinsic::x86_avx512_ktestc_b:
28282 case Intrinsic::x86_avx512_ktestc_w:
28283 case Intrinsic::x86_avx512_ktestc_d:
28284 case Intrinsic::x86_avx512_ktestc_q:
28285 case Intrinsic::x86_avx512_ktestz_b:
28286 case Intrinsic::x86_avx512_ktestz_w:
28287 case Intrinsic::x86_avx512_ktestz_d:
28288 case Intrinsic::x86_avx512_ktestz_q:
28289 case Intrinsic::x86_sse41_ptestz:
28290 case Intrinsic::x86_sse41_ptestc:
28291 case Intrinsic::x86_sse41_ptestnzc:
28292 case Intrinsic::x86_avx_ptestz_256:
28293 case Intrinsic::x86_avx_ptestc_256:
28294 case Intrinsic::x86_avx_ptestnzc_256:
28295 case Intrinsic::x86_avx_vtestz_ps:
28296 case Intrinsic::x86_avx_vtestc_ps:
28297 case Intrinsic::x86_avx_vtestnzc_ps:
28298 case Intrinsic::x86_avx_vtestz_pd:
28299 case Intrinsic::x86_avx_vtestc_pd:
28300 case Intrinsic::x86_avx_vtestnzc_pd:
28301 case Intrinsic::x86_avx_vtestz_ps_256:
28302 case Intrinsic::x86_avx_vtestc_ps_256:
28303 case Intrinsic::x86_avx_vtestnzc_ps_256:
28304 case Intrinsic::x86_avx_vtestz_pd_256:
28305 case Intrinsic::x86_avx_vtestc_pd_256:
28306 case Intrinsic::x86_avx_vtestnzc_pd_256: {
28307 unsigned TestOpc = X86ISD::PTEST;
28308 X86::CondCode X86CC;
28309 switch (IntNo) {
28310 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.")::llvm::llvm_unreachable_internal("Bad fallthrough in Intrinsic lowering."
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 28310)
;
28311 case Intrinsic::x86_avx512_ktestc_b:
28312 case Intrinsic::x86_avx512_ktestc_w:
28313 case Intrinsic::x86_avx512_ktestc_d:
28314 case Intrinsic::x86_avx512_ktestc_q:
28315 // CF = 1
28316 TestOpc = X86ISD::KTEST;
28317 X86CC = X86::COND_B;
28318 break;
28319 case Intrinsic::x86_avx512_ktestz_b:
28320 case Intrinsic::x86_avx512_ktestz_w:
28321 case Intrinsic::x86_avx512_ktestz_d:
28322 case Intrinsic::x86_avx512_ktestz_q:
28323 TestOpc = X86ISD::KTEST;
28324 X86CC = X86::COND_E;
28325 break;
28326 case Intrinsic::x86_avx_vtestz_ps:
28327 case Intrinsic::x86_avx_vtestz_pd:
28328 case Intrinsic::x86_avx_vtestz_ps_256:
28329 case Intrinsic::x86_avx_vtestz_pd_256:
28330 TestOpc = X86ISD::TESTP;
28331 [[fallthrough]];
28332 case Intrinsic::x86_sse41_ptestz:
28333 case Intrinsic::x86_avx_ptestz_256:
28334 // ZF = 1
28335 X86CC = X86::COND_E;
28336 break;
28337 case Intrinsic::x86_avx_vtestc_ps:
28338 case Intrinsic::x86_avx_vtestc_pd:
28339 case Intrinsic::x86_avx_vtestc_ps_256:
28340 case Intrinsic::x86_avx_vtestc_pd_256:
28341 TestOpc = X86ISD::TESTP;
28342 [[fallthrough]];
28343 case Intrinsic::x86_sse41_ptestc:
28344 case Intrinsic::x86_avx_ptestc_256:
28345 // CF = 1
28346 X86CC = X86::COND_B;
28347 break;
28348 case Intrinsic::x86_avx_vtestnzc_ps:
28349 case Intrinsic::x86_avx_vtestnzc_pd:
28350 case Intrinsic::x86_avx_vtestnzc_ps_256:
28351 case Intrinsic::x86_avx_vtestnzc_pd_256:
28352 TestOpc = X86ISD::TESTP;
28353 [[fallthrough]];
28354 case Intrinsic::x86_sse41_ptestnzc:
28355 case Intrinsic::x86_avx_ptestnzc_256:
28356 // ZF and CF = 0
28357 X86CC = X86::COND_A;
28358 break;
28359 }
28360
28361 SDValue LHS = Op.getOperand(1);
28362 SDValue RHS = Op.getOperand(2);
28363 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
28364 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
28365 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
28366 }
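// For example, a zero test such as _mm_testz_si128(a, b) (assumed builtin
// mapping) becomes PTEST(a, b) followed by setcc(COND_E) on ZF and a
// ZERO_EXTEND to i32, matching the table above.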
28367
28368 case Intrinsic::x86_sse42_pcmpistria128:
28369 case Intrinsic::x86_sse42_pcmpestria128:
28370 case Intrinsic::x86_sse42_pcmpistric128:
28371 case Intrinsic::x86_sse42_pcmpestric128:
28372 case Intrinsic::x86_sse42_pcmpistrio128:
28373 case Intrinsic::x86_sse42_pcmpestrio128:
28374 case Intrinsic::x86_sse42_pcmpistris128:
28375 case Intrinsic::x86_sse42_pcmpestris128:
28376 case Intrinsic::x86_sse42_pcmpistriz128:
28377 case Intrinsic::x86_sse42_pcmpestriz128: {
28378 unsigned Opcode;
28379 X86::CondCode X86CC;
28380 switch (IntNo) {
28381 default: llvm_unreachable("Impossible intrinsic")::llvm::llvm_unreachable_internal("Impossible intrinsic", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 28381)
; // Can't reach here.
28382 case Intrinsic::x86_sse42_pcmpistria128:
28383 Opcode = X86ISD::PCMPISTR;
28384 X86CC = X86::COND_A;
28385 break;
28386 case Intrinsic::x86_sse42_pcmpestria128:
28387 Opcode = X86ISD::PCMPESTR;
28388 X86CC = X86::COND_A;
28389 break;
28390 case Intrinsic::x86_sse42_pcmpistric128:
28391 Opcode = X86ISD::PCMPISTR;
28392 X86CC = X86::COND_B;
28393 break;
28394 case Intrinsic::x86_sse42_pcmpestric128:
28395 Opcode = X86ISD::PCMPESTR;
28396 X86CC = X86::COND_B;
28397 break;
28398 case Intrinsic::x86_sse42_pcmpistrio128:
28399 Opcode = X86ISD::PCMPISTR;
28400 X86CC = X86::COND_O;
28401 break;
28402 case Intrinsic::x86_sse42_pcmpestrio128:
28403 Opcode = X86ISD::PCMPESTR;
28404 X86CC = X86::COND_O;
28405 break;
28406 case Intrinsic::x86_sse42_pcmpistris128:
28407 Opcode = X86ISD::PCMPISTR;
28408 X86CC = X86::COND_S;
28409 break;
28410 case Intrinsic::x86_sse42_pcmpestris128:
28411 Opcode = X86ISD::PCMPESTR;
28412 X86CC = X86::COND_S;
28413 break;
28414 case Intrinsic::x86_sse42_pcmpistriz128:
28415 Opcode = X86ISD::PCMPISTR;
28416 X86CC = X86::COND_E;
28417 break;
28418 case Intrinsic::x86_sse42_pcmpestriz128:
28419 Opcode = X86ISD::PCMPESTR;
28420 X86CC = X86::COND_E;
28421 break;
28422 }
28423 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
28424 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
28425 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2);
28426 SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG);
28427 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
28428 }
28429
28430 case Intrinsic::x86_sse42_pcmpistri128:
28431 case Intrinsic::x86_sse42_pcmpestri128: {
28432 unsigned Opcode;
28433 if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
28434 Opcode = X86ISD::PCMPISTR;
28435 else
28436 Opcode = X86ISD::PCMPESTR;
28437
28438 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
28439 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
28440 return DAG.getNode(Opcode, dl, VTs, NewOps);
28441 }
28442
28443 case Intrinsic::x86_sse42_pcmpistrm128:
28444 case Intrinsic::x86_sse42_pcmpestrm128: {
28445 unsigned Opcode;
28446 if (IntNo == Intrinsic::x86_sse42_pcmpistrm128)
28447 Opcode = X86ISD::PCMPISTR;
28448 else
28449 Opcode = X86ISD::PCMPESTR;
28450
28451 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
28452 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
28453 return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1);
28454 }
28455
28456 case Intrinsic::eh_sjlj_lsda: {
28457 MachineFunction &MF = DAG.getMachineFunction();
28458 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28459 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
28460 auto &Context = MF.getMMI().getContext();
28461 MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
28462 Twine(MF.getFunctionNumber()));
28463 return DAG.getNode(getGlobalWrapperKind(), dl, VT,
28464 DAG.getMCSymbol(S, PtrVT));
28465 }
28466
28467 case Intrinsic::x86_seh_lsda: {
28468 // Compute the symbol for the LSDA. We know it'll get emitted later.
28469 MachineFunction &MF = DAG.getMachineFunction();
28470 SDValue Op1 = Op.getOperand(1);
28471 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
28472 MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
28473 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
28474
28475 // Generate a simple absolute symbol reference. This intrinsic is only
28476 // supported on 32-bit Windows, which isn't PIC.
28477 SDValue Result = DAG.getMCSymbol(LSDASym, VT);
28478 return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
28479 }
28480
28481 case Intrinsic::eh_recoverfp: {
28482 SDValue FnOp = Op.getOperand(1);
28483 SDValue IncomingFPOp = Op.getOperand(2);
28484 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
28485 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
28486 if (!Fn)
28487 report_fatal_error(
28488 "llvm.eh.recoverfp must take a function as the first argument");
28489 return recoverFramePointer(DAG, Fn, IncomingFPOp);
28490 }
28491
28492 case Intrinsic::localaddress: {
28493 // Returns one of the stack, base, or frame pointer registers, depending on
28494 // which is used to reference local variables.
28495 MachineFunction &MF = DAG.getMachineFunction();
28496 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28497 unsigned Reg;
28498 if (RegInfo->hasBasePointer(MF))
28499 Reg = RegInfo->getBaseRegister();
28500 else { // Handles the SP or FP case.
28501 bool CantUseFP = RegInfo->hasStackRealignment(MF);
28502 if (CantUseFP)
28503 Reg = RegInfo->getPtrSizedStackRegister(MF);
28504 else
28505 Reg = RegInfo->getPtrSizedFrameRegister(MF);
28506 }
28507 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
28508 }
28509 case Intrinsic::x86_avx512_vp2intersect_q_512:
28510 case Intrinsic::x86_avx512_vp2intersect_q_256:
28511 case Intrinsic::x86_avx512_vp2intersect_q_128:
28512 case Intrinsic::x86_avx512_vp2intersect_d_512:
28513 case Intrinsic::x86_avx512_vp2intersect_d_256:
28514 case Intrinsic::x86_avx512_vp2intersect_d_128: {
28515 MVT MaskVT = Op.getSimpleValueType();
28516
28517 SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
28518 SDLoc DL(Op);
28519
28520 SDValue Operation =
28521 DAG.getNode(X86ISD::VP2INTERSECT, DL, VTs,
28522 Op->getOperand(1), Op->getOperand(2));
28523
28524 SDValue Result0 = DAG.getTargetExtractSubreg(X86::sub_mask_0, DL,
28525 MaskVT, Operation);
28526 SDValue Result1 = DAG.getTargetExtractSubreg(X86::sub_mask_1, DL,
28527 MaskVT, Operation);
28528 return DAG.getMergeValues({Result0, Result1}, DL);
28529 }
28530 case Intrinsic::x86_mmx_pslli_w:
28531 case Intrinsic::x86_mmx_pslli_d:
28532 case Intrinsic::x86_mmx_pslli_q:
28533 case Intrinsic::x86_mmx_psrli_w:
28534 case Intrinsic::x86_mmx_psrli_d:
28535 case Intrinsic::x86_mmx_psrli_q:
28536 case Intrinsic::x86_mmx_psrai_w:
28537 case Intrinsic::x86_mmx_psrai_d: {
28538 SDLoc DL(Op);
28539 SDValue ShAmt = Op.getOperand(2);
28540 // If the argument is a constant, convert it to a target constant.
28541 if (auto *C = dyn_cast<ConstantSDNode>(ShAmt)) {
28542 // Clamp out-of-bounds shift amounts since they will otherwise be masked
28543 // to 8 bits, which may make them no longer out of bounds.
28544 unsigned ShiftAmount = C->getAPIntValue().getLimitedValue(255);
28545 if (ShiftAmount == 0)
28546 return Op.getOperand(1);
28547
28548 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
28549 Op.getOperand(0), Op.getOperand(1),
28550 DAG.getTargetConstant(ShiftAmount, DL, MVT::i32));
28551 }
28552
28553 unsigned NewIntrinsic;
28554 switch (IntNo) {
28555 default: llvm_unreachable("Impossible intrinsic")::llvm::llvm_unreachable_internal("Impossible intrinsic", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 28555)
; // Can't reach here.
28556 case Intrinsic::x86_mmx_pslli_w:
28557 NewIntrinsic = Intrinsic::x86_mmx_psll_w;
28558 break;
28559 case Intrinsic::x86_mmx_pslli_d:
28560 NewIntrinsic = Intrinsic::x86_mmx_psll_d;
28561 break;
28562 case Intrinsic::x86_mmx_pslli_q:
28563 NewIntrinsic = Intrinsic::x86_mmx_psll_q;
28564 break;
28565 case Intrinsic::x86_mmx_psrli_w:
28566 NewIntrinsic = Intrinsic::x86_mmx_psrl_w;
28567 break;
28568 case Intrinsic::x86_mmx_psrli_d:
28569 NewIntrinsic = Intrinsic::x86_mmx_psrl_d;
28570 break;
28571 case Intrinsic::x86_mmx_psrli_q:
28572 NewIntrinsic = Intrinsic::x86_mmx_psrl_q;
28573 break;
28574 case Intrinsic::x86_mmx_psrai_w:
28575 NewIntrinsic = Intrinsic::x86_mmx_psra_w;
28576 break;
28577 case Intrinsic::x86_mmx_psrai_d:
28578 NewIntrinsic = Intrinsic::x86_mmx_psra_d;
28579 break;
28580 }
28581
28582 // The vector shift intrinsics with scalar amounts use 32-bit shift amounts,
28583 // but the sse2/mmx shift instructions read 64 bits. Copy the 32 bits to an
28584 // MMX register.
28585 ShAmt = DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, ShAmt);
28586 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
28587 DAG.getTargetConstant(NewIntrinsic, DL,
28588 getPointerTy(DAG.getDataLayout())),
28589 Op.getOperand(1), ShAmt);
28590 }
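// Sketch of the two MMX paths above (builtin name is an assumed mapping):
//   __m64 a = _mm_slli_pi16(m, 3);  // constant: clamped to <= 255 and kept as
//                                   // an immediate intrinsic (0 just returns m)
//   __m64 b = _mm_slli_pi16(m, n);  // variable: n is moved into an MMX register
//                                   // and the non-immediate psll_w form is used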
28591 case Intrinsic::thread_pointer: {
28592 if (Subtarget.isTargetELF()) {
28593 SDLoc dl(Op);
28594 EVT PtrVT = getPointerTy(DAG.getDataLayout());
28595 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
28596 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(
28597 *DAG.getContext(), Subtarget.is64Bit() ? X86AS::FS : X86AS::GS));
28598 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
28599 DAG.getIntPtrConstant(0, dl), MachinePointerInfo(Ptr));
28600 }
28601 report_fatal_error(
28602 "Target OS doesn't support __builtin_thread_pointer() yet.");
28603 }
28604 }
28605}
28606
28607static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
28608 SDValue Src, SDValue Mask, SDValue Base,
28609 SDValue Index, SDValue ScaleOp, SDValue Chain,
28610 const X86Subtarget &Subtarget) {
28611 SDLoc dl(Op);
28612 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
28613 // Scale must be constant.
28614 if (!C)
28615 return SDValue();
28616 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28617 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
28618 TLI.getPointerTy(DAG.getDataLayout()));
28619 EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger();
28620 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
28621 // If source is undef or we know it won't be used, use a zero vector
28622 // to break register dependency.
28623 // TODO: use undef instead and let BreakFalseDeps deal with it?
28624 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
28625 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
28626
28627 // Cast mask to an integer type.
28628 Mask = DAG.getBitcast(MaskVT, Mask);
28629
28630 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
28631
28632 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
28633 SDValue Res =
28634 DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
28635 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
28636 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
28637}
28638
28639static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
28640 SDValue Src, SDValue Mask, SDValue Base,
28641 SDValue Index, SDValue ScaleOp, SDValue Chain,
28642 const X86Subtarget &Subtarget) {
28643 MVT VT = Op.getSimpleValueType();
28644 SDLoc dl(Op);
28645 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
28646 // Scale must be constant.
28647 if (!C)
28648 return SDValue();
28649 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28650 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
28651 TLI.getPointerTy(DAG.getDataLayout()));
28652 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
28653 VT.getVectorNumElements());
28654 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
28655
28656 // We support two versions of the gather intrinsics. One with scalar mask and
28657 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
28658 if (Mask.getValueType() != MaskVT)
28659 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28660
28661 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
28662 // If source is undef or we know it won't be used, use a zero vector
28663 // to break register dependency.
28664 // TODO: use undef instead and let BreakFalseDeps deal with it?
28665 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
28666 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
28667
28668 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
28669
28670 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
28671 SDValue Res =
28672 DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
28673 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
28674 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
28675}
28676
28677static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
28678 SDValue Src, SDValue Mask, SDValue Base,
28679 SDValue Index, SDValue ScaleOp, SDValue Chain,
28680 const X86Subtarget &Subtarget) {
28681 SDLoc dl(Op);
28682 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
28683 // Scale must be constant.
28684 if (!C)
28685 return SDValue();
28686 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28687 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
28688 TLI.getPointerTy(DAG.getDataLayout()));
28689 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
28690 Src.getSimpleValueType().getVectorNumElements());
28691 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
28692
28693 // We support two versions of the scatter intrinsics. One with scalar mask and
28694 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
28695 if (Mask.getValueType() != MaskVT)
28696 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28697
28698 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
28699
28700 SDVTList VTs = DAG.getVTList(MVT::Other);
28701 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
28702 SDValue Res =
28703 DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
28704 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
28705 return Res;
28706}
28707
28708static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
28709 SDValue Mask, SDValue Base, SDValue Index,
28710 SDValue ScaleOp, SDValue Chain,
28711 const X86Subtarget &Subtarget) {
28712 SDLoc dl(Op);
28713 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
28714 // Scale must be constant.
28715 if (!C)
28716 return SDValue();
28717 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28718 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
28719 TLI.getPointerTy(DAG.getDataLayout()));
28720 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
28721 SDValue Segment = DAG.getRegister(0, MVT::i32);
28722 MVT MaskVT =
28723 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
28724 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28725 SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
28726 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
28727 return SDValue(Res, 0);
28728}
28729
28730/// Handles the lowering of builtin intrinsics with chain that return their
28731/// value into registers EDX:EAX.
28732 /// If operand SrcReg is a valid register identifier, then operand 2 of N is
28733/// copied to SrcReg. The assumption is that SrcReg is an implicit input to
28734/// TargetOpcode.
28735/// Returns a Glue value which can be used to add extra copy-from-reg if the
28736 /// expanded intrinsic implicitly defines extra registers (i.e. not just
28737/// EDX:EAX).
28738static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL,
28739 SelectionDAG &DAG,
28740 unsigned TargetOpcode,
28741 unsigned SrcReg,
28742 const X86Subtarget &Subtarget,
28743 SmallVectorImpl<SDValue> &Results) {
28744 SDValue Chain = N->getOperand(0);
28745 SDValue Glue;
28746
28747 if (SrcReg) {
28748 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
28749 Chain = DAG.getCopyToReg(Chain, DL, SrcReg, N->getOperand(2), Glue);
28750 Glue = Chain.getValue(1);
28751 }
28752
28753 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
28754 SDValue N1Ops[] = {Chain, Glue};
28755 SDNode *N1 = DAG.getMachineNode(
28756 TargetOpcode, DL, Tys, ArrayRef<SDValue>(N1Ops, Glue.getNode() ? 2 : 1));
28757 Chain = SDValue(N1, 0);
28758
28759 // The expanded instruction returns its result in registers EDX:EAX.
28760 SDValue LO, HI;
28761 if (Subtarget.is64Bit()) {
28762 LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
28763 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
28764 LO.getValue(2));
28765 } else {
28766 LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
28767 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
28768 LO.getValue(2));
28769 }
28770 Chain = HI.getValue(1);
28771 Glue = HI.getValue(2);
28772
28773 if (Subtarget.is64Bit()) {
28774 // Merge the two 32-bit values into a 64-bit one.
28775 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
28776 DAG.getConstant(32, DL, MVT::i8));
28777 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
28778 Results.push_back(Chain);
28779 return Glue;
28780 }
28781
28782 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
28783 SDValue Ops[] = { LO, HI };
28784 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
28785 Results.push_back(Pair);
28786 Results.push_back(Chain);
28787 return Glue;
28788}
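// On 64-bit targets the helper above effectively computes
//   Result64 = LO | (HI << 32)
// from RAX/RDX, while 32-bit targets build an i64 BUILD_PAIR from EAX/EDX;
// this is just a restatement of the two branches in the code.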
28789
28790/// Handles the lowering of builtin intrinsics that read the time stamp counter
28791/// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
28792/// READCYCLECOUNTER nodes.
28793static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
28794 SelectionDAG &DAG,
28795 const X86Subtarget &Subtarget,
28796 SmallVectorImpl<SDValue> &Results) {
28797 // The processor's time-stamp counter (a 64-bit MSR) is stored into the
28798 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
28799 // and the EAX register is loaded with the low-order 32 bits.
28800 SDValue Glue = expandIntrinsicWChainHelper(N, DL, DAG, Opcode,
28801 /* NoRegister */0, Subtarget,
28802 Results);
28803 if (Opcode != X86::RDTSCP)
28804 return;
28805
28806 SDValue Chain = Results[1];
28807 // Instruction RDTSCP loads the IA32_TSC_AUX MSR (address C000_0103H) into
28808 // the ECX register. Add 'ecx' explicitly to the chain.
28809 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, Glue);
28810 Results[1] = ecx;
28811 Results.push_back(ecx.getValue(1));
28812}
28813
28814static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
28815 SelectionDAG &DAG) {
28816 SmallVector<SDValue, 3> Results;
28817 SDLoc DL(Op);
28818 getReadTimeStampCounter(Op.getNode(), DL, X86::RDTSC, DAG, Subtarget,
28819 Results);
28820 return DAG.getMergeValues(Results, DL);
28821}
28822
28823static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
28824 MachineFunction &MF = DAG.getMachineFunction();
28825 SDValue Chain = Op.getOperand(0);
28826 SDValue RegNode = Op.getOperand(2);
28827 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
28828 if (!EHInfo)
28829 report_fatal_error("EH registrations only live in functions using WinEH");
28830
28831 // Cast the operand to an alloca, and remember the frame index.
28832 auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
28833 if (!FINode)
28834 report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
28835 EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
28836
28837 // Return the chain operand without making any DAG nodes.
28838 return Chain;
28839}
28840
28841static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
28842 MachineFunction &MF = DAG.getMachineFunction();
28843 SDValue Chain = Op.getOperand(0);
28844 SDValue EHGuard = Op.getOperand(2);
28845 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
28846 if (!EHInfo)
28847 report_fatal_error("EHGuard only live in functions using WinEH");
28848
28849 // Cast the operand to an alloca, and remember the frame index.
28850 auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
28851 if (!FINode)
28852 report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
28853 EHInfo->EHGuardFrameIndex = FINode->getIndex();
28854
28855 // Return the chain operand without making any DAG nodes.
28856 return Chain;
28857}
28858
28859/// Emit Truncating Store with signed or unsigned saturation.
28860static SDValue
28861EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
28862 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
28863 SelectionDAG &DAG) {
28864 SDVTList VTs = DAG.getVTList(MVT::Other);
28865 SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
28866 SDValue Ops[] = { Chain, Val, Ptr, Undef };
28867 unsigned Opc = SignedSat ? X86ISD::VTRUNCSTORES : X86ISD::VTRUNCSTOREUS;
28868 return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO);
28869}
28870
28871/// Emit Masked Truncating Store with signed or unsigned saturation.
28872static SDValue
28873EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
28874 SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
28875 MachineMemOperand *MMO, SelectionDAG &DAG) {
28876 SDVTList VTs = DAG.getVTList(MVT::Other);
28877 SDValue Ops[] = { Chain, Val, Ptr, Mask };
28878 unsigned Opc = SignedSat ? X86ISD::VMTRUNCSTORES : X86ISD::VMTRUNCSTOREUS;
28879 return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO);
28880}
28881
28882static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
28883 SelectionDAG &DAG) {
28884 unsigned IntNo = Op.getConstantOperandVal(1);
28885 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
28886 if (!IntrData) {
28887 switch (IntNo) {
28888
28889 case Intrinsic::swift_async_context_addr: {
28890 SDLoc dl(Op);
28891 auto &MF = DAG.getMachineFunction();
28892 auto X86FI = MF.getInfo<X86MachineFunctionInfo>();
28893 if (Subtarget.is64Bit()) {
28894 MF.getFrameInfo().setFrameAddressIsTaken(true);
28895 X86FI->setHasSwiftAsyncContext(true);
28896 SDValue Chain = Op->getOperand(0);
28897 SDValue CopyRBP = DAG.getCopyFromReg(Chain, dl, X86::RBP, MVT::i64);
28898 SDValue Result =
28899 SDValue(DAG.getMachineNode(X86::SUB64ri8, dl, MVT::i64, CopyRBP,
28900 DAG.getTargetConstant(8, dl, MVT::i32)),
28901 0);
28902 // Return { result, chain }.
28903 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
28904 CopyRBP.getValue(1));
28905 } else {
28906 // 32-bit, so there is no special extended frame; create or reuse an
28907 // existing stack slot.
28908 if (!X86FI->getSwiftAsyncContextFrameIdx())
28909 X86FI->setSwiftAsyncContextFrameIdx(
28910 MF.getFrameInfo().CreateStackObject(4, Align(4), false));
28911 SDValue Result =
28912 DAG.getFrameIndex(*X86FI->getSwiftAsyncContextFrameIdx(), MVT::i32);
28913 // Return { result, chain }.
28914 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
28915 Op->getOperand(0));
28916 }
28917 }
28918
28919 case llvm::Intrinsic::x86_seh_ehregnode:
28920 return MarkEHRegistrationNode(Op, DAG);
28921 case llvm::Intrinsic::x86_seh_ehguard:
28922 return MarkEHGuard(Op, DAG);
28923 case llvm::Intrinsic::x86_rdpkru: {
28924 SDLoc dl(Op);
28925 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
28926 // Create a RDPKRU node and pass 0 to the ECX parameter.
28927 return DAG.getNode(X86ISD::RDPKRU, dl, VTs, Op.getOperand(0),
28928 DAG.getConstant(0, dl, MVT::i32));
28929 }
28930 case llvm::Intrinsic::x86_wrpkru: {
28931 SDLoc dl(Op);
28932 // Create a WRPKRU node, pass the input to the EAX parameter, and pass 0
28933 // to the EDX and ECX parameters.
28934 return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other,
28935 Op.getOperand(0), Op.getOperand(2),
28936 DAG.getConstant(0, dl, MVT::i32),
28937 DAG.getConstant(0, dl, MVT::i32));
28938 }
28939 case llvm::Intrinsic::asan_check_memaccess: {
28940 // Mark this as adjustsStack because it will be lowered to a call.
28941 DAG.getMachineFunction().getFrameInfo().setAdjustsStack(true);
28942 // Don't do anything here, we will expand these intrinsics out later.
28943 return Op;
28944 }
28945 case llvm::Intrinsic::x86_flags_read_u32:
28946 case llvm::Intrinsic::x86_flags_read_u64:
28947 case llvm::Intrinsic::x86_flags_write_u32:
28948 case llvm::Intrinsic::x86_flags_write_u64: {
28949 // We need a frame pointer because this will get lowered to a PUSH/POP
28950 // sequence.
28951 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
28952 MFI.setHasCopyImplyingStackAdjustment(true);
28953 // Don't do anything here, we will expand these intrinsics out later
28954 // during FinalizeISel in EmitInstrWithCustomInserter.
28955 return Op;
28956 }
28957 case Intrinsic::x86_lwpins32:
28958 case Intrinsic::x86_lwpins64:
28959 case Intrinsic::x86_umwait:
28960 case Intrinsic::x86_tpause: {
28961 SDLoc dl(Op);
28962 SDValue Chain = Op->getOperand(0);
28963 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
28964 unsigned Opcode;
28965
28966 switch (IntNo) {
28967 default: llvm_unreachable("Impossible intrinsic")::llvm::llvm_unreachable_internal("Impossible intrinsic", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 28967)
;
28968 case Intrinsic::x86_umwait:
28969 Opcode = X86ISD::UMWAIT;
28970 break;
28971 case Intrinsic::x86_tpause:
28972 Opcode = X86ISD::TPAUSE;
28973 break;
28974 case Intrinsic::x86_lwpins32:
28975 case Intrinsic::x86_lwpins64:
28976 Opcode = X86ISD::LWPINS;
28977 break;
28978 }
28979
28980 SDValue Operation =
28981 DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
28982 Op->getOperand(3), Op->getOperand(4));
28983 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
28984 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
28985 Operation.getValue(1));
28986 }
28987 case Intrinsic::x86_enqcmd:
28988 case Intrinsic::x86_enqcmds: {
28989 SDLoc dl(Op);
28990 SDValue Chain = Op.getOperand(0);
28991 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
28992 unsigned Opcode;
28993 switch (IntNo) {
28994 default: llvm_unreachable("Impossible intrinsic!")::llvm::llvm_unreachable_internal("Impossible intrinsic!", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 28994)
;
28995 case Intrinsic::x86_enqcmd:
28996 Opcode = X86ISD::ENQCMD;
28997 break;
28998 case Intrinsic::x86_enqcmds:
28999 Opcode = X86ISD::ENQCMDS;
29000 break;
29001 }
29002 SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op.getOperand(2),
29003 Op.getOperand(3));
29004 SDValue SetCC = getSETCC(X86::COND_E, Operation.getValue(0), dl, DAG);
29005 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
29006 Operation.getValue(1));
29007 }
29008 case Intrinsic::x86_aesenc128kl:
29009 case Intrinsic::x86_aesdec128kl:
29010 case Intrinsic::x86_aesenc256kl:
29011 case Intrinsic::x86_aesdec256kl: {
29012 SDLoc DL(Op);
29013 SDVTList VTs = DAG.getVTList(MVT::v2i64, MVT::i32, MVT::Other);
29014 SDValue Chain = Op.getOperand(0);
29015 unsigned Opcode;
29016
29017 switch (IntNo) {
29018 default: llvm_unreachable("Impossible intrinsic")::llvm::llvm_unreachable_internal("Impossible intrinsic", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 29018)
;
29019 case Intrinsic::x86_aesenc128kl:
29020 Opcode = X86ISD::AESENC128KL;
29021 break;
29022 case Intrinsic::x86_aesdec128kl:
29023 Opcode = X86ISD::AESDEC128KL;
29024 break;
29025 case Intrinsic::x86_aesenc256kl:
29026 Opcode = X86ISD::AESENC256KL;
29027 break;
29028 case Intrinsic::x86_aesdec256kl:
29029 Opcode = X86ISD::AESDEC256KL;
29030 break;
29031 }
29032
29033 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
29034 MachineMemOperand *MMO = MemIntr->getMemOperand();
29035 EVT MemVT = MemIntr->getMemoryVT();
29036 SDValue Operation = DAG.getMemIntrinsicNode(
29037 Opcode, DL, VTs, {Chain, Op.getOperand(2), Op.getOperand(3)}, MemVT,
29038 MMO);
29039 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(1), DL, DAG);
29040
29041 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
29042 {ZF, Operation.getValue(0), Operation.getValue(2)});
29043 }
29044 case Intrinsic::x86_aesencwide128kl:
29045 case Intrinsic::x86_aesdecwide128kl:
29046 case Intrinsic::x86_aesencwide256kl:
29047 case Intrinsic::x86_aesdecwide256kl: {
29048 SDLoc DL(Op);
29049 SDVTList VTs = DAG.getVTList(
29050 {MVT::i32, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64,
29051 MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::Other});
29052 SDValue Chain = Op.getOperand(0);
29053 unsigned Opcode;
29054
29055 switch (IntNo) {
29056 default: llvm_unreachable("Impossible intrinsic")::llvm::llvm_unreachable_internal("Impossible intrinsic", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 29056)
;
29057 case Intrinsic::x86_aesencwide128kl:
29058 Opcode = X86ISD::AESENCWIDE128KL;
29059 break;
29060 case Intrinsic::x86_aesdecwide128kl:
29061 Opcode = X86ISD::AESDECWIDE128KL;
29062 break;
29063 case Intrinsic::x86_aesencwide256kl:
29064 Opcode = X86ISD::AESENCWIDE256KL;
29065 break;
29066 case Intrinsic::x86_aesdecwide256kl:
29067 Opcode = X86ISD::AESDECWIDE256KL;
29068 break;
29069 }
29070
29071 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
29072 MachineMemOperand *MMO = MemIntr->getMemOperand();
29073 EVT MemVT = MemIntr->getMemoryVT();
29074 SDValue Operation = DAG.getMemIntrinsicNode(
29075 Opcode, DL, VTs,
29076 {Chain, Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
29077 Op.getOperand(5), Op.getOperand(6), Op.getOperand(7),
29078 Op.getOperand(8), Op.getOperand(9), Op.getOperand(10)},
29079 MemVT, MMO);
29080 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(0), DL, DAG);
29081
29082 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
29083 {ZF, Operation.getValue(1), Operation.getValue(2),
29084 Operation.getValue(3), Operation.getValue(4),
29085 Operation.getValue(5), Operation.getValue(6),
29086 Operation.getValue(7), Operation.getValue(8),
29087 Operation.getValue(9)});
29088 }
29089 case Intrinsic::x86_testui: {
29090 SDLoc dl(Op);
29091 SDValue Chain = Op.getOperand(0);
29092 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
29093 SDValue Operation = DAG.getNode(X86ISD::TESTUI, dl, VTs, Chain);
29094 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
29095 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
29096 Operation.getValue(1));
29097 }
29098 case Intrinsic::x86_atomic_bts_rm:
29099 case Intrinsic::x86_atomic_btc_rm:
29100 case Intrinsic::x86_atomic_btr_rm: {
29101 SDLoc DL(Op);
29102 MVT VT = Op.getSimpleValueType();
29103 SDValue Chain = Op.getOperand(0);
29104 SDValue Op1 = Op.getOperand(2);
29105 SDValue Op2 = Op.getOperand(3);
29106 unsigned Opc = IntNo == Intrinsic::x86_atomic_bts_rm ? X86ISD::LBTS_RM
29107 : IntNo == Intrinsic::x86_atomic_btc_rm ? X86ISD::LBTC_RM
29108 : X86ISD::LBTR_RM;
29109 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
29110 SDValue Res =
29111 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
29112 {Chain, Op1, Op2}, VT, MMO);
29113 Chain = Res.getValue(1);
29114 Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
29115 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
29116 }
29117 case Intrinsic::x86_atomic_bts:
29118 case Intrinsic::x86_atomic_btc:
29119 case Intrinsic::x86_atomic_btr: {
29120 SDLoc DL(Op);
29121 MVT VT = Op.getSimpleValueType();
29122 SDValue Chain = Op.getOperand(0);
29123 SDValue Op1 = Op.getOperand(2);
29124 SDValue Op2 = Op.getOperand(3);
29125 unsigned Opc = IntNo == Intrinsic::x86_atomic_bts ? X86ISD::LBTS
29126 : IntNo == Intrinsic::x86_atomic_btc ? X86ISD::LBTC
29127 : X86ISD::LBTR;
29128 SDValue Size = DAG.getConstant(VT.getScalarSizeInBits(), DL, MVT::i32);
29129 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
29130 SDValue Res =
29131 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
29132 {Chain, Op1, Op2, Size}, VT, MMO);
29133 Chain = Res.getValue(1);
29134 Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
29135 unsigned Imm = cast<ConstantSDNode>(Op2)->getZExtValue();
29136 if (Imm)
29137 Res = DAG.getNode(ISD::SHL, DL, VT, Res,
29138 DAG.getShiftAmountConstant(Imm, VT, DL));
29139 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
29140 }
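// Sketch of the result shaping above: the locked bit-test returns the old bit
// in CF, so for a constant bit index Imm the final value is roughly
//   Res = (CF ? 1 : 0) << Imm;
// which places the tested bit back at its original position within VT.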
29141 case Intrinsic::x86_cmpccxadd32:
29142 case Intrinsic::x86_cmpccxadd64: {
29143 SDLoc DL(Op);
29144 SDValue Chain = Op.getOperand(0);
29145 SDValue Addr = Op.getOperand(2);
29146 SDValue Src1 = Op.getOperand(3);
29147 SDValue Src2 = Op.getOperand(4);
29148 SDValue CC = Op.getOperand(5);
29149 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
29150 SDValue Operation = DAG.getMemIntrinsicNode(
29151 X86ISD::CMPCCXADD, DL, Op->getVTList(), {Chain, Addr, Src1, Src2, CC},
29152 MVT::i32, MMO);
29153 return Operation;
29154 }
29155 case Intrinsic::x86_aadd32:
29156 case Intrinsic::x86_aadd64:
29157 case Intrinsic::x86_aand32:
29158 case Intrinsic::x86_aand64:
29159 case Intrinsic::x86_aor32:
29160 case Intrinsic::x86_aor64:
29161 case Intrinsic::x86_axor32:
29162 case Intrinsic::x86_axor64: {
29163 SDLoc DL(Op);
29164 SDValue Chain = Op.getOperand(0);
29165 SDValue Op1 = Op.getOperand(2);
29166 SDValue Op2 = Op.getOperand(3);
29167 MVT VT = Op2.getSimpleValueType();
29168 unsigned Opc = 0;
29169 switch (IntNo) {
29170 default:
29171 llvm_unreachable("Unknown Intrinsic")::llvm::llvm_unreachable_internal("Unknown Intrinsic", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 29171)
;
29172 case Intrinsic::x86_aadd32:
29173 case Intrinsic::x86_aadd64:
29174 Opc = X86ISD::AADD;
29175 break;
29176 case Intrinsic::x86_aand32:
29177 case Intrinsic::x86_aand64:
29178 Opc = X86ISD::AAND;
29179 break;
29180 case Intrinsic::x86_aor32:
29181 case Intrinsic::x86_aor64:
29182 Opc = X86ISD::AOR;
29183 break;
29184 case Intrinsic::x86_axor32:
29185 case Intrinsic::x86_axor64:
29186 Opc = X86ISD::AXOR;
29187 break;
29188 }
29189 MachineMemOperand *MMO = cast<MemSDNode>(Op)->getMemOperand();
29190 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(),
29191 {Chain, Op1, Op2}, VT, MMO);
29192 }
29193 case Intrinsic::x86_atomic_add_cc:
29194 case Intrinsic::x86_atomic_sub_cc:
29195 case Intrinsic::x86_atomic_or_cc:
29196 case Intrinsic::x86_atomic_and_cc:
29197 case Intrinsic::x86_atomic_xor_cc: {
29198 SDLoc DL(Op);
29199 SDValue Chain = Op.getOperand(0);
29200 SDValue Op1 = Op.getOperand(2);
29201 SDValue Op2 = Op.getOperand(3);
29202 X86::CondCode CC = (X86::CondCode)Op.getConstantOperandVal(4);
29203 MVT VT = Op2.getSimpleValueType();
29204 unsigned Opc = 0;
29205 switch (IntNo) {
29206 default:
29207 llvm_unreachable("Unknown Intrinsic")::llvm::llvm_unreachable_internal("Unknown Intrinsic", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 29207)
;
29208 case Intrinsic::x86_atomic_add_cc:
29209 Opc = X86ISD::LADD;
29210 break;
29211 case Intrinsic::x86_atomic_sub_cc:
29212 Opc = X86ISD::LSUB;
29213 break;
29214 case Intrinsic::x86_atomic_or_cc:
29215 Opc = X86ISD::LOR;
29216 break;
29217 case Intrinsic::x86_atomic_and_cc:
29218 Opc = X86ISD::LAND;
29219 break;
29220 case Intrinsic::x86_atomic_xor_cc:
29221 Opc = X86ISD::LXOR;
29222 break;
29223 }
29224 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
29225 SDValue LockArith =
29226 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
29227 {Chain, Op1, Op2}, VT, MMO);
29228 Chain = LockArith.getValue(1);
29229 return DAG.getMergeValues({getSETCC(CC, LockArith, DL, DAG), Chain}, DL);
29230 }
29231 }
29232 return SDValue();
29233 }
29234
29235 SDLoc dl(Op);
29236 switch(IntrData->Type) {
29237 default: llvm_unreachable("Unknown Intrinsic Type")::llvm::llvm_unreachable_internal("Unknown Intrinsic Type", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 29237)
;
29238 case RDSEED:
29239 case RDRAND: {
29240 // Emit the node with the right value type.
29241 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
29242 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
29243
29244 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
29245 // Otherwise return the value from Rand, which is always 0, cast to i32.
29246 SDValue Ops[] = {DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
29247 DAG.getConstant(1, dl, Op->getValueType(1)),
29248 DAG.getTargetConstant(X86::COND_B, dl, MVT::i8),
29249 SDValue(Result.getNode(), 1)};
29250 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);
29251
29252 // Return { result, isValid, chain }.
29253 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
29254 SDValue(Result.getNode(), 2));
29255 }
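// Rough shape of the "isValid" computation above, e.g. for _rdrand32_step
// (assumed builtin mapping): the CMOV yields 1 when CF=1 (the hardware
// reported a valid random value) and otherwise falls back to the returned
// value, which is 0 on failure.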
29256 case GATHER_AVX2: {
29257 SDValue Chain = Op.getOperand(0);
29258 SDValue Src = Op.getOperand(2);
29259 SDValue Base = Op.getOperand(3);
29260 SDValue Index = Op.getOperand(4);
29261 SDValue Mask = Op.getOperand(5);
29262 SDValue Scale = Op.getOperand(6);
29263 return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
29264 Scale, Chain, Subtarget);
29265 }
29266 case GATHER: {
29267 //gather(v1, mask, index, base, scale);
29268 SDValue Chain = Op.getOperand(0);
29269 SDValue Src = Op.getOperand(2);
29270 SDValue Base = Op.getOperand(3);
29271 SDValue Index = Op.getOperand(4);
29272 SDValue Mask = Op.getOperand(5);
29273 SDValue Scale = Op.getOperand(6);
29274 return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale,
29275 Chain, Subtarget);
29276 }
29277 case SCATTER: {
29278 //scatter(base, mask, index, v1, scale);
29279 SDValue Chain = Op.getOperand(0);
29280 SDValue Base = Op.getOperand(2);
29281 SDValue Mask = Op.getOperand(3);
29282 SDValue Index = Op.getOperand(4);
29283 SDValue Src = Op.getOperand(5);
29284 SDValue Scale = Op.getOperand(6);
29285 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
29286 Scale, Chain, Subtarget);
29287 }
29288 case PREFETCH: {
29289 const APInt &HintVal = Op.getConstantOperandAPInt(6);
29290 assert((HintVal == 2 || HintVal == 3) &&
29291 "Wrong prefetch hint in intrinsic: should be 2 or 3");
29292 unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
29293 SDValue Chain = Op.getOperand(0);
29294 SDValue Mask = Op.getOperand(2);
29295 SDValue Index = Op.getOperand(3);
29296 SDValue Base = Op.getOperand(4);
29297 SDValue Scale = Op.getOperand(5);
29298 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
29299 Subtarget);
29300 }
29301 // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
29302 case RDTSC: {
29303 SmallVector<SDValue, 2> Results;
29304 getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
29305 Results);
29306 return DAG.getMergeValues(Results, dl);
29307 }
29308 // Read Performance Monitoring Counters.
29309 case RDPMC:
29310 // Read Processor Register.
29311 case RDPRU:
29312 // Get Extended Control Register.
29313 case XGETBV: {
29314 SmallVector<SDValue, 2> Results;
29315
29316 // RDPMC uses ECX to select the index of the performance counter to read.
29317 // RDPRU uses ECX to select the processor register to read.
29318 // XGETBV uses ECX to select the index of the XCR register to return.
29319 // The result is stored into registers EDX:EAX.
29320 expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX,
29321 Subtarget, Results);
29322 return DAG.getMergeValues(Results, dl);
29323 }
29324 // XTEST intrinsics.
29325 case XTEST: {
29326 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
29327 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
29328
29329 SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
29330 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
29331 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
29332 Ret, SDValue(InTrans.getNode(), 1));
29333 }
29334 case TRUNCATE_TO_MEM_VI8:
29335 case TRUNCATE_TO_MEM_VI16:
29336 case TRUNCATE_TO_MEM_VI32: {
29337 SDValue Mask = Op.getOperand(4);
29338 SDValue DataToTruncate = Op.getOperand(3);
29339 SDValue Addr = Op.getOperand(2);
29340 SDValue Chain = Op.getOperand(0);
29341
29342 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
29343 assert(MemIntr && "Expected MemIntrinsicSDNode!");
29344
29345 EVT MemVT = MemIntr->getMemoryVT();
29346
29347 uint16_t TruncationOp = IntrData->Opc0;
29348 switch (TruncationOp) {
29349 case X86ISD::VTRUNC: {
29350 if (isAllOnesConstant(Mask)) // return just a truncate store
29351 return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
29352 MemIntr->getMemOperand());
29353
29354 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
29355 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
29356 SDValue Offset = DAG.getUNDEF(VMask.getValueType());
29357
29358 return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, Offset, VMask,
29359 MemVT, MemIntr->getMemOperand(), ISD::UNINDEXED,
29360 true /* truncating */);
29361 }
29362 case X86ISD::VTRUNCUS:
29363 case X86ISD::VTRUNCS: {
29364 bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
29365 if (isAllOnesConstant(Mask))
29366 return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
29367 MemIntr->getMemOperand(), DAG);
29368
29369 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
29370 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
29371
29372 return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
29373 VMask, MemVT, MemIntr->getMemOperand(), DAG);
29374 }
29375 default:
29376 llvm_unreachable("Unsupported truncstore intrinsic")::llvm::llvm_unreachable_internal("Unsupported truncstore intrinsic"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 29376)
;
29377 }
29378 }
29379 }
29380}
29381
29382SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
29383 SelectionDAG &DAG) const {
29384 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
29385 MFI.setReturnAddressIsTaken(true);
29386
29387 if (verifyReturnAddressArgumentIsConstant(Op, DAG))
29388 return SDValue();
29389
29390 unsigned Depth = Op.getConstantOperandVal(0);
29391 SDLoc dl(Op);
29392 EVT PtrVT = getPointerTy(DAG.getDataLayout());
29393
29394 if (Depth > 0) {
29395 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
29396 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
29397 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
29398 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
29399 DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
29400 MachinePointerInfo());
29401 }
29402
29403 // Just load the return address.
29404 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
29405 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
29406 MachinePointerInfo());
29407}
29408
29409SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
29410 SelectionDAG &DAG) const {
29411 DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
29412 return getReturnAddressFrameIndex(DAG);
29413}
29414
29415SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
29416 MachineFunction &MF = DAG.getMachineFunction();
29417 MachineFrameInfo &MFI = MF.getFrameInfo();
29418 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
29419 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
29420 EVT VT = Op.getValueType();
29421
29422 MFI.setFrameAddressIsTaken(true);
29423
29424 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
29425 // Depth > 0 makes no sense on targets which use Windows unwind codes. It
29426 // is not possible to crawl up the stack without looking at the unwind codes
29427 // simultaneously.
29428 int FrameAddrIndex = FuncInfo->getFAIndex();
29429 if (!FrameAddrIndex) {
29430 // Set up a frame object for the return address.
29431 unsigned SlotSize = RegInfo->getSlotSize();
29432 FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
29433 SlotSize, /*SPOffset=*/0, /*IsImmutable=*/false);
29434 FuncInfo->setFAIndex(FrameAddrIndex);
29435 }
29436 return DAG.getFrameIndex(FrameAddrIndex, VT);
29437 }
29438
29439 unsigned FrameReg =
29440 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
29441 SDLoc dl(Op); // FIXME probably not meaningful
29442 unsigned Depth = Op.getConstantOperandVal(0);
29443 assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
29444 (FrameReg == X86::EBP && VT == MVT::i32)) &&
29445 "Invalid Frame Register!");
29446 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
29447 while (Depth--)
29448 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
29449 MachinePointerInfo());
29450 return FrameAddr;
29451}
29452
29453// FIXME? Maybe this could be a TableGen attribute on some registers and
29454// this table could be generated automatically from RegInfo.
29455Register X86TargetLowering::getRegisterByName(const char* RegName, LLT VT,
29456 const MachineFunction &MF) const {
29457 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
29458
29459 Register Reg = StringSwitch<unsigned>(RegName)
29460 .Case("esp", X86::ESP)
29461 .Case("rsp", X86::RSP)
29462 .Case("ebp", X86::EBP)
29463 .Case("rbp", X86::RBP)
29464 .Default(0);
29465
29466 if (Reg == X86::EBP || Reg == X86::RBP) {
29467 if (!TFI.hasFP(MF))
29468 report_fatal_error("register " + StringRef(RegName) +
29469 " is allocatable: function has no frame pointer");
29470#ifndef NDEBUG
29471 else {
29472 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
29473 Register FrameReg = RegInfo->getPtrSizedFrameRegister(MF);
29474 assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
29475 "Invalid Frame Register!");
29476 }
29477#endif
29478 }
29479
29480 if (Reg)
29481 return Reg;
29482
29483 report_fatal_error("Invalid register name global variable");
29484}
29485
29486SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
29487 SelectionDAG &DAG) const {
29488 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
29489 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
29490}
29491
29492Register X86TargetLowering::getExceptionPointerRegister(
29493 const Constant *PersonalityFn) const {
29494 if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
29495 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
29496
29497 return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
29498}
29499
29500Register X86TargetLowering::getExceptionSelectorRegister(
29501 const Constant *PersonalityFn) const {
29502 // Funclet personalities don't use selectors (the runtime does the selection).
29503 if (isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)))
29504 return X86::NoRegister;
29505 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
29506}
29507
29508bool X86TargetLowering::needsFixedCatchObjects() const {
29509 return Subtarget.isTargetWin64();
29510}
29511
29512SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
29513 SDValue Chain = Op.getOperand(0);
29514 SDValue Offset = Op.getOperand(1);
29515 SDValue Handler = Op.getOperand(2);
29516 SDLoc dl (Op);
29517
29518 EVT PtrVT = getPointerTy(DAG.getDataLayout());
29519 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
29520 Register FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
29521 assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
29522 (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
29523 "Invalid Frame Register!");
29524 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
29525 Register StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
29526
29527 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
29528 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
29529 dl));
29530 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
29531 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
29532 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
29533
29534 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
29535 DAG.getRegister(StoreAddrReg, PtrVT));
29536}
29537
29538SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
29539 SelectionDAG &DAG) const {
29540 SDLoc DL(Op);
29541 // If the subtarget is not 64bit, we may need the global base reg
29542 // after isel expand pseudo, i.e., after CGBR pass ran.
29543 // Therefore, ask for the GlobalBaseReg now, so that the pass
29544 // inserts the code for us in case we need it.
29545 // Otherwise, we will end up in a situation where we will
29546 // reference a virtual register that is not defined!
29547 if (!Subtarget.is64Bit()) {
29548 const X86InstrInfo *TII = Subtarget.getInstrInfo();
29549 (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
29550 }
29551 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
29552 DAG.getVTList(MVT::i32, MVT::Other),
29553 Op.getOperand(0), Op.getOperand(1));
29554}
29555
29556SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
29557 SelectionDAG &DAG) const {
29558 SDLoc DL(Op);
29559 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
29560 Op.getOperand(0), Op.getOperand(1));
29561}
29562
29563SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
29564 SelectionDAG &DAG) const {
29565 SDLoc DL(Op);
29566 return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
29567 Op.getOperand(0));
29568}
29569
29570static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
29571 return Op.getOperand(0);
29572}
29573
29574SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
29575 SelectionDAG &DAG) const {
29576 SDValue Root = Op.getOperand(0);
29577 SDValue Trmp = Op.getOperand(1); // trampoline
29578 SDValue FPtr = Op.getOperand(2); // nested function
29579 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
29580 SDLoc dl (Op);
29581
29582 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
29583 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
29584
29585 if (Subtarget.is64Bit()) {
29586 SDValue OutChains[6];
29587
29588 // Large code-model.
29589 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
29590 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
29591
29592 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
29593 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
29594
29595 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
29596
29597 // Load the pointer to the nested function into R11.
29598 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
29599 SDValue Addr = Trmp;
29600 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
29601 Addr, MachinePointerInfo(TrmpAddr));
29602
29603 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
29604 DAG.getConstant(2, dl, MVT::i64));
29605 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
29606 MachinePointerInfo(TrmpAddr, 2), Align(2));
29607
29608 // Load the 'nest' parameter value into R10.
29609 // R10 is specified in X86CallingConv.td
29610 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
29611 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
29612 DAG.getConstant(10, dl, MVT::i64));
29613 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
29614 Addr, MachinePointerInfo(TrmpAddr, 10));
29615
29616 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
29617 DAG.getConstant(12, dl, MVT::i64));
29618 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
29619 MachinePointerInfo(TrmpAddr, 12), Align(2));
29620
29621 // Jump to the nested function.
29622 OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
29623 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
29624 DAG.getConstant(20, dl, MVT::i64));
29625 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
29626 Addr, MachinePointerInfo(TrmpAddr, 20));
29627
29628 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
29629 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
29630 DAG.getConstant(22, dl, MVT::i64));
29631 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
29632 Addr, MachinePointerInfo(TrmpAddr, 22));
29633
29634 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
29635 } else {
29636 const Function *Func =
29637 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
29638 CallingConv::ID CC = Func->getCallingConv();
29639 unsigned NestReg;
29640
29641 switch (CC) {
29642 default:
29643 llvm_unreachable("Unsupported calling convention");
29644 case CallingConv::C:
29645 case CallingConv::X86_StdCall: {
29646 // Pass 'nest' parameter in ECX.
29647 // Must be kept in sync with X86CallingConv.td
29648 NestReg = X86::ECX;
29649
29650 // Check that ECX wasn't needed by an 'inreg' parameter.
29651 FunctionType *FTy = Func->getFunctionType();
29652 const AttributeList &Attrs = Func->getAttributes();
29653
29654 if (!Attrs.isEmpty() && !Func->isVarArg()) {
29655 unsigned InRegCount = 0;
29656 unsigned Idx = 0;
29657
29658 for (FunctionType::param_iterator I = FTy->param_begin(),
29659 E = FTy->param_end(); I != E; ++I, ++Idx)
29660 if (Attrs.hasParamAttr(Idx, Attribute::InReg)) {
29661 const DataLayout &DL = DAG.getDataLayout();
29662 // FIXME: should only count parameters that are lowered to integers.
29663 InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
29664 }
29665
29666 if (InRegCount > 2) {
29667 report_fatal_error("Nest register in use - reduce number of inreg"
29668 " parameters!");
29669 }
29670 }
29671 break;
29672 }
29673 case CallingConv::X86_FastCall:
29674 case CallingConv::X86_ThisCall:
29675 case CallingConv::Fast:
29676 case CallingConv::Tail:
29677 case CallingConv::SwiftTail:
29678 // Pass 'nest' parameter in EAX.
29679 // Must be kept in sync with X86CallingConv.td
29680 NestReg = X86::EAX;
29681 break;
29682 }
29683
29684 SDValue OutChains[4];
29685 SDValue Addr, Disp;
29686
29687 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
29688 DAG.getConstant(10, dl, MVT::i32));
29689 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
29690
29691 // This is storing the opcode for MOV32ri.
29692 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
29693 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
29694 OutChains[0] =
29695 DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
29696 Trmp, MachinePointerInfo(TrmpAddr));
29697
29698 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
29699 DAG.getConstant(1, dl, MVT::i32));
29700 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
29701 MachinePointerInfo(TrmpAddr, 1), Align(1));
29702
29703 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
29704 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
29705 DAG.getConstant(5, dl, MVT::i32));
29706 OutChains[2] =
29707 DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8), Addr,
29708 MachinePointerInfo(TrmpAddr, 5), Align(1));
29709
29710 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
29711 DAG.getConstant(6, dl, MVT::i32));
29712 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
29713 MachinePointerInfo(TrmpAddr, 6), Align(1));
29714
29715 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
29716 }
29717}
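// A concrete illustration of the 64-bit trampoline laid out above, using
// hypothetical operand values (FPtr == 0x1122334455667788 and
// Nest == 0x0000000011223344); the 23 stored bytes disassemble as:
//    0: 49 bb 88 77 66 55 44 33 22 11   movabsq $0x1122334455667788, %r11
//    a: 49 ba 44 33 22 11 00 00 00 00   movabsq $0x0000000011223344, %r10
//   14: 49 ff e3                        jmpq    *%r11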
29718
29719SDValue X86TargetLowering::LowerGET_ROUNDING(SDValue Op,
29720 SelectionDAG &DAG) const {
29721 /*
29722 The rounding mode is in bits 11:10 of FPSR, and has the following
29723 settings:
29724 00 Round to nearest
29725 01 Round to -inf
29726 10 Round to +inf
29727 11 Round to 0
29728
29729 GET_ROUNDING, on the other hand, expects the following:
29730 -1 Undefined
29731 0 Round to 0
29732 1 Round to nearest
29733 2 Round to +inf
29734 3 Round to -inf
29735
29736 To perform the conversion, we use a packed lookup table of the four 2-bit
29737 values that we can index by FPSR[11:10]
29738 0x2d --> (0b00,10,11,01) --> (0,2,3,1) >> FPSR[11:10]
29739
29740 (0x2d >> ((FPSR & 0xc00) >> 9)) & 3
29741 */
29742
29743 MachineFunction &MF = DAG.getMachineFunction();
29744 MVT VT = Op.getSimpleValueType();
29745 SDLoc DL(Op);
29746
29747 // Save FP Control Word to stack slot
29748 int SSFI = MF.getFrameInfo().CreateStackObject(2, Align(2), false);
29749 SDValue StackSlot =
29750 DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
29751
29752 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
29753
29754 SDValue Chain = Op.getOperand(0);
29755 SDValue Ops[] = {Chain, StackSlot};
29756 Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
29757 DAG.getVTList(MVT::Other), Ops, MVT::i16, MPI,
29758 Align(2), MachineMemOperand::MOStore);
29759
29760 // Load FP Control Word from stack slot
29761 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI, Align(2));
29762 Chain = CWD.getValue(1);
29763
29764 // Mask and turn the control bits into a shift for the lookup table.
29765 SDValue Shift =
29766 DAG.getNode(ISD::SRL, DL, MVT::i16,
29767 DAG.getNode(ISD::AND, DL, MVT::i16,
29768 CWD, DAG.getConstant(0xc00, DL, MVT::i16)),
29769 DAG.getConstant(9, DL, MVT::i8));
29770 Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Shift);
29771
29772 SDValue LUT = DAG.getConstant(0x2d, DL, MVT::i32);
29773 SDValue RetVal =
29774 DAG.getNode(ISD::AND, DL, MVT::i32,
29775 DAG.getNode(ISD::SRL, DL, MVT::i32, LUT, Shift),
29776 DAG.getConstant(3, DL, MVT::i32));
29777
29778 RetVal = DAG.getZExtOrTrunc(RetVal, DL, VT);
29779
29780 return DAG.getMergeValues({RetVal, Chain}, DL);
29781}
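// A minimal scalar sketch of the lookup-table conversion built above (the
// helper name is hypothetical and not part of this file):
static inline unsigned getRoundingFromFPCW(unsigned CWD) {
  // CWD & 0xc00 isolates RC (bits 11:10); >> 9 turns it into a 0/2/4/6 bit
  // shift into the packed table 0x2d == 0b00'10'11'01.
  return (0x2d >> ((CWD & 0xc00) >> 9)) & 3;
}
// e.g. RC=00 (nearest) -> 1, RC=01 (-inf) -> 3, RC=10 (+inf) -> 2,
//      RC=11 (toward zero) -> 0, matching the GET_ROUNDING encoding.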
29782
29783SDValue X86TargetLowering::LowerSET_ROUNDING(SDValue Op,
29784 SelectionDAG &DAG) const {
29785 MachineFunction &MF = DAG.getMachineFunction();
29786 SDLoc DL(Op);
29787 SDValue Chain = Op.getNode()->getOperand(0);
29788
29789 // FP control word may be set only from data in memory. So we need to allocate
29790 // stack space to save/load FP control word.
29791 int OldCWFrameIdx = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
29792 SDValue StackSlot =
29793 DAG.getFrameIndex(OldCWFrameIdx, getPointerTy(DAG.getDataLayout()));
29794 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, OldCWFrameIdx);
29795 MachineMemOperand *MMO =
29796 MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 2, Align(2));
29797
29798 // Store FP control word into memory.
29799 SDValue Ops[] = {Chain, StackSlot};
29800 Chain = DAG.getMemIntrinsicNode(
29801 X86ISD::FNSTCW16m, DL, DAG.getVTList(MVT::Other), Ops, MVT::i16, MMO);
29802
29803 // Load FP Control Word from stack slot and clear RM field (bits 11:10).
29804 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI);
29805 Chain = CWD.getValue(1);
29806 CWD = DAG.getNode(ISD::AND, DL, MVT::i16, CWD.getValue(0),
29807 DAG.getConstant(0xf3ff, DL, MVT::i16));
29808
29809 // Calculate new rounding mode.
29810 SDValue NewRM = Op.getNode()->getOperand(1);
29811 SDValue RMBits;
29812 if (auto *CVal = dyn_cast<ConstantSDNode>(NewRM)) {
29813 uint64_t RM = CVal->getZExtValue();
29814 int FieldVal;
29815 switch (static_cast<RoundingMode>(RM)) {
29816 case RoundingMode::NearestTiesToEven: FieldVal = X86::rmToNearest; break;
29817 case RoundingMode::TowardNegative: FieldVal = X86::rmDownward; break;
29818 case RoundingMode::TowardPositive: FieldVal = X86::rmUpward; break;
29819 case RoundingMode::TowardZero: FieldVal = X86::rmTowardZero; break;
29820 default:
29821 llvm_unreachable("rounding mode is not supported by X86 hardware");
29822 }
29823 RMBits = DAG.getConstant(FieldVal, DL, MVT::i16);
29824 } else {
29825 // Need to convert argument into bits of control word:
29826 // 0 Round to 0 -> 11
29827 // 1 Round to nearest -> 00
29828 // 2 Round to +inf -> 10
29829 // 3 Round to -inf -> 01
29830 // The 2-bit value needs then to be shifted so that it occupies bits 11:10.
29831 // To make the conversion, put all these values into a value 0xc9 and shift
29832 // it left depending on the rounding mode:
29833 // (0xc9 << 4) & 0xc00 = X86::rmTowardZero
29834 // (0xc9 << 6) & 0xc00 = X86::rmToNearest
29835 // ...
29836 // (0xc9 << (2 * NewRM + 4)) & 0xc00
29837 SDValue ShiftValue =
29838 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
29839 DAG.getNode(ISD::ADD, DL, MVT::i32,
29840 DAG.getNode(ISD::SHL, DL, MVT::i32, NewRM,
29841 DAG.getConstant(1, DL, MVT::i8)),
29842 DAG.getConstant(4, DL, MVT::i32)));
29843 SDValue Shifted =
29844 DAG.getNode(ISD::SHL, DL, MVT::i16, DAG.getConstant(0xc9, DL, MVT::i16),
29845 ShiftValue);
29846 RMBits = DAG.getNode(ISD::AND, DL, MVT::i16, Shifted,
29847 DAG.getConstant(0xc00, DL, MVT::i16));
29848 }
29849
29850 // Update rounding mode bits and store the new FP Control Word into stack.
29851 CWD = DAG.getNode(ISD::OR, DL, MVT::i16, CWD, RMBits);
29852 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, Align(2));
29853
29854 // Load FP control word from the slot.
29855 SDValue OpsLD[] = {Chain, StackSlot};
29856 MachineMemOperand *MMOL =
29857 MF.getMachineMemOperand(MPI, MachineMemOperand::MOLoad, 2, Align(2));
29858 Chain = DAG.getMemIntrinsicNode(
29859 X86ISD::FLDCW16m, DL, DAG.getVTList(MVT::Other), OpsLD, MVT::i16, MMOL);
29860
29861 // If target supports SSE, set MXCSR as well. Rounding mode is encoded in the
29862 // same way but in bits 14:13.
29863 if (Subtarget.hasSSE1()) {
29864 // Store MXCSR into memory.
29865 Chain = DAG.getNode(
29866 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
29867 DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
29868 StackSlot);
29869
29870 // Load MXCSR from stack slot and clear RM field (bits 14:13).
29871 SDValue CWD = DAG.getLoad(MVT::i32, DL, Chain, StackSlot, MPI);
29872 Chain = CWD.getValue(1);
29873 CWD = DAG.getNode(ISD::AND, DL, MVT::i32, CWD.getValue(0),
29874 DAG.getConstant(0xffff9fff, DL, MVT::i32));
29875
29876 // Shift X87 RM bits from 11:10 to 14:13.
29877 RMBits = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, RMBits);
29878 RMBits = DAG.getNode(ISD::SHL, DL, MVT::i32, RMBits,
29879 DAG.getConstant(3, DL, MVT::i8));
29880
29881 // Update rounding mode bits and store the new FP Control Word into stack.
29882 CWD = DAG.getNode(ISD::OR, DL, MVT::i32, CWD, RMBits);
29883 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, Align(4));
29884
29885 // Load MXCSR from the slot.
29886 Chain = DAG.getNode(
29887 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
29888 DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
29889 StackSlot);
29890 }
29891
29892 return Chain;
29893}
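// A minimal scalar sketch of the non-constant path above (the helper name is
// hypothetical): derive the x87 RC bits (11:10) from the generic rounding
// mode argument, where 0 = toward zero, 1 = nearest, 2 = +inf, 3 = -inf.
static inline unsigned rmToX87RCBits(unsigned NewRM) {
  return (0xc9u << (2 * NewRM + 4)) & 0xc00u; // yields 0xc00, 0x000, 0x800, 0x400
}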
29894
29895/// Lower a vector CTLZ using native supported vector CTLZ instruction.
29896//
29897// i8/i16 vector implemented using dword LZCNT vector instruction
29898// ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
29899 // split the vector, perform the operation on its Lo and Hi parts and
29900// concatenate the results.
29901static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
29902 const X86Subtarget &Subtarget) {
29903 assert(Op.getOpcode() == ISD::CTLZ);
29904 SDLoc dl(Op);
29905 MVT VT = Op.getSimpleValueType();
29906 MVT EltVT = VT.getVectorElementType();
29907 unsigned NumElems = VT.getVectorNumElements();
29908
29909 assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
29910 "Unsupported element type");
29911
29912 // Split the vector; its Lo and Hi parts will be handled in the next iteration.
29913 if (NumElems > 16 ||
29914 (NumElems == 16 && !Subtarget.canExtendTo512DQ()))
29915 return splitVectorIntUnary(Op, DAG);
29916
29917 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
29918 assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
29919 "Unsupported value type for operation");
29920
29921 // Use native supported vector instruction vplzcntd.
29922 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
29923 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
29924 SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
29925 SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
29926
29927 return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
29928}
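// A scalar sketch of the per-element math above, assuming an i8 element and
// a 32-bit lzcnt (the helper name is hypothetical): the dword count
// over-counts by 32 - 8 = 24 leading zeros, which the trailing SUB removes.
static inline unsigned ctlz8ViaDword(unsigned X /* 0..255 */) {
  unsigned Lz32 = X ? (unsigned)__builtin_clz(X) : 32; // vplzcntd per element
  return Lz32 - (32 - 8);                              // Delta = 32 - EltBits
}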
29929
29930// Lower CTLZ using a PSHUFB lookup table implementation.
29931static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
29932 const X86Subtarget &Subtarget,
29933 SelectionDAG &DAG) {
29934 MVT VT = Op.getSimpleValueType();
29935 int NumElts = VT.getVectorNumElements();
29936 int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
29937 MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
29938
29939 // Per-nibble leading zero PSHUFB lookup table.
29940 const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
29941 /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
29942 /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
29943 /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
29944
29945 SmallVector<SDValue, 64> LUTVec;
29946 for (int i = 0; i < NumBytes; ++i)
29947 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
29948 SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
29949
29950 // Begin by bitcasting the input to byte vector, then split those bytes
29951 // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
29952 // If the hi input nibble is zero then we add both results together, otherwise
29953 // we just take the hi result (by masking the lo result to zero before the
29954 // add).
29955 SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
29956 SDValue Zero = DAG.getConstant(0, DL, CurrVT);
29957
29958 SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
29959 SDValue Lo = Op0;
29960 SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
29961 SDValue HiZ;
29962 if (CurrVT.is512BitVector()) {
29963 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
29964 HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
29965 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
29966 } else {
29967 HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
29968 }
29969
29970 Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
29971 Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
29972 Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
29973 SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
29974
29975 // Merge the result from vXi8 back to VT, working on the lo/hi halves
29976 // of the current vector width in the same way we did for the nibbles.
29977 // If the upper half of the input element is zero then add the halves'
29978 // leading zero counts together, otherwise just use the upper half's.
29979 // Double the width of the result until we are at target width.
29980 while (CurrVT != VT) {
29981 int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
29982 int CurrNumElts = CurrVT.getVectorNumElements();
29983 MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
29984 MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
29985 SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
29986
29987 // Check if the upper half of the input element is zero.
29988 if (CurrVT.is512BitVector()) {
29989 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
29990 HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
29991 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
29992 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
29993 } else {
29994 HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
29995 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
29996 }
29997 HiZ = DAG.getBitcast(NextVT, HiZ);
29998
29999 // Move the upper/lower halves to the lower bits as we'll be extending to
30000 // NextVT. Mask the lower result to zero if HiZ is true and add the results
30001 // together.
30002 SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
30003 SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
30004 SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
30005 R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
30006 Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
30007 CurrVT = NextVT;
30008 }
30009
30010 return Res;
30011}
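// A scalar model of the per-byte PSHUFB combine above (the helper name is
// hypothetical): when the high nibble is zero both nibble counts are added;
// otherwise the AND with HiZ zeroes the low count so only the high nibble's
// count survives.
static inline unsigned ctlz8ViaNibbleLUT(unsigned X /* 0..255 */) {
  static const unsigned LUT[16] = {4, 3, 2, 2, 1, 1, 1, 1,
                                   0, 0, 0, 0, 0, 0, 0, 0};
  unsigned Lo = LUT[X & 0xf], Hi = LUT[X >> 4];
  return (X >> 4) == 0 ? Lo + Hi : Hi;
}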
30012
30013static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
30014 const X86Subtarget &Subtarget,
30015 SelectionDAG &DAG) {
30016 MVT VT = Op.getSimpleValueType();
30017
30018 if (Subtarget.hasCDI() &&
30019 // vXi8 vectors need to be promoted to 512-bits for vXi32.
30020 (Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8))
30021 return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget);
30022
30023 // Decompose 256-bit ops into smaller 128-bit ops.
30024 if (VT.is256BitVector() && !Subtarget.hasInt256())
30025 return splitVectorIntUnary(Op, DAG);
30026
30027 // Decompose 512-bit ops into smaller 256-bit ops.
30028 if (VT.is512BitVector() && !Subtarget.hasBWI())
30029 return splitVectorIntUnary(Op, DAG);
30030
30031 assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
30032 return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
30033}
30034
30035static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
30036 SelectionDAG &DAG) {
30037 MVT VT = Op.getSimpleValueType();
30038 MVT OpVT = VT;
30039 unsigned NumBits = VT.getSizeInBits();
30040 SDLoc dl(Op);
30041 unsigned Opc = Op.getOpcode();
30042
30043 if (VT.isVector())
30044 return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
30045
30046 Op = Op.getOperand(0);
30047 if (VT == MVT::i8) {
30048 // Zero extend to i32 since there is not an i8 bsr.
30049 OpVT = MVT::i32;
30050 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
30051 }
30052
30053 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
30054 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
30055 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
30056
30057 if (Opc == ISD::CTLZ) {
30058 // If src is zero (i.e. bsr sets ZF), returns NumBits.
30059 SDValue Ops[] = {Op, DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
30060 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
30061 Op.getValue(1)};
30062 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
30063 }
30064
30065 // Finally xor with NumBits-1.
30066 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
30067 DAG.getConstant(NumBits - 1, dl, OpVT));
30068
30069 if (VT == MVT::i8)
30070 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
30071 return Op;
30072}
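// A scalar sketch of the BSR-based lowering above (the helper name is
// hypothetical): for non-zero inputs CTLZ == 31 - BSR, and since BSR < 32
// the subtraction equals XOR with 31; the CMOV first patches the zero case
// to 2*32 - 1 == 63, which XORs down to the expected result of 32.
static inline unsigned ctlz32ViaBSR(unsigned X) {
  unsigned BSR = X ? 31u - (unsigned)__builtin_clz(X) : 63u; // CMOV on ZF
  return BSR ^ 31u;
}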
30073
30074static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
30075 SelectionDAG &DAG) {
30076 MVT VT = Op.getSimpleValueType();
30077 unsigned NumBits = VT.getScalarSizeInBits();
30078 SDValue N0 = Op.getOperand(0);
30079 SDLoc dl(Op);
30080
30081 assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ &&
30082 "Only scalar CTTZ requires custom lowering");
30083
30084 // Issue a bsf (scan bits forward) which also sets EFLAGS.
30085 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
30086 Op = DAG.getNode(X86ISD::BSF, dl, VTs, N0);
30087
30088 // If src is known never zero we can skip the CMOV.
30089 if (DAG.isKnownNeverZero(N0))
30090 return Op;
30091
30092 // If src is zero (i.e. bsf sets ZF), returns NumBits.
30093 SDValue Ops[] = {Op, DAG.getConstant(NumBits, dl, VT),
30094 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
30095 Op.getValue(1)};
30096 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
30097}
30098
30099static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG,
30100 const X86Subtarget &Subtarget) {
30101 MVT VT = Op.getSimpleValueType();
30102 if (VT == MVT::i16 || VT == MVT::i32)
30103 return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
30104
30105 if (VT == MVT::v32i16 || VT == MVT::v64i8)
30106 return splitVectorIntBinary(Op, DAG);
30107
30108 assert(Op.getSimpleValueType().is256BitVector() &&
30109 Op.getSimpleValueType().isInteger() &&
30110 "Only handle AVX 256-bit vector integer operation");
30111 return splitVectorIntBinary(Op, DAG);
30112}
30113
30114static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG,
30115 const X86Subtarget &Subtarget) {
30116 MVT VT = Op.getSimpleValueType();
30117 SDValue X = Op.getOperand(0), Y = Op.getOperand(1);
30118 unsigned Opcode = Op.getOpcode();
30119 SDLoc DL(Op);
30120
30121 if (VT == MVT::v32i16 || VT == MVT::v64i8 ||
30122 (VT.is256BitVector() && !Subtarget.hasInt256())) {
30123 assert(Op.getSimpleValueType().isInteger() &&
30124 "Only handle AVX vector integer operation");
30125 return splitVectorIntBinary(Op, DAG);
30126 }
30127
30128 // Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*.
30129 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30130 EVT SetCCResultType =
30131 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
30132
30133 unsigned BitWidth = VT.getScalarSizeInBits();
30134 if (Opcode == ISD::USUBSAT) {
30135 if (!TLI.isOperationLegal(ISD::UMAX, VT) || useVPTERNLOG(Subtarget, VT)) {
30136 // Handle a special-case with a bit-hack instead of cmp+select:
30137 // usubsat X, SMIN --> (X ^ SMIN) & (X s>> BW-1)
30138 // If the target can use VPTERNLOG, DAGToDAG will match this as
30139 // "vpsra + vpternlog" which is better than "vpmax + vpsub" with a
30140 // "broadcast" constant load.
30141 ConstantSDNode *C = isConstOrConstSplat(Y, true);
30142 if (C && C->getAPIntValue().isSignMask()) {
30143 SDValue SignMask = DAG.getConstant(C->getAPIntValue(), DL, VT);
30144 SDValue ShiftAmt = DAG.getConstant(BitWidth - 1, DL, VT);
30145 SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, X, SignMask);
30146 SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, X, ShiftAmt);
30147 return DAG.getNode(ISD::AND, DL, VT, Xor, Sra);
30148 }
30149 }
30150 if (!TLI.isOperationLegal(ISD::UMAX, VT)) {
30151 // usubsat X, Y --> (X >u Y) ? X - Y : 0
30152 SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y);
30153 SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT);
30154 // TODO: Move this to DAGCombiner?
30155 if (SetCCResultType == VT &&
30156 DAG.ComputeNumSignBits(Cmp) == VT.getScalarSizeInBits())
30157 return DAG.getNode(ISD::AND, DL, VT, Cmp, Sub);
30158 return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT));
30159 }
30160 }
30161
30162 if ((Opcode == ISD::SADDSAT || Opcode == ISD::SSUBSAT) &&
30163 (!VT.isVector() || VT == MVT::v2i64)) {
30164 APInt MinVal = APInt::getSignedMinValue(BitWidth);
30165 APInt MaxVal = APInt::getSignedMaxValue(BitWidth);
30166 SDValue Zero = DAG.getConstant(0, DL, VT);
30167 SDValue Result =
30168 DAG.getNode(Opcode == ISD::SADDSAT ? ISD::SADDO : ISD::SSUBO, DL,
30169 DAG.getVTList(VT, SetCCResultType), X, Y);
30170 SDValue SumDiff = Result.getValue(0);
30171 SDValue Overflow = Result.getValue(1);
30172 SDValue SatMin = DAG.getConstant(MinVal, DL, VT);
30173 SDValue SatMax = DAG.getConstant(MaxVal, DL, VT);
30174 SDValue SumNeg =
30175 DAG.getSetCC(DL, SetCCResultType, SumDiff, Zero, ISD::SETLT);
30176 Result = DAG.getSelect(DL, VT, SumNeg, SatMax, SatMin);
30177 return DAG.getSelect(DL, VT, Overflow, Result, SumDiff);
30178 }
30179
30180 // Use default expansion.
30181 return SDValue();
30182}
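// A scalar check of the usubsat bit-hack above (illustrative only, 8-bit
// case): with Y equal to the sign mask 0x80, usubsat(X, Y) == (X ^ Y) & (X s>> 7).
static inline unsigned usubsat8SignMask(unsigned X /* 0..255 */) {
  unsigned Xor = (X ^ 0x80u) & 0xffu;         // X - 128 modulo 256
  unsigned Sra = (X & 0x80u) ? 0xffu : 0x00u; // arithmetic shift right by BW-1
  return Xor & Sra;                           // 0 whenever X < 128
}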
30183
30184static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
30185 SelectionDAG &DAG) {
30186 MVT VT = Op.getSimpleValueType();
30187 if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
30188 // Since X86 does not have CMOV for 8-bit integer, we don't convert
30189 // 8-bit integer abs to NEG and CMOV.
30190 SDLoc DL(Op);
30191 SDValue N0 = Op.getOperand(0);
30192 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
30193 DAG.getConstant(0, DL, VT), N0);
30194 SDValue Ops[] = {N0, Neg, DAG.getTargetConstant(X86::COND_NS, DL, MVT::i8),
30195 SDValue(Neg.getNode(), 1)};
30196 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
30197 }
30198
30199 // ABS(vXi64 X) --> VPBLENDVPD(X, 0-X, X).
30200 if ((VT == MVT::v2i64 || VT == MVT::v4i64) && Subtarget.hasSSE41()) {
30201 SDLoc DL(Op);
30202 SDValue Src = Op.getOperand(0);
30203 SDValue Sub =
30204 DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Src);
30205 return DAG.getNode(X86ISD::BLENDV, DL, VT, Src, Sub, Src);
30206 }
30207
30208 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
30209 assert(VT.isInteger() &&
30210 "Only handle AVX 256-bit vector integer operation");
30211 return splitVectorIntUnary(Op, DAG);
30212 }
30213
30214 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
30215 return splitVectorIntUnary(Op, DAG);
30216
30217 // Default to expand.
30218 return SDValue();
30219}
30220
30221static SDValue LowerAVG(SDValue Op, const X86Subtarget &Subtarget,
30222 SelectionDAG &DAG) {
30223 MVT VT = Op.getSimpleValueType();
30224
30225 // For AVX1 cases, split to use legal ops.
30226 if (VT.is256BitVector() && !Subtarget.hasInt256())
30227 return splitVectorIntBinary(Op, DAG);
30228
30229 if (VT == MVT::v32i16 || VT == MVT::v64i8)
30230 return splitVectorIntBinary(Op, DAG);
30231
30232 // Default to expand.
30233 return SDValue();
30234}
30235
30236static SDValue LowerMINMAX(SDValue Op, const X86Subtarget &Subtarget,
30237 SelectionDAG &DAG) {
30238 MVT VT = Op.getSimpleValueType();
30239
30240 // For AVX1 cases, split to use legal ops.
30241 if (VT.is256BitVector() && !Subtarget.hasInt256())
30242 return splitVectorIntBinary(Op, DAG);
30243
30244 if (VT == MVT::v32i16 || VT == MVT::v64i8)
30245 return splitVectorIntBinary(Op, DAG);
30246
30247 // Default to expand.
30248 return SDValue();
30249}
30250
30251static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget,
30252 SelectionDAG &DAG) {
30253 assert((Op.getOpcode() == ISD::FMAXIMUM || Op.getOpcode() == ISD::FMINIMUM) &&
30254 "Expected FMAXIMUM or FMINIMUM opcode");
30255 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30256 EVT VT = Op.getValueType();
30257 SDValue X = Op.getOperand(0);
30258 SDValue Y = Op.getOperand(1);
30259 SDLoc DL(Op);
30260 uint64_t SizeInBits = VT.getFixedSizeInBits();
30261 APInt PreferredZero = APInt::getZero(SizeInBits);
30262 EVT IVT = MVT::getIntegerVT(SizeInBits);
30263 X86ISD::NodeType MinMaxOp;
30264 if (Op.getOpcode() == ISD::FMAXIMUM) {
30265 MinMaxOp = X86ISD::FMAX;
30266 } else {
30267 PreferredZero.setSignBit();
30268 MinMaxOp = X86ISD::FMIN;
30269 }
30270 EVT SetCCType =
30271 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
30272
30273 // The tables below show the expected result of Max in cases of NaN and
30274 // signed zeros.
30275 //
30276 // Y Y
30277 // Num xNaN +0 -0
30278 // --------------- ---------------
30279 // Num | Max | Y | +0 | +0 | +0 |
30280 // X --------------- X ---------------
30281 // xNaN | X | X/Y | -0 | +0 | -0 |
30282 // --------------- ---------------
30283 //
30284 // It is achieved by means of FMAX/FMIN with preliminary checks and operand
30285 // reordering.
30286 //
30287 // We check if any of operands is NaN and return NaN. Then we check if any of
30288 // operands is zero or negative zero (for fmaximum and fminimum respectively)
30289 // to ensure the correct zero is returned.
30290 auto IsPreferredZero = [PreferredZero](SDValue Op) {
30291 Op = peekThroughBitcasts(Op);
30292 if (auto *CstOp = dyn_cast<ConstantFPSDNode>(Op))
30293 return CstOp->getValueAPF().bitcastToAPInt() == PreferredZero;
30294 if (auto *CstOp = dyn_cast<ConstantSDNode>(Op))
30295 return CstOp->getAPIntValue() == PreferredZero;
30296 return false;
30297 };
30298
30299 bool IsXNeverNaN = DAG.isKnownNeverNaN(X);
30300 bool IsYNeverNaN = DAG.isKnownNeverNaN(Y);
30301 bool IgnoreSignedZero = DAG.getTarget().Options.NoSignedZerosFPMath ||
30302 Op->getFlags().hasNoSignedZeros() ||
30303 DAG.isKnownNeverZeroFloat(X) ||
30304 DAG.isKnownNeverZeroFloat(Y);
30305 SDValue NewX, NewY;
30306 if (IgnoreSignedZero || IsPreferredZero(Y)) {
30307 // Operands are already in right order or order does not matter.
30308 NewX = X;
30309 NewY = Y;
30310 } else if (IsPreferredZero(X)) {
30311 NewX = Y;
30312 NewY = X;
30313 } else if ((VT == MVT::f16 || Subtarget.hasDQI()) &&
30314 (Op->getFlags().hasNoNaNs() || IsXNeverNaN || IsYNeverNaN)) {
30315 if (IsXNeverNaN)
30316 std::swap(X, Y);
30317 // VFPCLASSS consumes a vector type, so provide a minimal one corresponding
30318 // to an xmm register.
30319 MVT VectorType = MVT::getVectorVT(VT.getSimpleVT(), 128 / SizeInBits);
30320 SDValue VX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VectorType, X);
30321 // Bits of classes:
30322 // Bits Imm8[0] Imm8[1] Imm8[2] Imm8[3] Imm8[4] Imm8[5] Imm8[6] Imm8[7]
30323 // Class QNAN PosZero NegZero PosINF NegINF Denormal Negative SNAN
30324 SDValue Imm = DAG.getTargetConstant(MinMaxOp == X86ISD::FMAX ? 0b11 : 0b101,
30325 DL, MVT::i32);
30326 SDValue IsNanZero = DAG.getNode(X86ISD::VFPCLASSS, DL, MVT::v1i1, VX, Imm);
30327 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
30328 DAG.getConstant(0, DL, MVT::v8i1), IsNanZero,
30329 DAG.getIntPtrConstant(0, DL));
30330 SDValue NeedSwap = DAG.getBitcast(MVT::i8, Ins);
30331 NewX = DAG.getSelect(DL, VT, NeedSwap, Y, X);
30332 NewY = DAG.getSelect(DL, VT, NeedSwap, X, Y);
30333 return DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags());
30334 } else {
30335 SDValue IsXSigned;
30336 if (Subtarget.is64Bit() || VT != MVT::f64) {
30337 SDValue XInt = DAG.getNode(ISD::BITCAST, DL, IVT, X);
30338 SDValue ZeroCst = DAG.getConstant(0, DL, IVT);
30339 IsXSigned = DAG.getSetCC(DL, SetCCType, XInt, ZeroCst, ISD::SETLT);
30340 } else {
30341 assert(VT == MVT::f64);
30342 SDValue Ins = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v2f64,
30343 DAG.getConstantFP(0, DL, MVT::v2f64), X,
30344 DAG.getIntPtrConstant(0, DL));
30345 SDValue VX = DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, Ins);
30346 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VX,
30347 DAG.getIntPtrConstant(1, DL));
30348 Hi = DAG.getBitcast(MVT::i32, Hi);
30349 SDValue ZeroCst = DAG.getConstant(0, DL, MVT::i32);
30350 EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(),
30351 *DAG.getContext(), MVT::i32);
30352 IsXSigned = DAG.getSetCC(DL, SetCCType, Hi, ZeroCst, ISD::SETLT);
30353 }
30354 if (MinMaxOp == X86ISD::FMAX) {
30355 NewX = DAG.getSelect(DL, VT, IsXSigned, X, Y);
30356 NewY = DAG.getSelect(DL, VT, IsXSigned, Y, X);
30357 } else {
30358 NewX = DAG.getSelect(DL, VT, IsXSigned, Y, X);
30359 NewY = DAG.getSelect(DL, VT, IsXSigned, X, Y);
30360 }
30361 }
30362
30363 bool IgnoreNaN = DAG.getTarget().Options.NoNaNsFPMath ||
30364 Op->getFlags().hasNoNaNs() || (IsXNeverNaN && IsYNeverNaN);
30365
30366 // If we did not reorder the operands for signed-zero handling, but we still
30367 // need to handle NaN and we know that the second operand is not NaN, put it
30368 // in the first operand so we will not need to post-process NaN after max/min.
30369 if (IgnoreSignedZero && !IgnoreNaN && DAG.isKnownNeverNaN(NewY))
30370 std::swap(NewX, NewY);
30371
30372 SDValue MinMax = DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags());
30373
30374 if (IgnoreNaN || DAG.isKnownNeverNaN(NewX))
30375 return MinMax;
30376
30377 SDValue IsNaN = DAG.getSetCC(DL, SetCCType, NewX, NewX, ISD::SETUO);
30378 return DAG.getSelect(DL, VT, IsNaN, NewX, MinMax);
30379}
30380
30381static SDValue LowerABD(SDValue Op, const X86Subtarget &Subtarget,
30382 SelectionDAG &DAG) {
30383 MVT VT = Op.getSimpleValueType();
30384
30385 // For AVX1 cases, split to use legal ops.
30386 if (VT.is256BitVector() && !Subtarget.hasInt256())
30387 return splitVectorIntBinary(Op, DAG);
30388
30389 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.useBWIRegs())
30390 return splitVectorIntBinary(Op, DAG);
30391
30392 SDLoc dl(Op);
30393 bool IsSigned = Op.getOpcode() == ISD::ABDS;
30394 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30395
30396 // TODO: Move to TargetLowering expandABD() once we have ABD promotion.
30397 if (VT.isScalarInteger()) {
30398 unsigned WideBits = std::max<unsigned>(2 * VT.getScalarSizeInBits(), 32u);
30399 MVT WideVT = MVT::getIntegerVT(WideBits);
30400 if (TLI.isTypeLegal(WideVT)) {
30401 // abds(lhs, rhs) -> trunc(abs(sub(sext(lhs), sext(rhs))))
30402 // abdu(lhs, rhs) -> trunc(abs(sub(zext(lhs), zext(rhs))))
30403 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
30404 SDValue LHS = DAG.getFreeze(Op.getOperand(0));
30405 SDValue RHS = DAG.getFreeze(Op.getOperand(1));
30406 LHS = DAG.getNode(ExtOpc, dl, WideVT, LHS);
30407 RHS = DAG.getNode(ExtOpc, dl, WideVT, RHS);
30408 SDValue Diff = DAG.getNode(ISD::SUB, dl, WideVT, LHS, RHS);
30409 SDValue AbsDiff = DAG.getNode(ISD::ABS, dl, WideVT, Diff);
30410 return DAG.getNode(ISD::TRUNCATE, dl, VT, AbsDiff);
30411 }
30412 }
30413
30414 // Default to expand.
30415 return SDValue();
30416}
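// A scalar sketch of the widened expansion above for the signed i8 case (the
// helper name is hypothetical): extending to i32 makes the subtraction exact,
// so abs of the wide difference truncates back to the correct result.
static inline unsigned abds8ViaWiden(int L /* sext'd i8 */, int R /* sext'd i8 */) {
  int Diff = L - R;                                   // sext + sub, cannot wrap
  return (unsigned)(Diff < 0 ? -Diff : Diff) & 0xffu; // abs + trunc
}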
30417
30418static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
30419 SelectionDAG &DAG) {
30420 SDLoc dl(Op);
30421 MVT VT = Op.getSimpleValueType();
30422
30423 // Decompose 256-bit ops into 128-bit ops.
30424 if (VT.is256BitVector() && !Subtarget.hasInt256())
30425 return splitVectorIntBinary(Op, DAG);
30426
30427 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
30428 return splitVectorIntBinary(Op, DAG);
30429
30430 SDValue A = Op.getOperand(0);
30431 SDValue B = Op.getOperand(1);
30432
30433 // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
30434 // vector pairs, multiply and truncate.
30435 if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
30436 unsigned NumElts = VT.getVectorNumElements();
30437
30438 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
30439 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
30440 MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
30441 return DAG.getNode(
30442 ISD::TRUNCATE, dl, VT,
30443 DAG.getNode(ISD::MUL, dl, ExVT,
30444 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, A),
30445 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, B)));
30446 }
30447
30448 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
30449
30450 // Extract the lo/hi parts to any extend to i16.
30451 // We're going to mask off the low byte of each result element of the
30452 // pmullw, so it doesn't matter what's in the high byte of each 16-bit
30453 // element.
30454 SDValue Undef = DAG.getUNDEF(VT);
30455 SDValue ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Undef));
30456 SDValue AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Undef));
30457
30458 SDValue BLo, BHi;
30459 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
30460 // If the RHS is a constant, manually unpackl/unpackh.
30461 SmallVector<SDValue, 16> LoOps, HiOps;
30462 for (unsigned i = 0; i != NumElts; i += 16) {
30463 for (unsigned j = 0; j != 8; ++j) {
30464 LoOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl,
30465 MVT::i16));
30466 HiOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl,
30467 MVT::i16));
30468 }
30469 }
30470
30471 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
30472 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
30473 } else {
30474 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Undef));
30475 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Undef));
30476 }
30477
30478 // Multiply, mask the lower 8bits of the lo/hi results and pack.
30479 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
30480 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
30481 return getPack(DAG, Subtarget, dl, VT, RLo, RHi);
30482 }
30483
30484 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
30485 if (VT == MVT::v4i32) {
30486 assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
30487 "Should not custom lower when pmulld is available!");
30488
30489 // Extract the odd parts.
30490 static const int UnpackMask[] = { 1, -1, 3, -1 };
30491 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
30492 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
30493
30494 // Multiply the even parts.
30495 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
30496 DAG.getBitcast(MVT::v2i64, A),
30497 DAG.getBitcast(MVT::v2i64, B));
30498 // Now multiply odd parts.
30499 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
30500 DAG.getBitcast(MVT::v2i64, Aodds),
30501 DAG.getBitcast(MVT::v2i64, Bodds));
30502
30503 Evens = DAG.getBitcast(VT, Evens);
30504 Odds = DAG.getBitcast(VT, Odds);
30505
30506 // Merge the two vectors back together with a shuffle. This expands into 2
30507 // shuffles.
30508 static const int ShufMask[] = { 0, 4, 2, 6 };
30509 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
30510 }
30511
30512 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
30513 "Only know how to lower V2I64/V4I64/V8I64 multiply");
30514 assert(!Subtarget.hasDQI() && "DQI should use MULLQ");
30515
30516 // Ahi = psrlqi(a, 32);
30517 // Bhi = psrlqi(b, 32);
30518 //
30519 // AloBlo = pmuludq(a, b);
30520 // AloBhi = pmuludq(a, Bhi);
30521 // AhiBlo = pmuludq(Ahi, b);
30522 //
30523 // Hi = psllqi(AloBhi + AhiBlo, 32);
30524 // return AloBlo + Hi;
30525 KnownBits AKnown = DAG.computeKnownBits(A);
30526 KnownBits BKnown = DAG.computeKnownBits(B);
30527
30528 APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
30529 bool ALoIsZero = LowerBitsMask.isSubsetOf(AKnown.Zero);
30530 bool BLoIsZero = LowerBitsMask.isSubsetOf(BKnown.Zero);
30531
30532 APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
30533 bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero);
30534 bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero);
30535
30536 SDValue Zero = DAG.getConstant(0, dl, VT);
30537
30538 // Only multiply lo/hi halves that aren't known to be zero.
30539 SDValue AloBlo = Zero;
30540 if (!ALoIsZero && !BLoIsZero)
30541 AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
30542
30543 SDValue AloBhi = Zero;
30544 if (!ALoIsZero && !BHiIsZero) {
30545 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
30546 AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
30547 }
30548
30549 SDValue AhiBlo = Zero;
30550 if (!AHiIsZero && !BLoIsZero) {
30551 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
30552 AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
30553 }
30554
30555 SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
30556 Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
30557
30558 return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
30559}
30560
30561static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl,
30562 MVT VT, bool IsSigned,
30563 const X86Subtarget &Subtarget,
30564 SelectionDAG &DAG,
30565 SDValue *Low = nullptr) {
30566 unsigned NumElts = VT.getVectorNumElements();
30567
30568 // For vXi8 we will unpack the low and high half of each 128 bit lane to widen
30569 // to a vXi16 type. Do the multiplies, shift the results and pack the half
30570 // lane results back together.
30571
30572 // We'll take different approaches for signed and unsigned.
30573 // For unsigned we'll use punpcklbw/punpckhbw to zero extend the bytes
30574 // and use pmullw to calculate the full 16-bit product.
30575 // For signed we'll use punpcklbw/punpckhbw to extend the bytes to words and
30576 // shift them left into the upper byte of each word. This allows us to use
30577 // pmulhw to calculate the full 16-bit product. This trick means we don't
30578 // need to sign extend the bytes to use pmullw.
30579
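// Illustrative annotation: worked example for one signed byte pair, a = -3 and
// b = 5. Unpacking with zeros in the low byte gives the i16 values a<<8 = -768
// and b<<8 = 1280; pmulhw returns the high 16 bits of their 32-bit product,
// (-768 * 1280) >> 16 = -15 = a*b, so the bytes never need explicit sign
// extension before the multiply.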
30580 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
30581 SDValue Zero = DAG.getConstant(0, dl, VT);
30582
30583 SDValue ALo, AHi;
30584 if (IsSigned) {
30585 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, A));
30586 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, A));
30587 } else {
30588 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Zero));
30589 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Zero));
30590 }
30591
30592 SDValue BLo, BHi;
30593 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
30594 // If the RHS is a constant, manually unpackl/unpackh and extend.
30595 SmallVector<SDValue, 16> LoOps, HiOps;
30596 for (unsigned i = 0; i != NumElts; i += 16) {
30597 for (unsigned j = 0; j != 8; ++j) {
30598 SDValue LoOp = B.getOperand(i + j);
30599 SDValue HiOp = B.getOperand(i + j + 8);
30600
30601 if (IsSigned) {
30602 LoOp = DAG.getAnyExtOrTrunc(LoOp, dl, MVT::i16);
30603 HiOp = DAG.getAnyExtOrTrunc(HiOp, dl, MVT::i16);
30604 LoOp = DAG.getNode(ISD::SHL, dl, MVT::i16, LoOp,
30605 DAG.getConstant(8, dl, MVT::i16));
30606 HiOp = DAG.getNode(ISD::SHL, dl, MVT::i16, HiOp,
30607 DAG.getConstant(8, dl, MVT::i16));
30608 } else {
30609 LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);
30610 HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);
30611 }
30612
30613 LoOps.push_back(LoOp);
30614 HiOps.push_back(HiOp);
30615 }
30616 }
30617
30618 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
30619 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
30620 } else if (IsSigned) {
30621 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, B));
30622 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, B));
30623 } else {
30624 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Zero));
30625 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Zero));
30626 }
30627
30628 // Multiply, lshr the upper 8bits to the lower 8bits of the lo/hi results and
30629 // pack back to vXi8.
30630 unsigned MulOpc = IsSigned ? ISD::MULHS : ISD::MUL;
30631 SDValue RLo = DAG.getNode(MulOpc, dl, ExVT, ALo, BLo);
30632 SDValue RHi = DAG.getNode(MulOpc, dl, ExVT, AHi, BHi);
30633
30634 if (Low)
30635 *Low = getPack(DAG, Subtarget, dl, VT, RLo, RHi);
30636
30637 return getPack(DAG, Subtarget, dl, VT, RLo, RHi, /*PackHiHalf*/ true);
30638}
30639
30640static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
30641 SelectionDAG &DAG) {
30642 SDLoc dl(Op);
30643 MVT VT = Op.getSimpleValueType();
30644 bool IsSigned = Op->getOpcode() == ISD::MULHS;
30645 unsigned NumElts = VT.getVectorNumElements();
30646 SDValue A = Op.getOperand(0);
30647 SDValue B = Op.getOperand(1);
30648
30649 // Decompose 256-bit ops into 128-bit ops.
30650 if (VT.is256BitVector() && !Subtarget.hasInt256())
30651 return splitVectorIntBinary(Op, DAG);
30652
30653 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
30654 return splitVectorIntBinary(Op, DAG);
30655
30656 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) {
30657 assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
30658        (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
30659        (VT == MVT::v16i32 && Subtarget.hasAVX512()));
30660
30661 // PMULxD operations multiply each even value (starting at 0) of LHS with
30662 // the related value of RHS and produce a widened result.
30663 // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
30664 // => <2 x i64> <ae|cg>
30665 //
30666 // In other words, to have all the results, we need to perform two PMULxD:
30667 // 1. one with the even values.
30668 // 2. one with the odd values.
30669 // To achieve #2, we need to place the odd values at an even position.
30670 //
30671 // Place the odd value at an even position (basically, shift all values 1
30672 // step to the left):
30673 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1,
30674 9, -1, 11, -1, 13, -1, 15, -1};
30675 // <a|b|c|d> => <b|undef|d|undef>
30676 SDValue Odd0 =
30677 DAG.getVectorShuffle(VT, dl, A, A, ArrayRef(&Mask[0], NumElts));
30678 // <e|f|g|h> => <f|undef|h|undef>
30679 SDValue Odd1 =
30680 DAG.getVectorShuffle(VT, dl, B, B, ArrayRef(&Mask[0], NumElts));
30681
30682 // Emit two multiplies, one for the lower 2 ints and one for the higher 2
30683 // ints.
30684 MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
30685 unsigned Opcode =
30686 (IsSigned && Subtarget.hasSSE41()) ? X86ISD::PMULDQ : X86ISD::PMULUDQ;
30687 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
30688 // => <2 x i64> <ae|cg>
30689 SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
30690 DAG.getBitcast(MulVT, A),
30691 DAG.getBitcast(MulVT, B)));
30692 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
30693 // => <2 x i64> <bf|dh>
30694 SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
30695 DAG.getBitcast(MulVT, Odd0),
30696 DAG.getBitcast(MulVT, Odd1)));
30697
30698 // Shuffle it back into the right order.
30699 SmallVector<int, 16> ShufMask(NumElts);
30700 for (int i = 0; i != (int)NumElts; ++i)
30701 ShufMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
30702
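// Illustrative annotation: for NumElts == 4 the formula above yields
// ShufMask = {1, 5, 3, 7}. Mul1 viewed as v4i32 is <lo(ae)|hi(ae)|lo(cg)|hi(cg)>
// and Mul2 is <lo(bf)|hi(bf)|lo(dh)|hi(dh)>, so the shuffle collects the high
// halves <hi(ae)|hi(bf)|hi(cg)|hi(dh)>, which is exactly MULH(A, B).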
30703 SDValue Res = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, ShufMask);
30704
30705 // If we have a signed multiply but no PMULDQ fix up the result of an
30706 // unsigned multiply.
30707 if (IsSigned && !Subtarget.hasSSE41()) {
30708 SDValue Zero = DAG.getConstant(0, dl, VT);
30709 SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
30710 DAG.getSetCC(dl, VT, Zero, A, ISD::SETGT), B);
30711 SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
30712 DAG.getSetCC(dl, VT, Zero, B, ISD::SETGT), A);
30713
30714 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
30715 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Fixup);
30716 }
30717
30718 return Res;
30719 }
30720
30721 // Only i8 vectors should need custom lowering after this.
30722 assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
30723         (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
30724        "Unsupported vector type");
30725
30726 // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
30727 // logical shift down the upper half and pack back to i8.
30728
30729 // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
30730 // and then ashr/lshr the upper bits down to the lower bits before multiply.
30731
30732 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
30733 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
30734 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
30735 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
30736 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
30737 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
30738 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
30739 Mul = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
30740 return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
30741 }
30742
30743 return LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG);
30744}
30745
30746// Custom lowering for SMULO/UMULO.
30747static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget,
30748 SelectionDAG &DAG) {
30749 MVT VT = Op.getSimpleValueType();
30750
30751 // Scalars defer to LowerXALUO.
30752 if (!VT.isVector())
30753 return LowerXALUO(Op, DAG);
30754
30755 SDLoc dl(Op);
30756 bool IsSigned = Op->getOpcode() == ISD::SMULO;
30757 SDValue A = Op.getOperand(0);
30758 SDValue B = Op.getOperand(1);
30759 EVT OvfVT = Op->getValueType(1);
30760
30761 if ((VT == MVT::v32i8 && !Subtarget.hasInt256()) ||
30762 (VT == MVT::v64i8 && !Subtarget.hasBWI())) {
30763 // Extract the LHS Lo/Hi vectors
30764 SDValue LHSLo, LHSHi;
30765 std::tie(LHSLo, LHSHi) = splitVector(A, DAG, dl);
30766
30767 // Extract the RHS Lo/Hi vectors
30768 SDValue RHSLo, RHSHi;
30769 std::tie(RHSLo, RHSHi) = splitVector(B, DAG, dl);
30770
30771 EVT LoOvfVT, HiOvfVT;
30772 std::tie(LoOvfVT, HiOvfVT) = DAG.GetSplitDestVTs(OvfVT);
30773 SDVTList LoVTs = DAG.getVTList(LHSLo.getValueType(), LoOvfVT);
30774 SDVTList HiVTs = DAG.getVTList(LHSHi.getValueType(), HiOvfVT);
30775
30776 // Issue the split operations.
30777 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, LoVTs, LHSLo, RHSLo);
30778 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, HiVTs, LHSHi, RHSHi);
30779
30780 // Join the separate data results and the overflow results.
30781 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
30782 SDValue Ovf = DAG.getNode(ISD::CONCAT_VECTORS, dl, OvfVT, Lo.getValue(1),
30783 Hi.getValue(1));
30784
30785 return DAG.getMergeValues({Res, Ovf}, dl);
30786 }
30787
30788 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30789 EVT SetccVT =
30790 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
30791
30792 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
30793 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
30794 unsigned NumElts = VT.getVectorNumElements();
30795 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
30796 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
30797 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
30798 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
30799 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
30800
30801 SDValue Low = DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
30802
30803 SDValue Ovf;
30804 if (IsSigned) {
30805 SDValue High, LowSign;
30806 if (OvfVT.getVectorElementType() == MVT::i1 &&
30807 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
30808 // Rather than truncating, try to do the compare on vXi16 or vXi32.
30809 // Shift the high down filling with sign bits.
30810 High = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Mul, 8, DAG);
30811 // Fill all 16 bits with the sign bit from the low.
30812 LowSign =
30813 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExVT, Mul, 8, DAG);
30814 LowSign = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, LowSign,
30815 15, DAG);
30816 SetccVT = OvfVT;
30817 if (!Subtarget.hasBWI()) {
30818 // We can't do a vXi16 compare so sign extend to v16i32.
30819 High = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, High);
30820 LowSign = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, LowSign);
30821 }
30822 } else {
30823 // Otherwise do the compare at vXi8.
30824 High = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
30825 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
30826 LowSign =
30827 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
30828 }
30829
30830 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
30831 } else {
30832 SDValue High =
30833 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
30834 if (OvfVT.getVectorElementType() == MVT::i1 &&
30835 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
30836 // Rather than truncating, try to do the compare on vXi16 or vXi32.
30837 SetccVT = OvfVT;
30838 if (!Subtarget.hasBWI()) {
30839 // We can't do a vXi16 compare so sign extend to v16i32.
30840 High = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, High);
30841 }
30842 } else {
30843 // Otherwise do the compare at vXi8.
30844 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
30845 }
30846
30847 Ovf =
30848 DAG.getSetCC(dl, SetccVT, High,
30849 DAG.getConstant(0, dl, High.getValueType()), ISD::SETNE);
30850 }
30851
30852 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
30853
30854 return DAG.getMergeValues({Low, Ovf}, dl);
30855 }
30856
30857 SDValue Low;
30858 SDValue High =
30859 LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG, &Low);
30860
30861 SDValue Ovf;
30862 if (IsSigned) {
30863 // SMULO overflows if the high bits don't match the sign of the low.
30864 SDValue LowSign =
30865 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
30866 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
30867 } else {
30868 // UMULO overflows if the high bits are non-zero.
30869 Ovf =
30870 DAG.getSetCC(dl, SetccVT, High, DAG.getConstant(0, dl, VT), ISD::SETNE);
30871 }
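// Illustrative annotation: overflow examples for one i8 lane. UMULO with
// a = 20, b = 13 gives the 16-bit product 260 = 0x0104, so High = 0x01 != 0
// and the lane overflows. SMULO with a = 20, b = -13 gives -260 = 0xFEFC:
// Low = 0xFC is negative, so its sign-replicated value 0xFF differs from
// High = 0xFE and the lane overflows as well.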
30872
30873 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
30874
30875 return DAG.getMergeValues({Low, Ovf}, dl);
30876}
30877
30878SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
30879 assert(Subtarget.isTargetWin64() && "Unexpected target");
30880 EVT VT = Op.getValueType();
30881 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
30882        "Unexpected return type for lowering");
30883
30884 if (isa<ConstantSDNode>(Op->getOperand(1))) {
30885 SmallVector<SDValue> Result;
30886 if (expandDIVREMByConstant(Op.getNode(), Result, MVT::i64, DAG))
30887 return DAG.getNode(ISD::BUILD_PAIR, SDLoc(Op), VT, Result[0], Result[1]);
30888 }
30889
30890 RTLIB::Libcall LC;
30891 bool isSigned;
30892 switch (Op->getOpcode()) {
30893 default: llvm_unreachable("Unexpected request for libcall!");
30894 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
30895 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
30896 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
30897 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
30898 }
30899
30900 SDLoc dl(Op);
30901 SDValue InChain = DAG.getEntryNode();
30902
30903 TargetLowering::ArgListTy Args;
30904 TargetLowering::ArgListEntry Entry;
30905 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
30906 EVT ArgVT = Op->getOperand(i).getValueType();
30907 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
30908        "Unexpected argument type for lowering");
30909 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
30910 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
30911 MachinePointerInfo MPI =
30912 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
30913 Entry.Node = StackPtr;
30914 InChain =
30915 DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MPI, Align(16));
30916 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
30917 Entry.Ty = PointerType::get(ArgTy,0);
30918 Entry.IsSExt = false;
30919 Entry.IsZExt = false;
30920 Args.push_back(Entry);
30921 }
30922
30923 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
30924 getPointerTy(DAG.getDataLayout()));
30925
30926 TargetLowering::CallLoweringInfo CLI(DAG);
30927 CLI.setDebugLoc(dl)
30928 .setChain(InChain)
30929 .setLibCallee(
30930 getLibcallCallingConv(LC),
30931 static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
30932 std::move(Args))
30933 .setInRegister()
30934 .setSExtResult(isSigned)
30935 .setZExtResult(!isSigned);
30936
30937 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
30938 return DAG.getBitcast(VT, CallInfo.first);
30939}
30940
30941SDValue X86TargetLowering::LowerWin64_FP_TO_INT128(SDValue Op,
30942 SelectionDAG &DAG,
30943 SDValue &Chain) const {
30944 assert(Subtarget.isTargetWin64() && "Unexpected target");
30945 EVT VT = Op.getValueType();
30946 bool IsStrict = Op->isStrictFPOpcode();
30947
30948 SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
30949 EVT ArgVT = Arg.getValueType();
30950
30951 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
30952        "Unexpected return type for lowering");
30953
30954 RTLIB::Libcall LC;
30955 if (Op->getOpcode() == ISD::FP_TO_SINT ||
30956 Op->getOpcode() == ISD::STRICT_FP_TO_SINT)
30957 LC = RTLIB::getFPTOSINT(ArgVT, VT);
30958 else
30959 LC = RTLIB::getFPTOUINT(ArgVT, VT);
30960 assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
30961
30962 SDLoc dl(Op);
30963 MakeLibCallOptions CallOptions;
30964 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
30965
30966 SDValue Result;
30967 // Expect the i128 argument returned as a v2i64 in xmm0, cast back to the
30968 // expected VT (i128).
30969 std::tie(Result, Chain) =
30970 makeLibCall(DAG, LC, MVT::v2i64, Arg, CallOptions, dl, Chain);
30971 Result = DAG.getBitcast(VT, Result);
30972 return Result;
30973}
30974
30975SDValue X86TargetLowering::LowerWin64_INT128_TO_FP(SDValue Op,
30976 SelectionDAG &DAG) const {
30977 assert(Subtarget.isTargetWin64() && "Unexpected target");
30978 EVT VT = Op.getValueType();
30979 bool IsStrict = Op->isStrictFPOpcode();
30980
30981 SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
30982 EVT ArgVT = Arg.getValueType();
30983
30984 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
30985        "Unexpected argument type for lowering");
30986
30987 RTLIB::Libcall LC;
30988 if (Op->getOpcode() == ISD::SINT_TO_FP ||
30989 Op->getOpcode() == ISD::STRICT_SINT_TO_FP)
30990 LC = RTLIB::getSINTTOFP(ArgVT, VT);
30991 else
30992 LC = RTLIB::getUINTTOFP(ArgVT, VT);
30993 assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
30994
30995 SDLoc dl(Op);
30996 MakeLibCallOptions CallOptions;
30997 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
30998
30999 // Pass the i128 argument as an indirect argument on the stack.
31000 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
31001 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
31002 MachinePointerInfo MPI =
31003 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
31004 Chain = DAG.getStore(Chain, dl, Arg, StackPtr, MPI, Align(16));
31005
31006 SDValue Result;
31007 std::tie(Result, Chain) =
31008 makeLibCall(DAG, LC, VT, StackPtr, CallOptions, dl, Chain);
31009 return IsStrict ? DAG.getMergeValues({Result, Chain}, dl) : Result;
31010}
31011
31012// Return true if the required (according to Opcode) shift-imm form is natively
31013// supported by the Subtarget
31014static bool supportedVectorShiftWithImm(EVT VT, const X86Subtarget &Subtarget,
31015 unsigned Opcode) {
31016 if (!VT.isSimple())
31017 return false;
31018
31019 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
31020 return false;
31021
31022 if (VT.getScalarSizeInBits() < 16)
31023 return false;
31024
31025 if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
31026 (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
31027 return true;
31028
31029 bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
31030 (VT.is256BitVector() && Subtarget.hasInt256());
31031
31032 bool AShift = LShift && (Subtarget.hasAVX512() ||
31033 (VT != MVT::v2i64 && VT != MVT::v4i64));
31034 return (Opcode == ISD::SRA) ? AShift : LShift;
31035}
31036
31037// The shift amount is a variable, but it is the same for all vector lanes.
31038// These instructions are defined together with shift-immediate.
31039static
31040bool supportedVectorShiftWithBaseAmnt(EVT VT, const X86Subtarget &Subtarget,
31041 unsigned Opcode) {
31042 return supportedVectorShiftWithImm(VT, Subtarget, Opcode);
31043}
31044
31045// Return true if the required (according to Opcode) variable-shift form is
31046// natively supported by the Subtarget
31047static bool supportedVectorVarShift(EVT VT, const X86Subtarget &Subtarget,
31048 unsigned Opcode) {
31049 if (!VT.isSimple())
31050 return false;
31051
31052 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
31053 return false;
31054
31055 if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
31056 return false;
31057
31058 // vXi16 supported only on AVX-512, BWI
31059 if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
31060 return false;
31061
31062 if (Subtarget.hasAVX512() &&
31063 (Subtarget.useAVX512Regs() || !VT.is512BitVector()))
31064 return true;
31065
31066 bool LShift = VT.is128BitVector() || VT.is256BitVector();
31067 bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
31068 return (Opcode == ISD::SRA) ? AShift : LShift;
31069}
31070
31071static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG,
31072 const X86Subtarget &Subtarget) {
31073 MVT VT = Op.getSimpleValueType();
31074 SDLoc dl(Op);
31075 SDValue R = Op.getOperand(0);
31076 SDValue Amt = Op.getOperand(1);
31077 unsigned X86Opc = getTargetVShiftUniformOpcode(Op.getOpcode(), false);
31078
31079 auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
31080 assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
31081 MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
31082 SDValue Ex = DAG.getBitcast(ExVT, R);
31083
31084 // ashr(R, 63) === cmp_slt(R, 0)
31085 if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
31086 assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
31087        "Unsupported PCMPGT op");
31088 return DAG.getNode(X86ISD::PCMPGT, dl, VT, DAG.getConstant(0, dl, VT), R);
31089 }
31090
31091 if (ShiftAmt >= 32) {
31092 // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
31093 SDValue Upper =
31094 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
31095 SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
31096 ShiftAmt - 32, DAG);
31097 if (VT == MVT::v2i64)
31098 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
31099 if (VT == MVT::v4i64)
31100 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
31101 {9, 1, 11, 3, 13, 5, 15, 7});
31102 } else {
31103 // SRA upper i32, SRL whole i64 and select lower i32.
31104 SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
31105 ShiftAmt, DAG);
31106 SDValue Lower =
31107 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
31108 Lower = DAG.getBitcast(ExVT, Lower);
31109 if (VT == MVT::v2i64)
31110 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
31111 if (VT == MVT::v4i64)
31112 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
31113 {8, 1, 10, 3, 12, 5, 14, 7});
31114 }
31115 return DAG.getBitcast(VT, Ex);
31116 };
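// Illustrative annotation: the lambda above emulates a 64-bit arithmetic shift
// with i32 operations. E.g. ShiftAmt == 40 (>= 32): both VSRAI shifts (by 31
// and by 40 - 32 = 8) are applied to the v4i32 view, and the shuffle keeps,
// per 64-bit lane, the shifted upper i32 as the new low half and its sign
// splat (the sra-by-31 result) as the new high half.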
31117
31118 // Optimize shl/srl/sra with constant shift amount.
31119 APInt APIntShiftAmt;
31120 if (!X86::isConstantSplat(Amt, APIntShiftAmt))
31121 return SDValue();
31122
31123 // If the shift amount is out of range, return undef.
31124 if (APIntShiftAmt.uge(VT.getScalarSizeInBits()))
31125 return DAG.getUNDEF(VT);
31126
31127 uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
31128
31129 if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) {
31130 // Hardware support for vector shifts is sparse which makes us scalarize the
31131 // vector operations in many cases. Also, on sandybridge ADD is faster than
31132 // shl: (shl V, 1) -> (add (freeze V), (freeze V))
31133 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
31134 // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
31135 // must be 0). (add undef, undef) however can be any value. To make this
31136 // safe, we must freeze R to ensure that register allocation uses the same
31137 // register for an undefined value. This ensures that the result will
31138 // still be even and preserves the original semantics.
31139 R = DAG.getFreeze(R);
31140 return DAG.getNode(ISD::ADD, dl, VT, R, R);
31141 }
31142
31143 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
31144 }
31145
31146 // i64 SRA needs to be performed as partial shifts.
31147 if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
31148 (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
31149 Op.getOpcode() == ISD::SRA)
31150 return ArithmeticShiftRight64(ShiftAmt);
31151
31152 if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) ||
31153 (Subtarget.hasBWI() && VT == MVT::v64i8)) {
31154 unsigned NumElts = VT.getVectorNumElements();
31155 MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
31156
31157 // Simple i8 add case
31158 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
31159 // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
31160 // must be 0). (add undef, undef) however can be any value. To make this
31161 // safe, we must freeze R to ensure that register allocation uses the same
31162 // register for an undefined value. This ensures that the result will
31163 // still be even and preserves the original semantics.
31164 R = DAG.getFreeze(R);
31165 return DAG.getNode(ISD::ADD, dl, VT, R, R);
31166 }
31167
31168 // ashr(R, 7) === cmp_slt(R, 0)
31169 if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
31170 SDValue Zeros = DAG.getConstant(0, dl, VT);
31171 if (VT.is512BitVector()) {
31172 assert(VT == MVT::v64i8 && "Unexpected element type!");
31173 SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R, ISD::SETGT);
31174 return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
31175 }
31176 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
31177 }
31178
31179 // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
31180 if (VT == MVT::v16i8 && Subtarget.hasXOP())
31181 return SDValue();
31182
31183 if (Op.getOpcode() == ISD::SHL) {
31184 // Make a large shift.
31185 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R,
31186 ShiftAmt, DAG);
31187 SHL = DAG.getBitcast(VT, SHL);
31188 // Zero out the rightmost bits.
31189 APInt Mask = APInt::getHighBitsSet(8, 8 - ShiftAmt);
31190 return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getConstant(Mask, dl, VT));
31191 }
31192 if (Op.getOpcode() == ISD::SRL) {
31193 // Make a large shift.
31194 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, R,
31195 ShiftAmt, DAG);
31196 SRL = DAG.getBitcast(VT, SRL);
31197 // Zero out the leftmost bits.
31198 APInt Mask = APInt::getLowBitsSet(8, 8 - ShiftAmt);
31199 return DAG.getNode(ISD::AND, dl, VT, SRL, DAG.getConstant(Mask, dl, VT));
31200 }
31201 if (Op.getOpcode() == ISD::SRA) {
31202 // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
31203 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
31204
31205 SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
31206 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
31207 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
31208 return Res;
31209 }
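// Illustrative annotation: the xor/sub identity above re-applies the sign.
// E.g. ShiftAmt == 2 and R == 0x80 (-128): lshr gives 0x20, Mask = 128 >> 2
// = 0x20, the xor gives 0x00 and the subtract yields 0xE0 == -32, matching
// -128 >> 2 performed arithmetically.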
31210 llvm_unreachable("Unknown shift opcode.");
31211 }
31212
31213 return SDValue();
31214}
31215
31216static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG,
31217 const X86Subtarget &Subtarget) {
31218 MVT VT = Op.getSimpleValueType();
31219 SDLoc dl(Op);
31220 SDValue R = Op.getOperand(0);
31221 SDValue Amt = Op.getOperand(1);
31222 unsigned Opcode = Op.getOpcode();
31223 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false);
31224
31225 int BaseShAmtIdx = -1;
31226 if (SDValue BaseShAmt = DAG.getSplatSourceVector(Amt, BaseShAmtIdx)) {
31227 if (supportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode))
31228 return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, BaseShAmtIdx,
31229 Subtarget, DAG);
31230
31231 // vXi8 shifts - shift as v8i16 + mask result.
31232 if (((VT == MVT::v16i8 && !Subtarget.canExtendTo512DQ()) ||
31233 (VT == MVT::v32i8 && !Subtarget.canExtendTo512BW()) ||
31234 VT == MVT::v64i8) &&
31235 !Subtarget.hasXOP()) {
31236 unsigned NumElts = VT.getVectorNumElements();
31237 MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
31238 if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, Opcode)) {
31239 unsigned LogicalOp = (Opcode == ISD::SHL ? ISD::SHL : ISD::SRL);
31240 unsigned LogicalX86Op = getTargetVShiftUniformOpcode(LogicalOp, false);
31241
31242 // Create the mask using vXi16 shifts. For shift-rights we need to move
31243 // the upper byte down before splatting the vXi8 mask.
31244 SDValue BitMask = DAG.getConstant(-1, dl, ExtVT);
31245 BitMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, BitMask,
31246 BaseShAmt, BaseShAmtIdx, Subtarget, DAG);
31247 if (Opcode != ISD::SHL)
31248 BitMask = getTargetVShiftByConstNode(LogicalX86Op, dl, ExtVT, BitMask,
31249 8, DAG);
31250 BitMask = DAG.getBitcast(VT, BitMask);
31251 BitMask = DAG.getVectorShuffle(VT, dl, BitMask, BitMask,
31252 SmallVector<int, 64>(NumElts, 0));
31253
31254 SDValue Res = getTargetVShiftNode(LogicalX86Op, dl, ExtVT,
31255 DAG.getBitcast(ExtVT, R), BaseShAmt,
31256 BaseShAmtIdx, Subtarget, DAG);
31257 Res = DAG.getBitcast(VT, Res);
31258 Res = DAG.getNode(ISD::AND, dl, VT, Res, BitMask);
31259
31260 if (Opcode == ISD::SRA) {
31261 // ashr(R, Amt) === sub(xor(lshr(R, Amt), SignMask), SignMask)
31262 // SignMask = lshr(SignBit, Amt) - safe to do this with PSRLW.
31263 SDValue SignMask = DAG.getConstant(0x8080, dl, ExtVT);
31264 SignMask =
31265 getTargetVShiftNode(LogicalX86Op, dl, ExtVT, SignMask, BaseShAmt,
31266 BaseShAmtIdx, Subtarget, DAG);
31267 SignMask = DAG.getBitcast(VT, SignMask);
31268 Res = DAG.getNode(ISD::XOR, dl, VT, Res, SignMask);
31269 Res = DAG.getNode(ISD::SUB, dl, VT, Res, SignMask);
31270 }
31271 return Res;
31272 }
31273 }
31274 }
31275
31276 return SDValue();
31277}
31278
31279// Convert a shift/rotate left amount to a multiplication scale factor.
31280static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
31281 const X86Subtarget &Subtarget,
31282 SelectionDAG &DAG) {
31283 MVT VT = Amt.getSimpleValueType();
31284 if (!(VT == MVT::v8i16 || VT == MVT::v4i32 ||
31285 (Subtarget.hasInt256() && VT == MVT::v16i16) ||
31286 (Subtarget.hasAVX512() && VT == MVT::v32i16) ||
31287 (!Subtarget.hasAVX512() && VT == MVT::v16i8) ||
31288 (Subtarget.hasInt256() && VT == MVT::v32i8) ||
31289 (Subtarget.hasBWI() && VT == MVT::v64i8)))
31290 return SDValue();
31291
31292 MVT SVT = VT.getVectorElementType();
31293 unsigned SVTBits = SVT.getSizeInBits();
31294 unsigned NumElems = VT.getVectorNumElements();
31295
31296 APInt UndefElts;
31297 SmallVector<APInt> EltBits;
31298 if (getTargetConstantBitsFromNode(Amt, SVTBits, UndefElts, EltBits)) {
31299 APInt One(SVTBits, 1);
31300 SmallVector<SDValue> Elts(NumElems, DAG.getUNDEF(SVT));
31301 for (unsigned I = 0; I != NumElems; ++I) {
31302 if (UndefElts[I] || EltBits[I].uge(SVTBits))
31303 continue;
31304 uint64_t ShAmt = EltBits[I].getZExtValue();
31305 Elts[I] = DAG.getConstant(One.shl(ShAmt), dl, SVT);
31306 }
31307 return DAG.getBuildVector(VT, dl, Elts);
31308 }
31309
31310 // If the target doesn't support variable shifts, use either FP conversion
31311 // or integer multiplication to avoid shifting each element individually.
31312 if (VT == MVT::v4i32) {
31313 Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
31314 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt,
31315 DAG.getConstant(0x3f800000U, dl, VT));
31316 Amt = DAG.getBitcast(MVT::v4f32, Amt);
31317 return DAG.getNode(ISD::FP_TO_SINT, dl, VT, Amt);
31318 }
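// Illustrative annotation: the v4i32 path above builds the float 2^Amt
// directly. Shifting Amt into the exponent field (bit 23) and adding
// 0x3f800000 (the bit pattern of 1.0f) produces a float whose exponent is
// bias + Amt, i.e. 2^Amt, which FP_TO_SINT converts back to the integer scale
// 1 << Amt. E.g. Amt == 5: (5 << 23) + 0x3f800000 == 0x42000000 == 32.0f.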
31319
31320 // AVX2 can more effectively perform this as a zext/trunc to/from v8i32.
31321 if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) {
31322 SDValue Z = DAG.getConstant(0, dl, VT);
31323 SDValue Lo = DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, Z));
31324 SDValue Hi = DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, Z));
31325 Lo = convertShiftLeftToScale(Lo, dl, Subtarget, DAG);
31326 Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG);
31327 if (Subtarget.hasSSE41())
31328 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
31329 return getPack(DAG, Subtarget, dl, VT, Lo, Hi);
31330 }
31331
31332 return SDValue();
31333}
31334
31335static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
31336 SelectionDAG &DAG) {
31337 MVT VT = Op.getSimpleValueType();
31338 SDLoc dl(Op);
31339 SDValue R = Op.getOperand(0);
31340 SDValue Amt = Op.getOperand(1);
31341 unsigned EltSizeInBits = VT.getScalarSizeInBits();
31342 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
31343
31344 unsigned Opc = Op.getOpcode();
31345 unsigned X86OpcV = getTargetVShiftUniformOpcode(Opc, true);
31346 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opc, false);
31347
31348 assert(VT.isVector() && "Custom lowering only for vector shifts!");
31349 assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
31350
31351 if (SDValue V = LowerShiftByScalarImmediate(Op, DAG, Subtarget))
31352 return V;
31353
31354 if (SDValue V = LowerShiftByScalarVariable(Op, DAG, Subtarget))
31355 return V;
31356
31357 if (supportedVectorVarShift(VT, Subtarget, Opc))
31358 return Op;
31359
31360 // i64 vector arithmetic shift can be emulated with the transform:
31361 // M = lshr(SIGN_MASK, Amt)
31362 // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
31363 if (((VT == MVT::v2i64 && !Subtarget.hasXOP()) ||
31364 (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
31365 Opc == ISD::SRA) {
31366 SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
31367 SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
31368 R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
31369 R = DAG.getNode(ISD::XOR, dl, VT, R, M);
31370 R = DAG.getNode(ISD::SUB, dl, VT, R, M);
31371 return R;
31372 }
31373
31374 // XOP has 128-bit variable logical/arithmetic shifts.
31375 // +ve/-ve Amt = shift left/right.
31376 if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 ||
31377 VT == MVT::v8i16 || VT == MVT::v16i8)) {
31378 if (Opc == ISD::SRL || Opc == ISD::SRA) {
31379 SDValue Zero = DAG.getConstant(0, dl, VT);
31380 Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
31381 }
31382 if (Opc == ISD::SHL || Opc == ISD::SRL)
31383 return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
31384 if (Opc == ISD::SRA)
31385 return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
31386 }
31387
31388 // 2i64 vector logical shifts can efficiently avoid scalarization - do the
31389 // shifts per-lane and then shuffle the partial results back together.
31390 if (VT == MVT::v2i64 && Opc != ISD::SRA) {
31391 // Splat the shift amounts so the scalar shifts above will catch it.
31392 SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
31393 SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
31394 SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
31395 SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
31396 return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
31397 }
31398
31399 // If possible, lower this shift as a sequence of two shifts by
31400 // constant plus a BLENDing shuffle instead of scalarizing it.
31401 // Example:
31402 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
31403 //
31404 // Could be rewritten as:
31405 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
31406 //
31407 // The advantage is that the two shifts from the example would be
31408 // lowered as X86ISD::VSRLI nodes in parallel before blending.
31409 if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
31410 (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
31411 SDValue Amt1, Amt2;
31412 unsigned NumElts = VT.getVectorNumElements();
31413 SmallVector<int, 8> ShuffleMask;
31414 for (unsigned i = 0; i != NumElts; ++i) {
31415 SDValue A = Amt->getOperand(i);
31416 if (A.isUndef()) {
31417 ShuffleMask.push_back(SM_SentinelUndef);
31418 continue;
31419 }
31420 if (!Amt1 || Amt1 == A) {
31421 ShuffleMask.push_back(i);
31422 Amt1 = A;
31423 continue;
31424 }
31425 if (!Amt2 || Amt2 == A) {
31426 ShuffleMask.push_back(i + NumElts);
31427 Amt2 = A;
31428 continue;
31429 }
31430 break;
31431 }
31432
31433 // Only perform this blend if we can perform it without loading a mask.
31434 if (ShuffleMask.size() == NumElts && Amt1 && Amt2 &&
31435 (VT != MVT::v16i16 ||
31436 is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
31437 (VT == MVT::v4i32 || Subtarget.hasSSE41() || Opc != ISD::SHL ||
31438 canWidenShuffleElements(ShuffleMask))) {
31439 auto *Cst1 = dyn_cast<ConstantSDNode>(Amt1);
31440 auto *Cst2 = dyn_cast<ConstantSDNode>(Amt2);
31441 if (Cst1 && Cst2 && Cst1->getAPIntValue().ult(EltSizeInBits) &&
31442 Cst2->getAPIntValue().ult(EltSizeInBits)) {
31443 SDValue Shift1 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
31444 Cst1->getZExtValue(), DAG);
31445 SDValue Shift2 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
31446 Cst2->getZExtValue(), DAG);
31447 return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
31448 }
31449 }
31450 }
31451
31452 // If possible, lower this packed shift into a vector multiply instead of
31453 // expanding it into a sequence of scalar shifts.
31454 // For v32i8 cases, it might be quicker to split/extend to vXi16 shifts.
31455 if (Opc == ISD::SHL && !(VT == MVT::v32i8 && (Subtarget.hasXOP() ||
31456 Subtarget.canExtendTo512BW())))
31457 if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
31458 return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
31459
31460 // Constant ISD::SRL can be performed efficiently on vXi16 vectors as we
31461 // can replace with ISD::MULHU, creating scale factor from (NumEltBits - Amt).
31462 if (Opc == ISD::SRL && ConstantAmt &&
31463 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
31464 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
31465 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
31466 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
31467 SDValue Zero = DAG.getConstant(0, dl, VT);
31468 SDValue ZAmt = DAG.getSetCC(dl, VT, Amt, Zero, ISD::SETEQ);
31469 SDValue Res = DAG.getNode(ISD::MULHU, dl, VT, R, Scale);
31470 return DAG.getSelect(dl, VT, ZAmt, R, Res);
31471 }
31472 }
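// Illustrative annotation: the MULHU rewrite above relies on
//   srl(x, a) == mulhu(x, 1 << (16 - a))  for 16-bit lanes and 1 <= a <= 15,
// e.g. a == 4: mulhu(x, 0x1000) == (x * 0x1000) >> 16 == x >> 4. The select
// handles a == 0, where 16 - a == 16 has no in-range scale factor.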
31473
31474 // Constant ISD::SRA can be performed efficiently on vXi16 vectors as we
31475 // can replace with ISD::MULHS, creating scale factor from (NumEltBits - Amt).
31476 // TODO: Special case handling for shift by 0/1, really we can afford either
31477 // of these cases in pre-SSE41/XOP/AVX512 but not both.
31478 if (Opc == ISD::SRA && ConstantAmt &&
31479 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256())) &&
31480 ((Subtarget.hasSSE41() && !Subtarget.hasXOP() &&
31481 !Subtarget.hasAVX512()) ||
31482 DAG.isKnownNeverZero(Amt))) {
31483 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
31484 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
31485 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
31486 SDValue Amt0 =
31487 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(0, dl, VT), ISD::SETEQ);
31488 SDValue Amt1 =
31489 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(1, dl, VT), ISD::SETEQ);
31490 SDValue Sra1 =
31491 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, 1, DAG);
31492 SDValue Res = DAG.getNode(ISD::MULHS, dl, VT, R, Scale);
31493 Res = DAG.getSelect(dl, VT, Amt0, R, Res);
31494 return DAG.getSelect(dl, VT, Amt1, Sra1, Res);
31495 }
31496 }
31497
31498 // v4i32 Non Uniform Shifts.
31499 // If the shift amount is constant we can shift each lane using the SSE2
31500 // immediate shifts, else we need to zero-extend each lane to the lower i64
31501 // and shift using the SSE2 variable shifts.
31502 // The separate results can then be blended together.
31503 if (VT == MVT::v4i32) {
31504 SDValue Amt0, Amt1, Amt2, Amt3;
31505 if (ConstantAmt) {
31506 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
31507 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
31508 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
31509 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
31510 } else {
31511 // The SSE2 shifts use the lower i64 as the same shift amount for
31512 // all lanes and the upper i64 is ignored. On AVX we're better off
31513 // just zero-extending, but for SSE just duplicating the top 16-bits is
31514 // cheaper and has the same effect for out of range values.
31515 if (Subtarget.hasAVX()) {
31516 SDValue Z = DAG.getConstant(0, dl, VT);
31517 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
31518 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
31519 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
31520 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
31521 } else {
31522 SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt);
31523 SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
31524 {4, 5, 6, 7, -1, -1, -1, -1});
31525 SDValue Msk02 = getV4X86ShuffleImm8ForMask({0, 1, 1, 1}, dl, DAG);
31526 SDValue Msk13 = getV4X86ShuffleImm8ForMask({2, 3, 3, 3}, dl, DAG);
31527 Amt0 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk02);
31528 Amt1 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk13);
31529 Amt2 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk02);
31530 Amt3 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk13);
31531 }
31532 }
31533
31534 unsigned ShOpc = ConstantAmt ? Opc : X86OpcV;
31535 SDValue R0 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt0));
31536 SDValue R1 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt1));
31537 SDValue R2 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt2));
31538 SDValue R3 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt3));
31539
31540 // Merge the shifted lane results optimally with/without PBLENDW.
31541 // TODO - ideally shuffle combining would handle this.
31542 if (Subtarget.hasSSE41()) {
31543 SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
31544 SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
31545 return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
31546 }
31547 SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5});
31548 SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7});
31549 return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
31550 }
31551
31552 // It's worth extending once and using the vXi16/vXi32 shifts for smaller
31553 // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
31554 // make the existing SSE solution better.
31555 // NOTE: We honor preferred vector width before promoting to 512-bits.
31556 if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
31557 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) ||
31558 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) ||
31559 (Subtarget.canExtendTo512BW() && VT == MVT::v32i8) ||
31560 (Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) {
31561 assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&
31562        "Unexpected vector type");
31563 MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
31564 MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
31565 unsigned ExtOpc = Opc == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
31566 R = DAG.getNode(ExtOpc, dl, ExtVT, R);
31567 Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
31568 return DAG.getNode(ISD::TRUNCATE, dl, VT,
31569 DAG.getNode(Opc, dl, ExtVT, R, Amt));
31570 }
31571
31572 // Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we
31573 // extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI.
31574 if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) &&
31575 (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
31576 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
31577 !Subtarget.hasXOP()) {
31578 int NumElts = VT.getVectorNumElements();
31579 SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8);
31580
31581 // Extend constant shift amount to vXi16 (it doesn't matter if the type
31582 // isn't legal).
31583 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
31584 Amt = DAG.getZExtOrTrunc(Amt, dl, ExVT);
31585 Amt = DAG.getNode(ISD::SUB, dl, ExVT, DAG.getConstant(8, dl, ExVT), Amt);
31586 Amt = DAG.getNode(ISD::SHL, dl, ExVT, DAG.getConstant(1, dl, ExVT), Amt);
31587 assert(ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()) &&
31588        "Constant build vector expected");
31589
31590 if (VT == MVT::v16i8 && Subtarget.hasInt256()) {
31591 R = Opc == ISD::SRA ? DAG.getSExtOrTrunc(R, dl, ExVT)
31592 : DAG.getZExtOrTrunc(R, dl, ExVT);
31593 R = DAG.getNode(ISD::MUL, dl, ExVT, R, Amt);
31594 R = DAG.getNode(X86ISD::VSRLI, dl, ExVT, R, Cst8);
31595 return DAG.getZExtOrTrunc(R, dl, VT);
31596 }
31597
31598 SmallVector<SDValue, 16> LoAmt, HiAmt;
31599 for (int i = 0; i != NumElts; i += 16) {
31600 for (int j = 0; j != 8; ++j) {
31601 LoAmt.push_back(Amt.getOperand(i + j));
31602 HiAmt.push_back(Amt.getOperand(i + j + 8));
31603 }
31604 }
31605
31606 MVT VT16 = MVT::getVectorVT(MVT::i16, NumElts / 2);
31607 SDValue LoA = DAG.getBuildVector(VT16, dl, LoAmt);
31608 SDValue HiA = DAG.getBuildVector(VT16, dl, HiAmt);
31609
31610 SDValue LoR = DAG.getBitcast(VT16, getUnpackl(DAG, dl, VT, R, R));
31611 SDValue HiR = DAG.getBitcast(VT16, getUnpackh(DAG, dl, VT, R, R));
31612 LoR = DAG.getNode(X86OpcI, dl, VT16, LoR, Cst8);
31613 HiR = DAG.getNode(X86OpcI, dl, VT16, HiR, Cst8);
31614 LoR = DAG.getNode(ISD::MUL, dl, VT16, LoR, LoA);
31615 HiR = DAG.getNode(ISD::MUL, dl, VT16, HiR, HiA);
31616 LoR = DAG.getNode(X86ISD::VSRLI, dl, VT16, LoR, Cst8);
31617 HiR = DAG.getNode(X86ISD::VSRLI, dl, VT16, HiR, Cst8);
31618 return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR);
31619 }
31620
31621 if (VT == MVT::v16i8 ||
31622 (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
31623 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
31624 MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
31625
31626 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
31627 if (VT.is512BitVector()) {
31628 // On AVX512BW targets we make use of the fact that VSELECT lowers
31629 // to a masked blend which selects bytes based just on the sign bit
31630 // extracted to a mask.
31631 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
31632 V0 = DAG.getBitcast(VT, V0);
31633 V1 = DAG.getBitcast(VT, V1);
31634 Sel = DAG.getBitcast(VT, Sel);
31635 Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel,
31636 ISD::SETGT);
31637 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
31638 } else if (Subtarget.hasSSE41()) {
31639 // On SSE41 targets we can use PBLENDVB which selects bytes based just
31640 // on the sign bit.
31641 V0 = DAG.getBitcast(VT, V0);
31642 V1 = DAG.getBitcast(VT, V1);
31643 Sel = DAG.getBitcast(VT, Sel);
31644 return DAG.getBitcast(SelVT,
31645 DAG.getNode(X86ISD::BLENDV, dl, VT, Sel, V0, V1));
31646 }
31647 // On pre-SSE41 targets we test for the sign bit by comparing to
31648 // zero - a negative value will set all bits of the lanes to true
31649 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
31650 SDValue Z = DAG.getConstant(0, dl, SelVT);
31651 SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
31652 return DAG.getSelect(dl, SelVT, C, V0, V1);
31653 };
31654
31655 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
31656 // We can safely do this using i16 shifts as we're only interested in
31657 // the 3 lower bits of each byte.
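      // Worked example (illustrative): for a byte amount a = 5 (0b00000101),
      // a << 5 = 0xA0, so the sign bit is set and the shift-by-4 result is
      // taken; after the first 'a += a', bit 1 (here 0) sits in the sign
      // position and r is kept; after the second 'a += a', bit 0 (here 1)
      // selects the shift-by-1 result, giving a total shift of 4 + 1 = 5.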
31658 Amt = DAG.getBitcast(ExtVT, Amt);
31659 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, Amt, 5, DAG);
31660 Amt = DAG.getBitcast(VT, Amt);
31661
31662 if (Opc == ISD::SHL || Opc == ISD::SRL) {
31663 // r = VSELECT(r, shift(r, 4), a);
31664 SDValue M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(4, dl, VT));
31665 R = SignBitSelect(VT, Amt, M, R);
31666
31667 // a += a
31668 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31669
31670 // r = VSELECT(r, shift(r, 2), a);
31671 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(2, dl, VT));
31672 R = SignBitSelect(VT, Amt, M, R);
31673
31674 // a += a
31675 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31676
31677 // return VSELECT(r, shift(r, 1), a);
31678 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(1, dl, VT));
31679 R = SignBitSelect(VT, Amt, M, R);
31680 return R;
31681 }
31682
31683 if (Opc == ISD::SRA) {
31684 // For SRA we need to unpack each byte to the higher byte of a i16 vector
31685 // so we can correctly sign extend. We don't care what happens to the
31686 // lower byte.
31687 SDValue ALo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
31688 SDValue AHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
31689 SDValue RLo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), R);
31690 SDValue RHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), R);
31691 ALo = DAG.getBitcast(ExtVT, ALo);
31692 AHi = DAG.getBitcast(ExtVT, AHi);
31693 RLo = DAG.getBitcast(ExtVT, RLo);
31694 RHi = DAG.getBitcast(ExtVT, RHi);
31695
31696 // r = VSELECT(r, shift(r, 4), a);
31697 SDValue MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 4, DAG);
31698 SDValue MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 4, DAG);
31699 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
31700 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
31701
31702 // a += a
31703 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
31704 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
31705
31706 // r = VSELECT(r, shift(r, 2), a);
31707 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 2, DAG);
31708 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 2, DAG);
31709 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
31710 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
31711
31712 // a += a
31713 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
31714 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
31715
31716 // r = VSELECT(r, shift(r, 1), a);
31717 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 1, DAG);
31718 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 1, DAG);
31719 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
31720 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
31721
31722 // Logical shift the result back to the lower byte, leaving a zero upper
31723 // byte meaning that we can safely pack with PACKUSWB.
31724 RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RLo, 8, DAG);
31725 RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RHi, 8, DAG);
31726 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
31727 }
31728 }
31729
31730 if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
31731 MVT ExtVT = MVT::v8i32;
31732 SDValue Z = DAG.getConstant(0, dl, VT);
31733 SDValue ALo = getUnpackl(DAG, dl, VT, Amt, Z);
31734 SDValue AHi = getUnpackh(DAG, dl, VT, Amt, Z);
31735 SDValue RLo = getUnpackl(DAG, dl, VT, Z, R);
31736 SDValue RHi = getUnpackh(DAG, dl, VT, Z, R);
31737 ALo = DAG.getBitcast(ExtVT, ALo);
31738 AHi = DAG.getBitcast(ExtVT, AHi);
31739 RLo = DAG.getBitcast(ExtVT, RLo);
31740 RHi = DAG.getBitcast(ExtVT, RHi);
31741 SDValue Lo = DAG.getNode(Opc, dl, ExtVT, RLo, ALo);
31742 SDValue Hi = DAG.getNode(Opc, dl, ExtVT, RHi, AHi);
31743 Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Lo, 16, DAG);
31744 Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Hi, 16, DAG);
31745 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
31746 }
31747
31748 if (VT == MVT::v8i16) {
31749 // If we have a constant shift amount, the non-SSE41 path is best as
31750 // avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW.
31751 bool UseSSE41 = Subtarget.hasSSE41() &&
31752 !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
31753
31754 auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
31755 // On SSE41 targets we can use PBLENDVB which selects bytes based just on
31756 // the sign bit.
31757 if (UseSSE41) {
31758 MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
31759 V0 = DAG.getBitcast(ExtVT, V0);
31760 V1 = DAG.getBitcast(ExtVT, V1);
31761 Sel = DAG.getBitcast(ExtVT, Sel);
31762 return DAG.getBitcast(
31763 VT, DAG.getNode(X86ISD::BLENDV, dl, ExtVT, Sel, V0, V1));
31764 }
31765 // On pre-SSE41 targets we splat the sign bit - a negative value will
31766 // set all bits of the lanes to true and VSELECT uses that in
31767 // its OR(AND(V0,C),AND(V1,~C)) lowering.
31768 SDValue C =
31769 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Sel, 15, DAG);
31770 return DAG.getSelect(dl, VT, C, V0, V1);
31771 };
31772
31773 // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
31774 if (UseSSE41) {
31775 // On SSE41 targets we need to replicate the shift mask in both
31776 // bytes for PBLENDVB.
31777 Amt = DAG.getNode(
31778 ISD::OR, dl, VT,
31779 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 4, DAG),
31780 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG));
31781 } else {
31782 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG);
31783 }
31784
31785 // r = VSELECT(r, shift(r, 8), a);
31786 SDValue M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 8, DAG);
31787 R = SignBitSelect(Amt, M, R);
31788
31789 // a += a
31790 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31791
31792 // r = VSELECT(r, shift(r, 4), a);
31793 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 4, DAG);
31794 R = SignBitSelect(Amt, M, R);
31795
31796 // a += a
31797 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31798
31799 // r = VSELECT(r, shift(r, 2), a);
31800 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 2, DAG);
31801 R = SignBitSelect(Amt, M, R);
31802
31803 // a += a
31804 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31805
31806 // return VSELECT(r, shift(r, 1), a);
31807 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 1, DAG);
31808 R = SignBitSelect(Amt, M, R);
31809 return R;
31810 }
31811
31812 // Decompose 256-bit shifts into 128-bit shifts.
31813 if (VT.is256BitVector())
31814 return splitVectorIntBinary(Op, DAG);
31815
31816 if (VT == MVT::v32i16 || VT == MVT::v64i8)
31817 return splitVectorIntBinary(Op, DAG);
31818
31819 return SDValue();
31820}
31821
31822static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
31823 SelectionDAG &DAG) {
31824 MVT VT = Op.getSimpleValueType();
31825 assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) &&
31826        "Unexpected funnel shift opcode!");
31827
31828 SDLoc DL(Op);
31829 SDValue Op0 = Op.getOperand(0);
31830 SDValue Op1 = Op.getOperand(1);
31831 SDValue Amt = Op.getOperand(2);
31832 unsigned EltSizeInBits = VT.getScalarSizeInBits();
31833 bool IsFSHR = Op.getOpcode() == ISD::FSHR;
31834
31835 if (VT.isVector()) {
31836 APInt APIntShiftAmt;
31837 bool IsCstSplat = X86::isConstantSplat(Amt, APIntShiftAmt);
31838
31839 if (Subtarget.hasVBMI2() && EltSizeInBits > 8) {
31840 if (IsFSHR)
31841 std::swap(Op0, Op1);
31842
31843 if (IsCstSplat) {
31844 uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits);
31845 SDValue Imm = DAG.getTargetConstant(ShiftAmt, DL, MVT::i8);
31846 return getAVX512Node(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT,
31847 {Op0, Op1, Imm}, DAG, Subtarget);
31848 }
31849 return getAVX512Node(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT,
31850 {Op0, Op1, Amt}, DAG, Subtarget);
31851 }
31852 assert((VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 ||
31853         VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16 ||
31854         VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) &&
31855        "Unexpected funnel shift type!");
31856
31857 // fshl(x,y,z) -> (unpack(y,x) << (z & (bw-1))) >> bw.
31858 // fshr(x,y,z) -> unpack(y,x) >> (z & (bw-1)).
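      // Worked example (illustrative, i8 elements): fshl(0xAB, 0xCD, 3) forms
      // the i16 value 0xABCD via the unpack, 0xABCD << 3 = 0x5E68 (mod 2^16),
      // and 0x5E68 >> 8 = 0x5E, matching ((0xAB << 3) | (0xCD >> 5)) & 0xFF.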
31859 if (IsCstSplat)
31860 return SDValue();
31861
31862 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
31863 SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31864 bool IsCst = ISD::isBuildVectorOfConstantSDNodes(AmtMod.getNode());
31865
31866 // Constant vXi16 funnel shifts can be efficiently handled by default.
31867 if (IsCst && EltSizeInBits == 16)
31868 return SDValue();
31869
31870 unsigned ShiftOpc = IsFSHR ? ISD::SRL : ISD::SHL;
31871 unsigned NumElts = VT.getVectorNumElements();
31872 MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
31873 MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
31874
31875 // Split 256-bit integers on XOP/pre-AVX2 targets.
31876 // Split 512-bit integers on non 512-bit BWI targets.
31877 if ((VT.is256BitVector() && ((Subtarget.hasXOP() && EltSizeInBits < 16) ||
31878 !Subtarget.hasAVX2())) ||
31879 (VT.is512BitVector() && !Subtarget.useBWIRegs() &&
31880 EltSizeInBits < 32)) {
31881 // Pre-mask the amount modulo using the wider vector.
31882 Op = DAG.getNode(Op.getOpcode(), DL, VT, Op0, Op1, AmtMod);
31883 return splitVectorOp(Op, DAG);
31884 }
31885
31886 // Attempt to fold scalar shift as unpack(y,x) << zext(splat(z))
31887 if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, ShiftOpc)) {
31888 int ScalarAmtIdx = -1;
31889 if (SDValue ScalarAmt = DAG.getSplatSourceVector(AmtMod, ScalarAmtIdx)) {
31890 // Uniform vXi16 funnel shifts can be efficiently handled by default.
31891 if (EltSizeInBits == 16)
31892 return SDValue();
31893
31894 SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
31895 SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
31896 Lo = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Lo, ScalarAmt,
31897 ScalarAmtIdx, Subtarget, DAG);
31898 Hi = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Hi, ScalarAmt,
31899 ScalarAmtIdx, Subtarget, DAG);
31900 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
31901 }
31902 }
31903
31904 MVT WideSVT = MVT::getIntegerVT(
31905 std::min<unsigned>(EltSizeInBits * 2, Subtarget.hasBWI() ? 16 : 32));
31906 MVT WideVT = MVT::getVectorVT(WideSVT, NumElts);
31907
31908 // If per-element shifts are legal, fallback to generic expansion.
31909 if (supportedVectorVarShift(VT, Subtarget, ShiftOpc) || Subtarget.hasXOP())
31910 return SDValue();
31911
31912 // Attempt to fold as:
31913 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
31914 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
31915 if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
31916 supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
31917 Op0 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Op0);
31918 Op1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, Op1);
31919 AmtMod = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
31920 Op0 = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, Op0,
31921 EltSizeInBits, DAG);
31922 SDValue Res = DAG.getNode(ISD::OR, DL, WideVT, Op0, Op1);
31923 Res = DAG.getNode(ShiftOpc, DL, WideVT, Res, AmtMod);
31924 if (!IsFSHR)
31925 Res = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, Res,
31926 EltSizeInBits, DAG);
31927 return DAG.getNode(ISD::TRUNCATE, DL, VT, Res);
31928 }
31929
31930 // Attempt to fold per-element (ExtVT) shift as unpack(y,x) << zext(z)
31931 if (((IsCst || !Subtarget.hasAVX512()) && !IsFSHR && EltSizeInBits <= 16) ||
31932 supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc)) {
31933 SDValue Z = DAG.getConstant(0, DL, VT);
31934 SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
31935 SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
31936 SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
31937 SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
31938 SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
31939 SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
31940 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
31941 }
31942
31943 // Fallback to generic expansion.
31944 return SDValue();
31945 }
31946 assert(
31947     (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
31948     "Unexpected funnel shift type!");
31949
31950 // Expand slow SHLD/SHRD cases if we are not optimizing for size.
31951 bool OptForSize = DAG.shouldOptForSize();
31952 bool ExpandFunnel = !OptForSize && Subtarget.isSHLDSlow();
31953
31954 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
31955 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
31956 if ((VT == MVT::i8 || (ExpandFunnel && VT == MVT::i16)) &&
31957 !isa<ConstantSDNode>(Amt)) {
31958 SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, Amt.getValueType());
31959 SDValue HiShift = DAG.getConstant(EltSizeInBits, DL, Amt.getValueType());
31960 Op0 = DAG.getAnyExtOrTrunc(Op0, DL, MVT::i32);
31961 Op1 = DAG.getZExtOrTrunc(Op1, DL, MVT::i32);
31962 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt, Mask);
31963 SDValue Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Op0, HiShift);
31964 Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, Op1);
31965 if (IsFSHR) {
31966 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, Amt);
31967 } else {
31968 Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Res, Amt);
31969 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, HiShift);
31970 }
31971 return DAG.getZExtOrTrunc(Res, DL, VT);
31972 }
31973
31974 if (VT == MVT::i8 || ExpandFunnel)
31975 return SDValue();
31976
31977 // i16 needs to modulo the shift amount, but i32/i64 have implicit modulo.
31978 if (VT == MVT::i16) {
31979 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt,
31980 DAG.getConstant(15, DL, Amt.getValueType()));
31981 unsigned FSHOp = (IsFSHR ? X86ISD::FSHR : X86ISD::FSHL);
31982 return DAG.getNode(FSHOp, DL, VT, Op0, Op1, Amt);
31983 }
31984
31985 return Op;
31986}
31987
31988static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
31989 SelectionDAG &DAG) {
31990 MVT VT = Op.getSimpleValueType();
31991 assert(VT.isVector() && "Custom lowering only for vector rotates!");
31992
31993 SDLoc DL(Op);
31994 SDValue R = Op.getOperand(0);
31995 SDValue Amt = Op.getOperand(1);
31996 unsigned Opcode = Op.getOpcode();
31997 unsigned EltSizeInBits = VT.getScalarSizeInBits();
31998 int NumElts = VT.getVectorNumElements();
31999 bool IsROTL = Opcode == ISD::ROTL;
32000
32001 // Check for constant splat rotation amount.
32002 APInt CstSplatValue;
32003 bool IsCstSplat = X86::isConstantSplat(Amt, CstSplatValue);
32004
32005 // Check for splat rotate by zero.
32006 if (IsCstSplat && CstSplatValue.urem(EltSizeInBits) == 0)
32007 return R;
32008
32009 // AVX512 implicitly uses modulo rotation amounts.
32010 if (Subtarget.hasAVX512() && 32 <= EltSizeInBits) {
32011 // Attempt to rotate by immediate.
32012 if (IsCstSplat) {
32013 unsigned RotOpc = IsROTL ? X86ISD::VROTLI : X86ISD::VROTRI;
32014 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
32015 return DAG.getNode(RotOpc, DL, VT, R,
32016 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
32017 }
32018
32019 // Else, fall-back on VPROLV/VPRORV.
32020 return Op;
32021 }
32022
32023 // AVX512 VBMI2 vXi16 - lower to funnel shifts.
32024 if (Subtarget.hasVBMI2() && 16 == EltSizeInBits) {
32025 unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
32026 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
32027 }
32028
32029 SDValue Z = DAG.getConstant(0, DL, VT);
32030
32031 if (!IsROTL) {
32032 // If the ISD::ROTR amount is constant, we're always better off converting to
32033 // ISD::ROTL.
32034 if (SDValue NegAmt = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {Z, Amt}))
32035 return DAG.getNode(ISD::ROTL, DL, VT, R, NegAmt);
32036
32037 // XOP targets always prefer ISD::ROTL.
32038 if (Subtarget.hasXOP())
32039 return DAG.getNode(ISD::ROTL, DL, VT, R,
32040 DAG.getNode(ISD::SUB, DL, VT, Z, Amt));
32041 }
32042
32043 // Split 256-bit integers on XOP/pre-AVX2 targets.
32044 if (VT.is256BitVector() && (Subtarget.hasXOP() || !Subtarget.hasAVX2()))
32045 return splitVectorIntBinary(Op, DAG);
32046
32047 // XOP has 128-bit vector variable + immediate rotates.
32048 // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
32049 // XOP implicitly uses modulo rotation amounts.
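      // E.g. the variable forms (VPROTB/W/D/Q) take a signed per-element count,
      // so a count of -3 is a rotate right by 3 - hence the ISD::ROTR ->
      // ISD::ROTL conversion with a negated amount above.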
32050 if (Subtarget.hasXOP()) {
32051 assert(IsROTL && "Only ROTL expected");
32052 assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
32053
32054 // Attempt to rotate by immediate.
32055 if (IsCstSplat) {
32056 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
32057 return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
32058 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
32059 }
32060
32061 // Use general rotate by variable (per-element).
32062 return Op;
32063 }
32064
32065 // Rotate by a uniform constant - expand back to shifts.
32066 if (IsCstSplat)
32067 return SDValue();
32068
32069 // Split 512-bit integers on non 512-bit BWI targets.
32070 if (VT.is512BitVector() && !Subtarget.useBWIRegs())
32071 return splitVectorIntBinary(Op, DAG);
32072
32073 assert(
32074     (VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 ||
32075      ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8) &&
32076       Subtarget.hasAVX2()) ||
32077      ((VT == MVT::v32i16 || VT == MVT::v64i8) && Subtarget.useBWIRegs())) &&
32078     "Only vXi32/vXi16/vXi8 vector rotates supported");
32079
32080 MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
32081 MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
32082
32083 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
32084 SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
32085
32086 // Attempt to fold as unpack(x,x) << zext(splat(y)):
32087 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
32088 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
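      // Worked example (illustrative, i8 elements): rotl(0xB4, 3) unpacks to
      // the i16 value 0xB4B4, 0xB4B4 << 3 = 0xA5A0 (mod 2^16), and
      // 0xA5A0 >> 8 = 0xA5 = rotl(0xB4, 3).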
32089 if (EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) {
32090 int BaseRotAmtIdx = -1;
32091 if (SDValue BaseRotAmt = DAG.getSplatSourceVector(AmtMod, BaseRotAmtIdx)) {
32092 if (EltSizeInBits == 16 && Subtarget.hasSSE41()) {
32093 unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
32094 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
32095 }
32096 unsigned ShiftX86Opc = IsROTL ? X86ISD::VSHLI : X86ISD::VSRLI;
32097 SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
32098 SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
32099 Lo = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Lo, BaseRotAmt,
32100 BaseRotAmtIdx, Subtarget, DAG);
32101 Hi = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Hi, BaseRotAmt,
32102 BaseRotAmtIdx, Subtarget, DAG);
32103 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
32104 }
32105 }
32106
32107 // v16i8/v32i8/v64i8: Split rotation into rot4/rot2/rot1 stages and select by
32108 // the amount bit.
32109 // TODO: We're doing nothing here that we couldn't do for funnel shifts.
32110 if (EltSizeInBits == 8) {
32111 bool IsConstAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
32112 MVT WideVT =
32113 MVT::getVectorVT(Subtarget.hasBWI() ? MVT::i16 : MVT::i32, NumElts);
32114 unsigned ShiftOpc = IsROTL ? ISD::SHL : ISD::SRL;
32115
32116 // Attempt to fold as:
32117 // rotl(x,y) -> (((aext(x) << bw) | zext(x)) << (y & (bw-1))) >> bw.
32118 // rotr(x,y) -> (((aext(x) << bw) | zext(x)) >> (y & (bw-1))).
32119 if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
32120 supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
32121 // If we're rotating by constant, just use default promotion.
32122 if (IsConstAmt)
32123 return SDValue();
32124 // See if we can perform this by widening to vXi16 or vXi32.
32125 R = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, R);
32126 R = DAG.getNode(
32127 ISD::OR, DL, WideVT, R,
32128 getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, R, 8, DAG));
32129 Amt = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
32130 R = DAG.getNode(ShiftOpc, DL, WideVT, R, Amt);
32131 if (IsROTL)
32132 R = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, R, 8, DAG);
32133 return DAG.getNode(ISD::TRUNCATE, DL, VT, R);
32134 }
32135
32136 // Attempt to fold as unpack(x,x) << zext(y):
32137 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
32138 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
32139 if (IsConstAmt || supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc)) {
32140 // See if we can perform this by unpacking to lo/hi vXi16.
32141 SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
32142 SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
32143 SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
32144 SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
32145 SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
32146 SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
32147 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
32148 }
32149 assert((VT == MVT::v16i8 || VT == MVT::v32i8) && "Unsupported vXi8 type");
32150
32151 // We don't need ModuloAmt here as we just peek at individual bits.
32152 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
32153 if (Subtarget.hasSSE41()) {
32154 // On SSE41 targets we can use PBLENDVB which selects bytes based just
32155 // on the sign bit.
32156 V0 = DAG.getBitcast(VT, V0);
32157 V1 = DAG.getBitcast(VT, V1);
32158 Sel = DAG.getBitcast(VT, Sel);
32159 return DAG.getBitcast(SelVT,
32160 DAG.getNode(X86ISD::BLENDV, DL, VT, Sel, V0, V1));
32161 }
32162 // On pre-SSE41 targets we test for the sign bit by comparing to
32163 // zero - a negative value will set all bits of the lanes to true
32164 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
32165 SDValue Z = DAG.getConstant(0, DL, SelVT);
32166 SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel);
32167 return DAG.getSelect(DL, SelVT, C, V0, V1);
32168 };
32169
32170 // ISD::ROTR is currently only profitable on AVX512 targets with VPTERNLOG.
32171 if (!IsROTL && !useVPTERNLOG(Subtarget, VT)) {
32172 Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
32173 IsROTL = true;
32174 }
32175
32176 unsigned ShiftLHS = IsROTL ? ISD::SHL : ISD::SRL;
32177 unsigned ShiftRHS = IsROTL ? ISD::SRL : ISD::SHL;
32178
32179 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
32180 // We can safely do this using i16 shifts as we're only interested in
32181 // the 3 lower bits of each byte.
32182 Amt = DAG.getBitcast(ExtVT, Amt);
32183 Amt = DAG.getNode(ISD::SHL, DL, ExtVT, Amt, DAG.getConstant(5, DL, ExtVT));
32184 Amt = DAG.getBitcast(VT, Amt);
32185
32186 // r = VSELECT(r, rot(r, 4), a);
32187 SDValue M;
32188 M = DAG.getNode(
32189 ISD::OR, DL, VT,
32190 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(4, DL, VT)),
32191 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(4, DL, VT)));
32192 R = SignBitSelect(VT, Amt, M, R);
32193
32194 // a += a
32195 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
32196
32197 // r = VSELECT(r, rot(r, 2), a);
32198 M = DAG.getNode(
32199 ISD::OR, DL, VT,
32200 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(2, DL, VT)),
32201 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(6, DL, VT)));
32202 R = SignBitSelect(VT, Amt, M, R);
32203
32204 // a += a
32205 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
32206
32207 // return VSELECT(r, rot(r, 1), a);
32208 M = DAG.getNode(
32209 ISD::OR, DL, VT,
32210 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(1, DL, VT)),
32211 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(7, DL, VT)));
32212 return SignBitSelect(VT, Amt, M, R);
32213 }
32214
32215 bool IsSplatAmt = DAG.isSplatValue(Amt);
32216 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
32217 bool LegalVarShifts = supportedVectorVarShift(VT, Subtarget, ISD::SHL) &&
32218 supportedVectorVarShift(VT, Subtarget, ISD::SRL);
32219
32220 // Fallback for splats + all supported variable shifts.
32221 // Fallback for non-constant AVX2 vXi16 as well.
32222 if (IsSplatAmt || LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) {
32223 Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
32224 SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
32225 AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
32226 SDValue SHL = DAG.getNode(IsROTL ? ISD::SHL : ISD::SRL, DL, VT, R, Amt);
32227 SDValue SRL = DAG.getNode(IsROTL ? ISD::SRL : ISD::SHL, DL, VT, R, AmtR);
32228 return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
32229 }
32230
32231 // Everything below assumes ISD::ROTL.
32232 if (!IsROTL) {
32233 Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
32234 IsROTL = true;
32235 }
32236
32237 // ISD::ROT* uses modulo rotate amounts.
32238 Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
32239
32240 assert(IsROTL && "Only ROTL supported");
32241
32242 // As with shifts, attempt to convert the rotation amount to a multiplication
32243 // factor; otherwise fall back to general expansion.
32244 SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG);
32245 if (!Scale)
32246 return SDValue();
32247
32248 // v8i16/v16i16: perform unsigned multiply hi/lo and OR the results.
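      // Worked example (illustrative): rotl(0x1234, 4) with Scale = 1 << 4:
      // the low half of the product is 0x2340, the unsigned-high half is
      // 0x0001, and OR'ing them gives 0x2341 = rotl(0x1234, 4).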
32249 if (EltSizeInBits == 16) {
32250 SDValue Lo = DAG.getNode(ISD::MUL, DL, VT, R, Scale);
32251 SDValue Hi = DAG.getNode(ISD::MULHU, DL, VT, R, Scale);
32252 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
32253 }
32254
32255 // v4i32: make use of the PMULUDQ instruction to multiply 2 lanes of v4i32
32256 // to v2i64 results at a time. The upper 32-bits contain the wrapped bits
32257 // that can then be OR'd with the lower 32-bits.
32258 assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected");
32259 static const int OddMask[] = {1, -1, 3, -1};
32260 SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask);
32261 SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask);
32262
32263 SDValue Res02 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
32264 DAG.getBitcast(MVT::v2i64, R),
32265 DAG.getBitcast(MVT::v2i64, Scale));
32266 SDValue Res13 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
32267 DAG.getBitcast(MVT::v2i64, R13),
32268 DAG.getBitcast(MVT::v2i64, Scale13));
32269 Res02 = DAG.getBitcast(VT, Res02);
32270 Res13 = DAG.getBitcast(VT, Res13);
32271
32272 return DAG.getNode(ISD::OR, DL, VT,
32273 DAG.getVectorShuffle(VT, DL, Res02, Res13, {0, 4, 2, 6}),
32274 DAG.getVectorShuffle(VT, DL, Res02, Res13, {1, 5, 3, 7}));
32275}
32276
32277/// Returns true if the operand type is exactly twice the native width, and
32278/// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
32279/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
32280/// (otherwise we leave them alone to become __sync_fetch_and_... calls).
32281bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
32282 unsigned OpWidth = MemType->getPrimitiveSizeInBits();
32283
32284 if (OpWidth == 64)
32285 return Subtarget.canUseCMPXCHG8B() && !Subtarget.is64Bit();
32286 if (OpWidth == 128)
32287 return Subtarget.canUseCMPXCHG16B();
32288
32289 return false;
32290}
32291
32292TargetLoweringBase::AtomicExpansionKind
32293X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
32294 Type *MemType = SI->getValueOperand()->getType();
32295
32296 bool NoImplicitFloatOps =
32297 SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
32298 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
32299 !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
32300 (Subtarget.hasSSE1() || Subtarget.hasX87()))
32301 return AtomicExpansionKind::None;
32302
32303 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::Expand
32304 : AtomicExpansionKind::None;
32305}
32306
32307// Note: this turns large loads into lock cmpxchg8b/16b.
32308// TODO: In 32-bit mode, use MOVLPS when SSE1 is available?
32309TargetLowering::AtomicExpansionKind
32310X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
32311 Type *MemType = LI->getType();
32312
32313 // If this is a 64-bit atomic load on a 32-bit target and SSE2 is enabled, we
32314 // can use movq to do the load. If we have X87 we can load into an 80-bit
32315 // X87 register and store it to a stack temporary.
32316 bool NoImplicitFloatOps =
32317 LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
32318 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
32319 !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
32320 (Subtarget.hasSSE1() || Subtarget.hasX87()))
32321 return AtomicExpansionKind::None;
32322
32323 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
32324 : AtomicExpansionKind::None;
32325}
32326
32327enum BitTestKind : unsigned {
32328 UndefBit,
32329 ConstantBit,
32330 NotConstantBit,
32331 ShiftBit,
32332 NotShiftBit
32333};
32334
32335static std::pair<Value *, BitTestKind> FindSingleBitChange(Value *V) {
32336 using namespace llvm::PatternMatch;
32337 BitTestKind BTK = UndefBit;
32338 auto *C = dyn_cast<ConstantInt>(V);
32339 if (C) {
32340 // Check if V is a power of 2 or the bitwise NOT of a power of 2.
32341 if (isPowerOf2_64(C->getZExtValue()))
32342 BTK = ConstantBit;
32343 else if (isPowerOf2_64((~C->getValue()).getZExtValue()))
32344 BTK = NotConstantBit;
32345 return {V, BTK};
32346 }
32347
32348 // Check if V is some power of 2 pattern known to be non-zero
32349 auto *I = dyn_cast<Instruction>(V);
32350 if (I) {
32351 bool Not = false;
32352 // Check if we have a NOT
32353 Value *PeekI;
32354 if (match(I, m_c_Xor(m_Value(PeekI), m_AllOnes())) ||
32355 match(I, m_Sub(m_AllOnes(), m_Value(PeekI)))) {
32356 Not = true;
32357 I = dyn_cast<Instruction>(PeekI);
32358
32359 // If I is constant, it will fold and we can evaluate later. If it's an
32360 // argument or something of that nature, we can't analyze.
32361 if (I == nullptr)
32362 return {nullptr, UndefBit};
32363 }
32364 // We can only use 1 << X without more sophisticated analysis. C << X where
32365 // C is a power of 2 but not 1 can result in zero which cannot be translated
32366 // to bittest. Likewise any C >> X (either arith or logical) can be zero.
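      // E.g. on i8, 2 << 7 == 0, so (2 << X) is not guaranteed to have a bit
      // set, whereas every well-defined 1 << X sets exactly one bit.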
32367 if (I->getOpcode() == Instruction::Shl) {
32368 // Todo(1): The cmpxchg case is pretty costly so matching `BLSI(X)`, `X &
32369 // -X` and some other provable power of 2 patterns that we can use CTZ on
32370 // may be profitable.
32371 // Todo(2): It may be possible in some cases to prove that Shl(C, X) is
32372 // non-zero even where C != 1. Likewise LShr(C, X) and AShr(C, X) may also
32373 // be provably a non-zero power of 2.
32374 // Todo(3): ROTL and ROTR patterns on a power of 2 C should also be
32375 // transformable to bittest.
32376 auto *ShiftVal = dyn_cast<ConstantInt>(I->getOperand(0));
32377 if (!ShiftVal)
32378 return {nullptr, UndefBit};
32379 if (ShiftVal->equalsInt(1))
32380 BTK = Not ? NotShiftBit : ShiftBit;
32381
32382 if (BTK == UndefBit)
32383 return {nullptr, UndefBit};
32384
32385 Value *BitV = I->getOperand(1);
32386
32387 Value *AndOp;
32388 const APInt *AndC;
32389 if (match(BitV, m_c_And(m_Value(AndOp), m_APInt(AndC)))) {
32390 // Read past a shiftmask instruction to find count
32391 if (*AndC == (I->getType()->getPrimitiveSizeInBits() - 1))
32392 BitV = AndOp;
32393 }
32394 return {BitV, BTK};
32395 }
32396 }
32397 return {nullptr, UndefBit};
32398}
32399
32400TargetLowering::AtomicExpansionKind
32401X86TargetLowering::shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const {
32402 using namespace llvm::PatternMatch;
32403 // If the atomicrmw's result isn't actually used, we can just add a "lock"
32404 // prefix to a normal instruction for these operations.
32405 if (AI->use_empty())
32406 return AtomicExpansionKind::None;
32407
32408 if (AI->getOperation() == AtomicRMWInst::Xor) {
32409 // A ^ SignBit -> A + SignBit. This allows us to use `xadd` which is
32410 // preferable to both `cmpxchg` and `btc`.
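      // E.g. for i32, A ^ 0x80000000 and A + 0x80000000 both flip only bit 31,
      // since the carry out of the top bit is discarded.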
32411 if (match(AI->getOperand(1), m_SignMask()))
32412 return AtomicExpansionKind::None;
32413 }
32414
32415 // If the atomicrmw's result is used by a single bit AND, we may use
32416 // bts/btr/btc instructions for these operations.
32417 // Note: InstCombinePass can cause a de-optimization here. It replaces the
32418 // SETCC(And(AtomicRMW(P, power_of_2), power_of_2)) with LShr and Xor
32419 // (depending on CC). This pattern can only use bts/btr/btc but we don't
32420 // detect it.
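      // Illustrative IR (hypothetical names) of a pattern this catches:
      //   %old = atomicrmw or ptr %p, i32 %mask   ; with %mask == shl i32 1, %n
      //   %bit = and i32 %old, %mask
      // which can be lowered to a single `lock bts`, with the carry flag
      // providing the value of the tested bit.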
32421 Instruction *I = AI->user_back();
32422 auto BitChange = FindSingleBitChange(AI->getValOperand());
32423 if (BitChange.second == UndefBit || !AI->hasOneUse() ||
32424 I->getOpcode() != Instruction::And ||
32425 AI->getType()->getPrimitiveSizeInBits() == 8 ||
32426 AI->getParent() != I->getParent())
32427 return AtomicExpansionKind::CmpXChg;
32428
32429 unsigned OtherIdx = I->getOperand(0) == AI ? 1 : 0;
32430
32431 // This is a redundant AND, it should get cleaned up elsewhere.
32432 if (AI == I->getOperand(OtherIdx))
32433 return AtomicExpansionKind::CmpXChg;
32434
32435 // The following instruction must be an AND of a single bit.
32436 if (BitChange.second == ConstantBit || BitChange.second == NotConstantBit) {
32437 auto *C1 = cast<ConstantInt>(AI->getValOperand());
32438 auto *C2 = dyn_cast<ConstantInt>(I->getOperand(OtherIdx));
32439 if (!C2 || !isPowerOf2_64(C2->getZExtValue())) {
32440 return AtomicExpansionKind::CmpXChg;
32441 }
32442 if (AI->getOperation() == AtomicRMWInst::And) {
32443 return ~C1->getValue() == C2->getValue()
32444 ? AtomicExpansionKind::BitTestIntrinsic
32445 : AtomicExpansionKind::CmpXChg;
32446 }
32447 return C1 == C2 ? AtomicExpansionKind::BitTestIntrinsic
32448 : AtomicExpansionKind::CmpXChg;
32449 }
32450
32451 assert(BitChange.second == ShiftBit || BitChange.second == NotShiftBit);
32452
32453 auto BitTested = FindSingleBitChange(I->getOperand(OtherIdx));
32454 if (BitTested.second != ShiftBit && BitTested.second != NotShiftBit)
32455 return AtomicExpansionKind::CmpXChg;
32456
32457 assert(BitChange.first != nullptr && BitTested.first != nullptr);
32458
32459 // If shift amounts are not the same we can't use BitTestIntrinsic.
32460 if (BitChange.first != BitTested.first)
32461 return AtomicExpansionKind::CmpXChg;
32462
32463 // For atomic AND, the value must clear exactly one bit (all bits set but
32464 // one) and the tested bit must be the one that is unset in the mask.
32465 if (AI->getOperation() == AtomicRMWInst::And)
32466 return (BitChange.second == NotShiftBit && BitTested.second == ShiftBit)
32467 ? AtomicExpansionKind::BitTestIntrinsic
32468 : AtomicExpansionKind::CmpXChg;
32469
32470 // For atomic XOR/OR, we must be setting and testing the same bit.
32471 return (BitChange.second == ShiftBit && BitTested.second == ShiftBit)
32472 ? AtomicExpansionKind::BitTestIntrinsic
32473 : AtomicExpansionKind::CmpXChg;
32474}
32475
32476void X86TargetLowering::emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const {
32477 IRBuilder<> Builder(AI);
32478 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
32479 Intrinsic::ID IID_C = Intrinsic::not_intrinsic;
32480 Intrinsic::ID IID_I = Intrinsic::not_intrinsic;
32481 switch (AI->getOperation()) {
32482 default:
32483 llvm_unreachable("Unknown atomic operation")::llvm::llvm_unreachable_internal("Unknown atomic operation",
"llvm/lib/Target/X86/X86ISelLowering.cpp", 32483)
;
32484 case AtomicRMWInst::Or:
32485 IID_C = Intrinsic::x86_atomic_bts;
32486 IID_I = Intrinsic::x86_atomic_bts_rm;
32487 break;
32488 case AtomicRMWInst::Xor:
32489 IID_C = Intrinsic::x86_atomic_btc;
32490 IID_I = Intrinsic::x86_atomic_btc_rm;
32491 break;
32492 case AtomicRMWInst::And:
32493 IID_C = Intrinsic::x86_atomic_btr;
32494 IID_I = Intrinsic::x86_atomic_btr_rm;
32495 break;
32496 }
32497 Instruction *I = AI->user_back();
32498 LLVMContext &Ctx = AI->getContext();
32499 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
32500 Type::getInt8PtrTy(Ctx));
32501 Function *BitTest = nullptr;
32502 Value *Result = nullptr;
32503 auto BitTested = FindSingleBitChange(AI->getValOperand());
32504 assert(BitTested.first != nullptr);
32505
32506 if (BitTested.second == ConstantBit || BitTested.second == NotConstantBit) {
32507 auto *C = cast<ConstantInt>(I->getOperand(I->getOperand(0) == AI ? 1 : 0));
32508
32509 BitTest = Intrinsic::getDeclaration(AI->getModule(), IID_C, AI->getType());
32510
32511 unsigned Imm = llvm::countr_zero(C->getZExtValue());
32512 Result = Builder.CreateCall(BitTest, {Addr, Builder.getInt8(Imm)});
32513 } else {
32514 BitTest = Intrinsic::getDeclaration(AI->getModule(), IID_I, AI->getType());
32515
32516 assert(BitTested.second == ShiftBit || BitTested.second == NotShiftBit);
32517
32518 Value *SI = BitTested.first;
32519 assert(SI != nullptr);
32520
32521 // BT{S|R|C} on a memory operand doesn't modulo the bit position, so we need
32522 // to mask it.
32523 unsigned ShiftBits = SI->getType()->getPrimitiveSizeInBits();
32524 Value *BitPos =
32525 Builder.CreateAnd(SI, Builder.getIntN(ShiftBits, ShiftBits - 1));
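      // E.g. for an i32 operand, a shift amount of 37 is masked to 37 & 31 == 5,
      // so the BT{S|R|C} only ever touches the addressed dword.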
32526 // Todo(1): In many cases it may be provable that SI is less than
32527 // ShiftBits in which case this mask is unnecessary
32528 // Todo(2): In the fairly idiomatic case of P[X / sizeof_bits(X)] OP 1
32529 // << (X % sizeof_bits(X)) we can drop the shift mask and AGEN in
32530 // favor of just a raw BT{S|R|C}.
32531
32532 Result = Builder.CreateCall(BitTest, {Addr, BitPos});
32533 Result = Builder.CreateZExtOrTrunc(Result, AI->getType());
32534
32535 // If the result is only used for zero/non-zero status then we don't need to
32536 // shift the value back. Otherwise do so.
32537 for (auto It = I->user_begin(); It != I->user_end(); ++It) {
32538 if (auto *ICmp = dyn_cast<ICmpInst>(*It)) {
32539 if (ICmp->isEquality()) {
32540 auto *C0 = dyn_cast<ConstantInt>(ICmp->getOperand(0));
32541 auto *C1 = dyn_cast<ConstantInt>(ICmp->getOperand(1));
32542 if (C0 || C1) {
32543 assert(C0 == nullptr || C1 == nullptr);
32544 if ((C0 ? C0 : C1)->isZero())
32545 continue;
32546 }
32547 }
32548 }
32549 Result = Builder.CreateShl(Result, BitPos);
32550 break;
32551 }
32552 }
32553
32554 I->replaceAllUsesWith(Result);
32555 I->eraseFromParent();
32556 AI->eraseFromParent();
32557}
32558
32559static bool shouldExpandCmpArithRMWInIR(AtomicRMWInst *AI) {
32560 using namespace llvm::PatternMatch;
32561 if (!AI->hasOneUse())
32562 return false;
32563
32564 Value *Op = AI->getOperand(1);
32565 ICmpInst::Predicate Pred;
32566 Instruction *I = AI->user_back();
32567 AtomicRMWInst::BinOp Opc = AI->getOperation();
32568 if (Opc == AtomicRMWInst::Add) {
32569 if (match(I, m_c_ICmp(Pred, m_Sub(m_ZeroInt(), m_Specific(Op)), m_Value())))
32570 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
32571 if (match(I, m_OneUse(m_c_Add(m_Specific(Op), m_Value())))) {
32572 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
32573 return Pred == CmpInst::ICMP_SLT;
32574 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
32575 return Pred == CmpInst::ICMP_SGT;
32576 }
32577 return false;
32578 }
32579 if (Opc == AtomicRMWInst::Sub) {
32580 if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
32581 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
32582 if (match(I, m_OneUse(m_Sub(m_Value(), m_Specific(Op))))) {
32583 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
32584 return Pred == CmpInst::ICMP_SLT;
32585 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
32586 return Pred == CmpInst::ICMP_SGT;
32587 }
32588 return false;
32589 }
32590 if ((Opc == AtomicRMWInst::Or &&
32591 match(I, m_OneUse(m_c_Or(m_Specific(Op), m_Value())))) ||
32592 (Opc == AtomicRMWInst::And &&
32593 match(I, m_OneUse(m_c_And(m_Specific(Op), m_Value()))))) {
32594 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
32595 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE ||
32596 Pred == CmpInst::ICMP_SLT;
32597 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
32598 return Pred == CmpInst::ICMP_SGT;
32599 return false;
32600 }
32601 if (Opc == AtomicRMWInst::Xor) {
32602 if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
32603 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
32604 if (match(I, m_OneUse(m_c_Xor(m_Specific(Op), m_Value())))) {
32605 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
32606 return Pred == CmpInst::ICMP_SLT;
32607 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
32608 return Pred == CmpInst::ICMP_SGT;
32609 }
32610 return false;
32611 }
32612
32613 return false;
32614}
32615
32616void X86TargetLowering::emitCmpArithAtomicRMWIntrinsic(
32617 AtomicRMWInst *AI) const {
32618 IRBuilder<> Builder(AI);
32619 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
32620 Instruction *TempI = nullptr;
32621 LLVMContext &Ctx = AI->getContext();
32622 ICmpInst *ICI = dyn_cast<ICmpInst>(AI->user_back());
32623 if (!ICI) {
32624 TempI = AI->user_back();
32625 assert(TempI->hasOneUse() && "Must have one use");
32626 ICI = cast<ICmpInst>(TempI->user_back());
32627 }
32628 X86::CondCode CC = X86::COND_INVALID;
32629 ICmpInst::Predicate Pred = ICI->getPredicate();
32630 switch (Pred) {
32631 default:
32632 llvm_unreachable("Not supported Pred")::llvm::llvm_unreachable_internal("Not supported Pred", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 32632)
;
32633 case CmpInst::ICMP_EQ:
32634 CC = X86::COND_E;
32635 break;
32636 case CmpInst::ICMP_NE:
32637 CC = X86::COND_NE;
32638 break;
32639 case CmpInst::ICMP_SLT:
32640 CC = X86::COND_S;
32641 break;
32642 case CmpInst::ICMP_SGT:
32643 CC = X86::COND_NS;
32644 break;
32645 }
32646 Intrinsic::ID IID = Intrinsic::not_intrinsic;
32647 switch (AI->getOperation()) {
32648 default:
32649 llvm_unreachable("Unknown atomic operation")::llvm::llvm_unreachable_internal("Unknown atomic operation",
"llvm/lib/Target/X86/X86ISelLowering.cpp", 32649)
;
32650 case AtomicRMWInst::Add:
32651 IID = Intrinsic::x86_atomic_add_cc;
32652 break;
32653 case AtomicRMWInst::Sub:
32654 IID = Intrinsic::x86_atomic_sub_cc;
32655 break;
32656 case AtomicRMWInst::Or:
32657 IID = Intrinsic::x86_atomic_or_cc;
32658 break;
32659 case AtomicRMWInst::And:
32660 IID = Intrinsic::x86_atomic_and_cc;
32661 break;
32662 case AtomicRMWInst::Xor:
32663 IID = Intrinsic::x86_atomic_xor_cc;
32664 break;
32665 }
32666 Function *CmpArith =
32667 Intrinsic::getDeclaration(AI->getModule(), IID, AI->getType());
32668 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
32669 Type::getInt8PtrTy(Ctx));
32670 Value *Call = Builder.CreateCall(
32671 CmpArith, {Addr, AI->getValOperand(), Builder.getInt32((unsigned)CC)});
32672 Value *Result = Builder.CreateTrunc(Call, Type::getInt1Ty(Ctx));
32673 ICI->replaceAllUsesWith(Result);
32674 ICI->eraseFromParent();
32675 if (TempI)
32676 TempI->eraseFromParent();
32677 AI->eraseFromParent();
32678}
32679
32680TargetLowering::AtomicExpansionKind
32681X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
32682 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
32683 Type *MemType = AI->getType();
32684
32685 // If the operand is too big, we must see if cmpxchg8/16b is available
32686 // and default to library calls otherwise.
32687 if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
32688 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
32689 : AtomicExpansionKind::None;
32690 }
32691
32692 AtomicRMWInst::BinOp Op = AI->getOperation();
32693 switch (Op) {
32694 case AtomicRMWInst::Xchg:
32695 return AtomicExpansionKind::None;
32696 case AtomicRMWInst::Add:
32697 case AtomicRMWInst::Sub:
32698 if (shouldExpandCmpArithRMWInIR(AI))
32699 return AtomicExpansionKind::CmpArithIntrinsic;
32700 // It's better to use xadd, xsub or xchg for these in other cases.
32701 return AtomicExpansionKind::None;
32702 case AtomicRMWInst::Or:
32703 case AtomicRMWInst::And:
32704 case AtomicRMWInst::Xor:
32705 if (shouldExpandCmpArithRMWInIR(AI))
32706 return AtomicExpansionKind::CmpArithIntrinsic;
32707 return shouldExpandLogicAtomicRMWInIR(AI);
32708 case AtomicRMWInst::Nand:
32709 case AtomicRMWInst::Max:
32710 case AtomicRMWInst::Min:
32711 case AtomicRMWInst::UMax:
32712 case AtomicRMWInst::UMin:
32713 case AtomicRMWInst::FAdd:
32714 case AtomicRMWInst::FSub:
32715 case AtomicRMWInst::FMax:
32716 case AtomicRMWInst::FMin:
32717 case AtomicRMWInst::UIncWrap:
32718 case AtomicRMWInst::UDecWrap:
32719 default:
32720 // These always require a non-trivial set of data operations on x86. We must
32721 // use a cmpxchg loop.
32722 return AtomicExpansionKind::CmpXChg;
32723 }
32724}
32725
32726LoadInst *
32727X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
32728 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
32729 Type *MemType = AI->getType();
32730 // Accesses larger than the native width are turned into cmpxchg/libcalls, so
32731 // there is no benefit in turning such RMWs into loads, and it is actually
32732 // harmful as it introduces an mfence.
32733 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
32734 return nullptr;
32735
32736 // If this is a canonical idempotent atomicrmw w/no uses, we have a better
32737 // lowering available in lowerAtomicArith.
32738 // TODO: push more cases through this path.
32739 if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
32740 if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
32741 AI->use_empty())
32742 return nullptr;
32743
32744 IRBuilder<> Builder(AI);
32745 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
32746 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
32747 auto SSID = AI->getSyncScopeID();
32748 // We must restrict the ordering to avoid generating loads with Release or
32749 // ReleaseAcquire orderings.
32750 auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
32751
32752 // Before the load we need a fence. Here is an example lifted from
32753 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
32754 // is required:
32755 // Thread 0:
32756 // x.store(1, relaxed);
32757 // r1 = y.fetch_add(0, release);
32758 // Thread 1:
32759 // y.fetch_add(42, acquire);
32760 // r2 = x.load(relaxed);
32761 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
 32762 // lowered to just a load without a fence. An mfence flushes the store buffer,
 32763 // making the optimization clearly correct.
 32764 // FIXME: the fence is required if isReleaseOrStronger(Order), but it is not clear
 32765 // otherwise; we might be able to be more aggressive on relaxed idempotent
32766 // rmw. In practice, they do not look useful, so we don't try to be
32767 // especially clever.
32768 if (SSID == SyncScope::SingleThread)
32769 // FIXME: we could just insert an ISD::MEMBARRIER here, except we are at
32770 // the IR level, so we must wrap it in an intrinsic.
32771 return nullptr;
32772
32773 if (!Subtarget.hasMFence())
32774 // FIXME: it might make sense to use a locked operation here but on a
32775 // different cache-line to prevent cache-line bouncing. In practice it
32776 // is probably a small win, and x86 processors without mfence are rare
32777 // enough that we do not bother.
32778 return nullptr;
32779
32780 Function *MFence =
32781 llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
32782 Builder.CreateCall(MFence, {});
32783
32784 // Finally we can emit the atomic load.
32785 LoadInst *Loaded = Builder.CreateAlignedLoad(
32786 AI->getType(), AI->getPointerOperand(), AI->getAlign());
32787 Loaded->setAtomic(Order, SSID);
32788 AI->replaceAllUsesWith(Loaded);
32789 AI->eraseFromParent();
32790 return Loaded;
32791}
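A source-level sketch of the idea, assuming an x86-64 compiler where std::atomic_thread_fence(seq_cst) is emitted as MFENCE (the helper name is illustrative only): an idempotent RMW such as fetch_or(0) becomes a fence followed by an atomic load, which preserves the ordering shown in the HPL-2012-68 example above.

    #include <atomic>
    #include <cstdint>

    static uint32_t idempotent_or_as_fenced_load(const std::atomic<uint32_t> &A) {
      // The fence plays the role of the x86_sse2_mfence call emitted above.
      std::atomic_thread_fence(std::memory_order_seq_cst);
      return A.load(std::memory_order_acquire);
    }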
32792
32793bool X86TargetLowering::lowerAtomicStoreAsStoreSDNode(const StoreInst &SI) const {
32794 if (!SI.isUnordered())
32795 return false;
32796 return ExperimentalUnorderedISEL;
32797}
32798bool X86TargetLowering::lowerAtomicLoadAsLoadSDNode(const LoadInst &LI) const {
32799 if (!LI.isUnordered())
32800 return false;
32801 return ExperimentalUnorderedISEL;
32802}
32803
32804
32805/// Emit a locked operation on a stack location which does not change any
32806/// memory location, but does involve a lock prefix. Location is chosen to be
32807/// a) very likely accessed only by a single thread to minimize cache traffic,
32808/// and b) definitely dereferenceable. Returns the new Chain result.
32809static SDValue emitLockedStackOp(SelectionDAG &DAG,
32810 const X86Subtarget &Subtarget, SDValue Chain,
32811 const SDLoc &DL) {
32812 // Implementation notes:
32813 // 1) LOCK prefix creates a full read/write reordering barrier for memory
32814 // operations issued by the current processor. As such, the location
32815 // referenced is not relevant for the ordering properties of the instruction.
 32816 // See: Intel® 64 and IA-32 Architectures Software Developer’s Manual,
32817 // 8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions
32818 // 2) Using an immediate operand appears to be the best encoding choice
32819 // here since it doesn't require an extra register.
32820 // 3) OR appears to be very slightly faster than ADD. (Though, the difference
32821 // is small enough it might just be measurement noise.)
32822 // 4) When choosing offsets, there are several contributing factors:
32823 // a) If there's no redzone, we default to TOS. (We could allocate a cache
32824 // line aligned stack object to improve this case.)
32825 // b) To minimize our chances of introducing a false dependence, we prefer
32826 // to offset the stack usage from TOS slightly.
32827 // c) To minimize concerns about cross thread stack usage - in particular,
32828 // the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which
32829 // captures state in the TOS frame and accesses it from many threads -
32830 // we want to use an offset such that the offset is in a distinct cache
32831 // line from the TOS frame.
32832 //
32833 // For a general discussion of the tradeoffs and benchmark results, see:
32834 // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
32835
32836 auto &MF = DAG.getMachineFunction();
32837 auto &TFL = *Subtarget.getFrameLowering();
32838 const unsigned SPOffset = TFL.has128ByteRedZone(MF) ? -64 : 0;
32839
32840 if (Subtarget.is64Bit()) {
32841 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
32842 SDValue Ops[] = {
32843 DAG.getRegister(X86::RSP, MVT::i64), // Base
32844 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
32845 DAG.getRegister(0, MVT::i64), // Index
32846 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
32847 DAG.getRegister(0, MVT::i16), // Segment.
32848 Zero,
32849 Chain};
32850 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
32851 MVT::Other, Ops);
32852 return SDValue(Res, 1);
32853 }
32854
32855 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
32856 SDValue Ops[] = {
32857 DAG.getRegister(X86::ESP, MVT::i32), // Base
32858 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
32859 DAG.getRegister(0, MVT::i32), // Index
32860 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
32861 DAG.getRegister(0, MVT::i16), // Segment.
32862 Zero,
32863 Chain
32864 };
32865 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
32866 MVT::Other, Ops);
32867 return SDValue(Res, 1);
32868}
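A sketch of the instruction this node selects, assuming x86-64 and GCC/Clang extended inline assembly (the wrapper name is made up): a LOCK'd OR of an immediate zero against a stack slot leaves memory unchanged, and only its full-barrier effect is wanted.

    static inline void locked_stack_fence() {
      // Offset -64 mirrors the red-zone case above; offset 0 would be used without one.
      __asm__ __volatile__("lock; orl $0, -64(%%rsp)" ::: "memory", "cc");
    }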
32869
32870static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
32871 SelectionDAG &DAG) {
32872 SDLoc dl(Op);
32873 AtomicOrdering FenceOrdering =
32874 static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
32875 SyncScope::ID FenceSSID =
32876 static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
32877
32878 // The only fence that needs an instruction is a sequentially-consistent
32879 // cross-thread fence.
32880 if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
32881 FenceSSID == SyncScope::System) {
32882 if (Subtarget.hasMFence())
32883 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
32884
32885 SDValue Chain = Op.getOperand(0);
32886 return emitLockedStackOp(DAG, Subtarget, Chain, dl);
32887 }
32888
32889 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
32890 return DAG.getNode(ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
32891}
32892
32893static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
32894 SelectionDAG &DAG) {
32895 MVT T = Op.getSimpleValueType();
32896 SDLoc DL(Op);
32897 unsigned Reg = 0;
32898 unsigned size = 0;
32899 switch(T.SimpleTy) {
32900 default: llvm_unreachable("Invalid value type!");
32901 case MVT::i8: Reg = X86::AL; size = 1; break;
32902 case MVT::i16: Reg = X86::AX; size = 2; break;
32903 case MVT::i32: Reg = X86::EAX; size = 4; break;
32904 case MVT::i64:
32905 assert(Subtarget.is64Bit() && "Node not type legal!");
32906 Reg = X86::RAX; size = 8;
32907 break;
32908 }
32909 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
32910 Op.getOperand(2), SDValue());
32911 SDValue Ops[] = { cpIn.getValue(0),
32912 Op.getOperand(1),
32913 Op.getOperand(3),
32914 DAG.getTargetConstant(size, DL, MVT::i8),
32915 cpIn.getValue(1) };
32916 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
32917 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
32918 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
32919 Ops, T, MMO);
32920
32921 SDValue cpOut =
32922 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
32923 SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
32924 MVT::i32, cpOut.getValue(2));
32925 SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
32926
32927 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
32928 cpOut, Success, EFLAGS.getValue(1));
32929}
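A sketch of the pattern this lowering models for MVT::i32, assuming GCC/Clang inline assembly (cmpxchg_u32 is a hypothetical helper): LOCK CMPXCHG takes the expected value in the accumulator register chosen per width in the switch above, writes the previous memory value back to it, and reports success in ZF, which getSETCC(X86::COND_E, ...) turns into a boolean.

    #include <cstdint>

    static bool cmpxchg_u32(uint32_t *Ptr, uint32_t *Expected, uint32_t Desired) {
      uint32_t Prev;
      __asm__ __volatile__("lock; cmpxchgl %2, %1"
                           : "=a"(Prev), "+m"(*Ptr)        // EAX holds expected/previous
                           : "r"(Desired), "0"(*Expected)
                           : "memory", "cc");
      bool Ok = (Prev == *Expected);                        // mirrors the ZF / COND_E check
      *Expected = Prev;
      return Ok;
    }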
32930
32931// Create MOVMSKB, taking into account whether we need to split for AVX1.
32932static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG,
32933 const X86Subtarget &Subtarget) {
32934 MVT InVT = V.getSimpleValueType();
32935
32936 if (InVT == MVT::v64i8) {
32937 SDValue Lo, Hi;
32938 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
32939 Lo = getPMOVMSKB(DL, Lo, DAG, Subtarget);
32940 Hi = getPMOVMSKB(DL, Hi, DAG, Subtarget);
32941 Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo);
32942 Hi = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Hi);
32943 Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi,
32944 DAG.getConstant(32, DL, MVT::i8));
32945 return DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi);
32946 }
32947 if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) {
32948 SDValue Lo, Hi;
32949 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
32950 Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
32951 Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
32952 Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
32953 DAG.getConstant(16, DL, MVT::i8));
32954 return DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
32955 }
32956
32957 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
32958}
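A scalar model (plain C++, hypothetical helper names) of PMOVMSKB and of the splitting strategy above: each byte contributes its sign bit to one bit of the mask, and a 64-byte vector is handled as two 32-byte halves whose masks are OR'd after shifting the high half by 32.

    #include <cstdint>

    static uint32_t movmskb32(const int8_t *V) {
      uint32_t M = 0;
      for (int I = 0; I < 32; ++I)
        M |= uint32_t(uint8_t(V[I]) >> 7) << I;   // sign bit of byte I -> mask bit I
      return M;
    }

    static uint64_t movmskb64(const int8_t *V) {
      uint64_t Lo = movmskb32(V);
      uint64_t Hi = movmskb32(V + 32);
      return Lo | (Hi << 32);                     // matches the v64i8 path above
    }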
32959
32960static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
32961 SelectionDAG &DAG) {
32962 SDValue Src = Op.getOperand(0);
32963 MVT SrcVT = Src.getSimpleValueType();
32964 MVT DstVT = Op.getSimpleValueType();
32965
32966 // Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each
32967 // half to v32i1 and concatenating the result.
32968 if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) {
32969 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
32970 assert(Subtarget.hasBWI() && "Expected BWI target");
32971 SDLoc dl(Op);
32972 SDValue Lo, Hi;
32973 std::tie(Lo, Hi) = DAG.SplitScalar(Src, dl, MVT::i32, MVT::i32);
32974 Lo = DAG.getBitcast(MVT::v32i1, Lo);
32975 Hi = DAG.getBitcast(MVT::v32i1, Hi);
32976 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
32977 }
32978
32979 // Use MOVMSK for vector to scalar conversion to prevent scalarization.
32980 if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
32981 assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
32982 MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8;
32983 SDLoc DL(Op);
32984 SDValue V = DAG.getSExtOrTrunc(Src, DL, SExtVT);
32985 V = getPMOVMSKB(DL, V, DAG, Subtarget);
32986 return DAG.getZExtOrTrunc(V, DL, DstVT);
32987 }
32988
32989 assert((SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
32990         SrcVT == MVT::i64) && "Unexpected VT!");
32991
32992 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
32993 if (!(DstVT == MVT::f64 && SrcVT == MVT::i64) &&
32994 !(DstVT == MVT::x86mmx && SrcVT.isVector()))
32995 // This conversion needs to be expanded.
32996 return SDValue();
32997
32998 SDLoc dl(Op);
32999 if (SrcVT.isVector()) {
 33000 // Widen the input vector in the case of MVT::v2i32.
33001 // Example: from MVT::v2i32 to MVT::v4i32.
33002 MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(),
33003 SrcVT.getVectorNumElements() * 2);
33004 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src,
33005 DAG.getUNDEF(SrcVT));
33006 } else {
33007 assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
33008        "Unexpected source type in LowerBITCAST");
33009 Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
33010 }
33011
33012 MVT V2X64VT = DstVT == MVT::f64 ? MVT::v2f64 : MVT::v2i64;
33013 Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src);
33014
33015 if (DstVT == MVT::x86mmx)
33016 return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src);
33017
33018 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src,
33019 DAG.getIntPtrConstant(0, dl));
33020}
33021
33022/// Compute the horizontal sum of bytes in V for the elements of VT.
33023///
33024/// Requires V to be a byte vector and VT to be an integer vector type with
33025/// wider elements than V's type. The width of the elements of VT determines
33026/// how many bytes of V are summed horizontally to produce each element of the
33027/// result.
33028static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
33029 const X86Subtarget &Subtarget,
33030 SelectionDAG &DAG) {
33031 SDLoc DL(V);
33032 MVT ByteVecVT = V.getSimpleValueType();
33033 MVT EltVT = VT.getVectorElementType();
33034 assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
33035        "Expected value to have byte element type.");
33036 assert(EltVT != MVT::i8 &&
33037        "Horizontal byte sum only makes sense for wider elements!");
33038 unsigned VecSize = VT.getSizeInBits();
33039 assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
33040
 33041 // The PSADBW instruction horizontally adds all bytes and leaves the result in
 33042 // i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
33043 if (EltVT == MVT::i64) {
33044 SDValue Zeros = DAG.getConstant(0, DL, ByteVecVT);
33045 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
33046 V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
33047 return DAG.getBitcast(VT, V);
33048 }
33049
33050 if (EltVT == MVT::i32) {
33051 // We unpack the low half and high half into i32s interleaved with zeros so
33052 // that we can use PSADBW to horizontally sum them. The most useful part of
33053 // this is that it lines up the results of two PSADBW instructions to be
 33054 // two v2i64 vectors which, when concatenated, are the 4 population counts. We can
33055 // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
33056 SDValue Zeros = DAG.getConstant(0, DL, VT);
33057 SDValue V32 = DAG.getBitcast(VT, V);
33058 SDValue Low = getUnpackl(DAG, DL, VT, V32, Zeros);
33059 SDValue High = getUnpackh(DAG, DL, VT, V32, Zeros);
33060
33061 // Do the horizontal sums into two v2i64s.
33062 Zeros = DAG.getConstant(0, DL, ByteVecVT);
33063 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
33064 Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
33065 DAG.getBitcast(ByteVecVT, Low), Zeros);
33066 High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
33067 DAG.getBitcast(ByteVecVT, High), Zeros);
33068
33069 // Merge them together.
33070 MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
33071 V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
33072 DAG.getBitcast(ShortVecVT, Low),
33073 DAG.getBitcast(ShortVecVT, High));
33074
33075 return DAG.getBitcast(VT, V);
33076 }
33077
33078 // The only element type left is i16.
33079 assert(EltVT == MVT::i16 && "Unknown how to handle type");
33080
 33081 // To obtain the pop count for each i16 element starting from the pop count for
 33082 // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
 33083 // right by 8. It is important to shift as i16s because an i8 vector shift isn't
33084 // directly supported.
33085 SDValue ShifterV = DAG.getConstant(8, DL, VT);
33086 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
33087 V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
33088 DAG.getBitcast(ByteVecVT, V));
33089 return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
33090}
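A scalar model of the two tricks used above (plain C++, helper names invented here): PSADBW against zero sums the eight bytes of each 64-bit lane, and the i16 path recovers a per-i16 count from two per-byte counts by shift, byte-wise add, shift.

    #include <cstdint>

    static uint64_t psadbw_lane(uint64_t Lane) {
      uint64_t Sum = 0;
      for (int I = 0; I < 8; ++I)
        Sum += (Lane >> (8 * I)) & 0xFF;   // byte sum of one 64-bit lane
      return Sum;
    }

    static uint16_t i16_sum_of_byte_counts(uint16_t Counts) {
      // Counts holds two per-byte pop counts (each <= 8, so no byte overflow).
      uint16_t Shl = uint16_t(Counts << 8);
      uint16_t LoByte = uint16_t((Shl + Counts) & 0x00FF);              // = low count
      uint16_t HiByte = uint16_t(((Shl >> 8) + (Counts >> 8)) & 0x00FF); // = low + high
      uint16_t Added = uint16_t((HiByte << 8) | LoByte);
      return uint16_t(Added >> 8);   // the SRL-by-8 step above
    }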
33091
33092static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
33093 const X86Subtarget &Subtarget,
33094 SelectionDAG &DAG) {
33095 MVT VT = Op.getSimpleValueType();
33096 MVT EltVT = VT.getVectorElementType();
33097 int NumElts = VT.getVectorNumElements();
33098 (void)EltVT;
33099 assert(EltVT == MVT::i8 && "Only vXi8 vector CTPOP lowering supported.");
33100
33101 // Implement a lookup table in register by using an algorithm based on:
33102 // http://wm.ite.pl/articles/sse-popcount.html
33103 //
33104 // The general idea is that every lower byte nibble in the input vector is an
 33105 // index into an in-register pre-computed pop count table. We then split up the
 33106 // input vector into two new ones: (1) a vector with only the shifted-right
 33107 // higher nibbles for each byte and (2) a vector with the lower nibbles (and
 33108 // masked-out higher ones) for each byte. PSHUFB is used separately with both
 33109 // to index the in-register table. Next, both are added and the result is an
 33110 // i8 vector where each element contains the pop count for its input byte.
33111 const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
33112 /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
33113 /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
33114 /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
33115
33116 SmallVector<SDValue, 64> LUTVec;
33117 for (int i = 0; i < NumElts; ++i)
33118 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
33119 SDValue InRegLUT = DAG.getBuildVector(VT, DL, LUTVec);
33120 SDValue M0F = DAG.getConstant(0x0F, DL, VT);
33121
33122 // High nibbles
33123 SDValue FourV = DAG.getConstant(4, DL, VT);
33124 SDValue HiNibbles = DAG.getNode(ISD::SRL, DL, VT, Op, FourV);
33125
33126 // Low nibbles
33127 SDValue LoNibbles = DAG.getNode(ISD::AND, DL, VT, Op, M0F);
33128
 33129 // The input vector is used as the shuffle mask that indexes elements into the
 33130 // LUT. After counting low and high nibbles, add the vectors to obtain the
33131 // final pop count per i8 element.
33132 SDValue HiPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, HiNibbles);
33133 SDValue LoPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, LoNibbles);
33134 return DAG.getNode(ISD::ADD, DL, VT, HiPopCnt, LoPopCnt);
33135}
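The same lookup applied to a single byte in plain C++ (the vector code performs this lane-wise via PSHUFB; the helper name is illustrative):

    #include <cstdint>

    static uint8_t popcnt8_lut(uint8_t X) {
      static const uint8_t LUT[16] = {0, 1, 1, 2, 1, 2, 2, 3,
                                      1, 2, 2, 3, 2, 3, 3, 4};
      return uint8_t(LUT[X & 0x0F] + LUT[X >> 4]);   // low-nibble + high-nibble counts
    }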
33136
33137// Please ensure that any codegen change from LowerVectorCTPOP is reflected in
33138// updated cost models in X86TTIImpl::getIntrinsicInstrCost.
33139static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
33140 SelectionDAG &DAG) {
33141 MVT VT = Op.getSimpleValueType();
33142 assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
33143        "Unknown CTPOP type to handle");
33144 SDLoc DL(Op.getNode());
33145 SDValue Op0 = Op.getOperand(0);
33146
33147 // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
33148 if (Subtarget.hasVPOPCNTDQ()) {
33149 unsigned NumElems = VT.getVectorNumElements();
33150 assert((VT.getVectorElementType() == MVT::i8 ||
33151         VT.getVectorElementType() == MVT::i16) && "Unexpected type");
33152 if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
33153 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
33154 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
33155 Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
33156 return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
33157 }
33158 }
33159
33160 // Decompose 256-bit ops into smaller 128-bit ops.
33161 if (VT.is256BitVector() && !Subtarget.hasInt256())
33162 return splitVectorIntUnary(Op, DAG);
33163
33164 // Decompose 512-bit ops into smaller 256-bit ops.
33165 if (VT.is512BitVector() && !Subtarget.hasBWI())
33166 return splitVectorIntUnary(Op, DAG);
33167
33168 // For element types greater than i8, do vXi8 pop counts and a bytesum.
33169 if (VT.getScalarType() != MVT::i8) {
33170 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
33171 SDValue ByteOp = DAG.getBitcast(ByteVT, Op0);
33172 SDValue PopCnt8 = DAG.getNode(ISD::CTPOP, DL, ByteVT, ByteOp);
33173 return LowerHorizontalByteSum(PopCnt8, VT, Subtarget, DAG);
33174 }
33175
33176 // We can't use the fast LUT approach, so fall back on LegalizeDAG.
33177 if (!Subtarget.hasSSSE3())
33178 return SDValue();
33179
33180 return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
33181}
33182
33183static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
33184 SelectionDAG &DAG) {
33185 assert(Op.getSimpleValueType().isVector() &&
33186        "We only do custom lowering for vector population count.");
33187 return LowerVectorCTPOP(Op, Subtarget, DAG);
33188}
33189
33190static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
33191 MVT VT = Op.getSimpleValueType();
33192 SDValue In = Op.getOperand(0);
33193 SDLoc DL(Op);
33194
 33195 // For scalars, it's still beneficial to transfer to/from the SIMD unit to
33196 // perform the BITREVERSE.
33197 if (!VT.isVector()) {
33198 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
33199 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
33200 Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
33201 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
33202 DAG.getIntPtrConstant(0, DL));
33203 }
33204
33205 int NumElts = VT.getVectorNumElements();
33206 int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
33207
33208 // Decompose 256-bit ops into smaller 128-bit ops.
33209 if (VT.is256BitVector())
33210 return splitVectorIntUnary(Op, DAG);
33211
33212 assert(VT.is128BitVector() &&
33213        "Only 128-bit vector bitreverse lowering supported.");
33214
33215 // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
33216 // perform the BSWAP in the shuffle.
 33217 // It's best to shuffle using the second operand, as this will implicitly allow
33218 // memory folding for multiple vectors.
33219 SmallVector<SDValue, 16> MaskElts;
33220 for (int i = 0; i != NumElts; ++i) {
33221 for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
33222 int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
33223 int PermuteByte = SourceByte | (2 << 5);
33224 MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
33225 }
33226 }
33227
33228 SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
33229 SDValue Res = DAG.getBitcast(MVT::v16i8, In);
33230 Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
33231 Res, Mask);
33232 return DAG.getBitcast(VT, Res);
33233}
33234
33235static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
33236 SelectionDAG &DAG) {
33237 MVT VT = Op.getSimpleValueType();
33238
33239 if (Subtarget.hasXOP() && !VT.is512BitVector())
33240 return LowerBITREVERSE_XOP(Op, DAG);
33241
33242 assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
33243
33244 SDValue In = Op.getOperand(0);
33245 SDLoc DL(Op);
33246
33247 assert(VT.getScalarType() == MVT::i8 &&
33248        "Only byte vector BITREVERSE supported");
33249
33250 // Split v64i8 without BWI so that we can still use the PSHUFB lowering.
33251 if (VT == MVT::v64i8 && !Subtarget.hasBWI())
33252 return splitVectorIntUnary(Op, DAG);
33253
33254 // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
33255 if (VT == MVT::v32i8 && !Subtarget.hasInt256())
33256 return splitVectorIntUnary(Op, DAG);
33257
33258 unsigned NumElts = VT.getVectorNumElements();
33259
33260 // If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits.
33261 if (Subtarget.hasGFNI()) {
33262 MVT MatrixVT = MVT::getVectorVT(MVT::i64, NumElts / 8);
33263 SDValue Matrix = DAG.getConstant(0x8040201008040201ULL, DL, MatrixVT);
33264 Matrix = DAG.getBitcast(VT, Matrix);
33265 return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, In, Matrix,
33266 DAG.getTargetConstant(0, DL, MVT::i8));
33267 }
33268
 33269 // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
 33270 // two nibbles, and a PSHUFB lookup finds the bitreverse of each
 33271 // 0-15 value (moved to the other nibble).
33272 SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
33273 SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
33274 SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
33275
33276 const int LoLUT[16] = {
33277 /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
33278 /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
33279 /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
33280 /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
33281 const int HiLUT[16] = {
33282 /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
33283 /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
33284 /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
33285 /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
33286
33287 SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
33288 for (unsigned i = 0; i < NumElts; ++i) {
33289 LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
33290 HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
33291 }
33292
33293 SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
33294 SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
33295 Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
33296 Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
33297 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
33298}
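A scalar model of the PSHUFB path (plain C++, hypothetical helper): the low-nibble table already holds the reversed value shifted into the high nibble and the high-nibble table holds it in the low nibble, so a single OR combines them. The GFNI path reaches the same result in one instruction because the 0x8040201008040201 bit matrix is the mirrored identity, which the affine transform uses to reverse each byte's bits.

    #include <cstdint>

    static uint8_t bitreverse8(uint8_t X) {
      static const uint8_t LoLUT[16] = {0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0,
                                        0x10, 0x90, 0x50, 0xD0, 0x30, 0xB0, 0x70, 0xF0};
      static const uint8_t HiLUT[16] = {0x00, 0x08, 0x04, 0x0C, 0x02, 0x0A, 0x06, 0x0E,
                                        0x01, 0x09, 0x05, 0x0D, 0x03, 0x0B, 0x07, 0x0F};
      return uint8_t(LoLUT[X & 0x0F] | HiLUT[X >> 4]);
    }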
33299
33300static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget,
33301 SelectionDAG &DAG) {
33302 SDLoc DL(Op);
33303 SDValue X = Op.getOperand(0);
33304 MVT VT = Op.getSimpleValueType();
33305
33306 // Special case. If the input fits in 8-bits we can use a single 8-bit TEST.
33307 if (VT == MVT::i8 ||
33308 DAG.MaskedValueIsZero(X, APInt::getBitsSetFrom(VT.getSizeInBits(), 8))) {
33309 X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
33310 SDValue Flags = DAG.getNode(X86ISD::CMP, DL, MVT::i32, X,
33311 DAG.getConstant(0, DL, MVT::i8));
33312 // Copy the inverse of the parity flag into a register with setcc.
33313 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
33314 // Extend to the original type.
33315 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
33316 }
33317
33318 // If we have POPCNT, use the default expansion.
33319 if (Subtarget.hasPOPCNT())
33320 return SDValue();
33321
33322 if (VT == MVT::i64) {
 33323 // Xor the high and low 32-bit halves together using a 32-bit operation.
33324 SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
33325 DAG.getNode(ISD::SRL, DL, MVT::i64, X,
33326 DAG.getConstant(32, DL, MVT::i8)));
33327 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
33328 X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi);
33329 }
33330
33331 if (VT != MVT::i16) {
33332 // Xor the high and low 16-bits together using a 32-bit operation.
33333 SDValue Hi16 = DAG.getNode(ISD::SRL, DL, MVT::i32, X,
33334 DAG.getConstant(16, DL, MVT::i8));
33335 X = DAG.getNode(ISD::XOR, DL, MVT::i32, X, Hi16);
33336 } else {
33337 // If the input is 16-bits, we need to extend to use an i32 shift below.
33338 X = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, X);
33339 }
33340
 33341 // Finally xor the low 2 bytes together and use an 8-bit flag-setting xor.
33342 // This should allow an h-reg to be used to save a shift.
33343 SDValue Hi = DAG.getNode(
33344 ISD::TRUNCATE, DL, MVT::i8,
33345 DAG.getNode(ISD::SRL, DL, MVT::i32, X, DAG.getConstant(8, DL, MVT::i8)));
33346 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
33347 SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32);
33348 SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1);
33349
33350 // Copy the inverse of the parity flag into a register with setcc.
33351 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
33352 // Extend to the original type.
33353 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
33354}
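The same folding in scalar form (plain C++, hypothetical helper): XOR the halves down to a single byte, whose parity is what the hardware reads from PF via SETNP.

    #include <cstdint>

    static unsigned parity64(uint64_t X) {
      uint32_t V = uint32_t(X) ^ uint32_t(X >> 32); // fold 64 -> 32
      V ^= V >> 16;                                 // fold 32 -> 16
      uint8_t B = uint8_t(V ^ (V >> 8));            // fold 16 -> 8; on x86 this XOR sets PF
      B ^= B >> 4;
      B ^= B >> 2;
      B ^= B >> 1;
      return B & 1u;                                // SETNP yields this same bit
    }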
33355
33356static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
33357 const X86Subtarget &Subtarget) {
33358 unsigned NewOpc = 0;
33359 switch (N->getOpcode()) {
33360 case ISD::ATOMIC_LOAD_ADD:
33361 NewOpc = X86ISD::LADD;
33362 break;
33363 case ISD::ATOMIC_LOAD_SUB:
33364 NewOpc = X86ISD::LSUB;
33365 break;
33366 case ISD::ATOMIC_LOAD_OR:
33367 NewOpc = X86ISD::LOR;
33368 break;
33369 case ISD::ATOMIC_LOAD_XOR:
33370 NewOpc = X86ISD::LXOR;
33371 break;
33372 case ISD::ATOMIC_LOAD_AND:
33373 NewOpc = X86ISD::LAND;
33374 break;
33375 default:
33376 llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
33377 }
33378
33379 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
33380
33381 return DAG.getMemIntrinsicNode(
33382 NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
33383 {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
33384 /*MemVT=*/N->getSimpleValueType(0), MMO);
33385}
33386
33387/// Lower atomic_load_ops into LOCK-prefixed operations.
33388static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
33389 const X86Subtarget &Subtarget) {
33390 AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
33391 SDValue Chain = N->getOperand(0);
33392 SDValue LHS = N->getOperand(1);
33393 SDValue RHS = N->getOperand(2);
33394 unsigned Opc = N->getOpcode();
33395 MVT VT = N->getSimpleValueType(0);
33396 SDLoc DL(N);
33397
33398 // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
33399 // can only be lowered when the result is unused. They should have already
33400 // been transformed into a cmpxchg loop in AtomicExpand.
33401 if (N->hasAnyUseOfValue(0)) {
33402 // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
33403 // select LXADD if LOCK_SUB can't be selected.
33404 // Handle (atomic_load_xor p, SignBit) as (atomic_load_add p, SignBit) so we
33405 // can use LXADD as opposed to cmpxchg.
33406 if (Opc == ISD::ATOMIC_LOAD_SUB ||
33407 (Opc == ISD::ATOMIC_LOAD_XOR && isMinSignedConstant(RHS))) {
33408 RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
33409 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS, RHS,
33410 AN->getMemOperand());
33411 }
33412 assert(Opc == ISD::ATOMIC_LOAD_ADD &&
33413        "Used AtomicRMW ops other than Add should have been expanded!");
33414 return N;
33415 }
33416
 33417 // Specialized lowering for the canonical form of an idempotent atomicrmw.
33418 // The core idea here is that since the memory location isn't actually
33419 // changing, all we need is a lowering for the *ordering* impacts of the
 33420 // atomicrmw. As such, we can choose a different operation and memory
33421 // location to minimize impact on other code.
33422 if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS)) {
33423 // On X86, the only ordering which actually requires an instruction is
 33424 // seq_cst that isn't SingleThread; everything else just needs to be preserved
 33425 // during codegen and then dropped. Note that we expect (but don't assume)
33426 // that orderings other than seq_cst and acq_rel have been canonicalized to
33427 // a store or load.
33428 if (AN->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent &&
33429 AN->getSyncScopeID() == SyncScope::System) {
33430 // Prefer a locked operation against a stack location to minimize cache
33431 // traffic. This assumes that stack locations are very likely to be
33432 // accessed only by the owning thread.
33433 SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL);
33434 assert(!N->hasAnyUseOfValue(0));
33435 // NOTE: The getUNDEF is needed to give something for the unused result 0.
33436 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
33437 DAG.getUNDEF(VT), NewChain);
33438 }
33439 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
33440 SDValue NewChain = DAG.getNode(ISD::MEMBARRIER, DL, MVT::Other, Chain);
33441 assert(!N->hasAnyUseOfValue(0));
33442 // NOTE: The getUNDEF is needed to give something for the unused result 0.
33443 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
33444 DAG.getUNDEF(VT), NewChain);
33445 }
33446
33447 SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
33448 // RAUW the chain, but don't worry about the result, as it's unused.
33449 assert(!N->hasAnyUseOfValue(0));
33450 // NOTE: The getUNDEF is needed to give something for the unused result 0.
33451 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
33452 DAG.getUNDEF(VT), LockOp.getValue(1));
33453}
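A source-level sketch of the two rewrites above for the result-is-used case (std::atomic calls, illustrative helper names): a subtraction becomes an addition of the negated operand so XADD can be selected, and xor with the sign bit equals adding the sign bit because the carry out of the top bit is discarded.

    #include <atomic>
    #include <cstdint>

    static uint32_t fetch_sub_via_add(std::atomic<uint32_t> &A, uint32_t V) {
      return A.fetch_add(0u - V);        // selectable as LOCK XADD with -V
    }

    static uint32_t fetch_xor_signbit_via_add(std::atomic<uint32_t> &A) {
      return A.fetch_add(0x80000000u);   // same result as fetch_xor(0x80000000u)
    }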
33454
33455static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
33456 const X86Subtarget &Subtarget) {
33457 auto *Node = cast<AtomicSDNode>(Op.getNode());
33458 SDLoc dl(Node);
33459 EVT VT = Node->getMemoryVT();
33460
33461 bool IsSeqCst =
33462 Node->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent;
33463 bool IsTypeLegal = DAG.getTargetLoweringInfo().isTypeLegal(VT);
33464
33465 // If this store is not sequentially consistent and the type is legal
33466 // we can just keep it.
33467 if (!IsSeqCst && IsTypeLegal)
33468 return Op;
33469
33470 if (VT == MVT::i64 && !IsTypeLegal) {
33471 // For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE
33472 // is enabled.
33473 bool NoImplicitFloatOps =
33474 DAG.getMachineFunction().getFunction().hasFnAttribute(
33475 Attribute::NoImplicitFloat);
33476 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
33477 SDValue Chain;
33478 if (Subtarget.hasSSE1()) {
33479 SDValue SclToVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
33480 Node->getOperand(2));
33481 MVT StVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
33482 SclToVec = DAG.getBitcast(StVT, SclToVec);
33483 SDVTList Tys = DAG.getVTList(MVT::Other);
33484 SDValue Ops[] = {Node->getChain(), SclToVec, Node->getBasePtr()};
33485 Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops,
33486 MVT::i64, Node->getMemOperand());
33487 } else if (Subtarget.hasX87()) {
33488 // First load this into an 80-bit X87 register using a stack temporary.
33489 // This will put the whole integer into the significand.
33490 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
33491 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
33492 MachinePointerInfo MPI =
33493 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
33494 Chain =
33495 DAG.getStore(Node->getChain(), dl, Node->getOperand(2), StackPtr,
33496 MPI, MaybeAlign(), MachineMemOperand::MOStore);
33497 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
33498 SDValue LdOps[] = {Chain, StackPtr};
33499 SDValue Value = DAG.getMemIntrinsicNode(
33500 X86ISD::FILD, dl, Tys, LdOps, MVT::i64, MPI,
33501 /*Align*/ std::nullopt, MachineMemOperand::MOLoad);
33502 Chain = Value.getValue(1);
33503
33504 // Now use an FIST to do the atomic store.
33505 SDValue StoreOps[] = {Chain, Value, Node->getBasePtr()};
33506 Chain =
33507 DAG.getMemIntrinsicNode(X86ISD::FIST, dl, DAG.getVTList(MVT::Other),
33508 StoreOps, MVT::i64, Node->getMemOperand());
33509 }
33510
33511 if (Chain) {
33512 // If this is a sequentially consistent store, also emit an appropriate
33513 // barrier.
33514 if (IsSeqCst)
33515 Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
33516
33517 return Chain;
33518 }
33519 }
33520 }
33521
33522 // Convert seq_cst store -> xchg
33523 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
33524 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
33525 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
33526 Node->getMemoryVT(),
33527 Node->getOperand(0),
33528 Node->getOperand(1), Node->getOperand(2),
33529 Node->getMemOperand());
33530 return Swap.getValue(1);
33531}
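A sketch of the two store shapes chosen above on x86-64 (plain std::atomic code; the exact instruction depends on the compiler): a seq_cst store of a legal type needs the implicit full barrier of XCHG or a trailing fence, while weaker orderings keep a plain MOV.

    #include <atomic>

    static void store_seq_cst(std::atomic<long> &A, long V) {
      A.store(V, std::memory_order_seq_cst);   // typically XCHG, or MOV + fence
    }

    static void store_release(std::atomic<long> &A, long V) {
      A.store(V, std::memory_order_release);   // plain MOV; kept as-is above
    }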
33532
33533static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) {
33534 SDNode *N = Op.getNode();
33535 MVT VT = N->getSimpleValueType(0);
33536 unsigned Opc = Op.getOpcode();
33537
33538 // Let legalize expand this if it isn't a legal type yet.
33539 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
33540 return SDValue();
33541
33542 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
33543 SDLoc DL(N);
33544
33545 // Set the carry flag.
33546 SDValue Carry = Op.getOperand(2);
33547 EVT CarryVT = Carry.getValueType();
33548 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
33549 Carry, DAG.getAllOnesConstant(DL, CarryVT));
33550
33551 bool IsAdd = Opc == ISD::UADDO_CARRY || Opc == ISD::SADDO_CARRY;
33552 SDValue Sum = DAG.getNode(IsAdd ? X86ISD::ADC : X86ISD::SBB, DL, VTs,
33553 Op.getOperand(0), Op.getOperand(1),
33554 Carry.getValue(1));
33555
33556 bool IsSigned = Opc == ISD::SADDO_CARRY || Opc == ISD::SSUBO_CARRY;
33557 SDValue SetCC = getSETCC(IsSigned ? X86::COND_O : X86::COND_B,
33558 Sum.getValue(1), DL, DAG);
33559 if (N->getValueType(1) == MVT::i1)
33560 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
33561
33562 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
33563}
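A sketch of the carry re-materialization, assuming an x86-64 compiler that provides _addcarry_u64: adding all-ones to a non-zero boolean carry sets CF, after which ADC (or SBB for the subtract forms) consumes it, which is what the X86ISD::ADD/ADC nodes above model. The helper name is invented here.

    #include <cstdint>
    #include <x86intrin.h>

    static uint64_t add_with_carry(uint64_t A, uint64_t B, unsigned char CarryIn,
                                   unsigned char *CarryOut) {
      unsigned long long Sum;
      *CarryOut = _addcarry_u64(CarryIn, A, B, &Sum);  // CF in, sum and CF out
      return Sum;
    }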
33564
33565static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
33566 SelectionDAG &DAG) {
33567 assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
33568
33569 // For MacOSX, we want to call an alternative entry point: __sincos_stret,
33570 // which returns the values as { float, float } (in XMM0) or
33571 // { double, double } (which is returned in XMM0, XMM1).
33572 SDLoc dl(Op);
33573 SDValue Arg = Op.getOperand(0);
33574 EVT ArgVT = Arg.getValueType();
33575 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
33576
33577 TargetLowering::ArgListTy Args;
33578 TargetLowering::ArgListEntry Entry;
33579
33580 Entry.Node = Arg;
33581 Entry.Ty = ArgTy;
33582 Entry.IsSExt = false;
33583 Entry.IsZExt = false;
33584 Args.push_back(Entry);
33585
33586 bool isF64 = ArgVT == MVT::f64;
33587 // Only optimize x86_64 for now. i386 is a bit messy. For f32,
33588 // the small struct {f32, f32} is returned in (eax, edx). For f64,
33589 // the results are returned via SRet in memory.
33590 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33591 RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
33592 const char *LibcallName = TLI.getLibcallName(LC);
33593 SDValue Callee =
33594 DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
33595
33596 Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
33597 : (Type *)FixedVectorType::get(ArgTy, 4);
33598
33599 TargetLowering::CallLoweringInfo CLI(DAG);
33600 CLI.setDebugLoc(dl)
33601 .setChain(DAG.getEntryNode())
33602 .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
33603
33604 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
33605
33606 if (isF64)
33607 // Returned in xmm0 and xmm1.
33608 return CallResult.first;
33609
 33610 // Returned in bits 0:31 and 32:63 of xmm0.
33611 SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
33612 CallResult.first, DAG.getIntPtrConstant(0, dl));
33613 SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
33614 CallResult.first, DAG.getIntPtrConstant(1, dl));
33615 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
33616 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
33617}
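For illustration only: on Darwin x86-64 the call above targets __sincos_stret, which returns the {sin, cos} pair in XMM registers; a portable equivalent of what the caller ultimately receives is simply both values computed together (the helper name is invented here).

    #include <cmath>
    #include <utility>

    static std::pair<double, double> sincos_pair(double X) {
      return {std::sin(X), std::cos(X)};   // sin in the first slot, cos in the second
    }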
33618
33619/// Widen a vector input to a vector of NVT. The
33620/// input vector must have the same element type as NVT.
33621static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
33622 bool FillWithZeroes = false) {
33623 // Check if InOp already has the right width.
33624 MVT InVT = InOp.getSimpleValueType();
33625 if (InVT == NVT)
33626 return InOp;
33627
33628 if (InOp.isUndef())
33629 return DAG.getUNDEF(NVT);
33630
33631 assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
33632        "input and widen element type must match");
33633
33634 unsigned InNumElts = InVT.getVectorNumElements();
33635 unsigned WidenNumElts = NVT.getVectorNumElements();
33636 assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
33637        "Unexpected request for vector widening");
33638
33639 SDLoc dl(InOp);
33640 if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
33641 InOp.getNumOperands() == 2) {
33642 SDValue N1 = InOp.getOperand(1);
33643 if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
33644 N1.isUndef()) {
33645 InOp = InOp.getOperand(0);
33646 InVT = InOp.getSimpleValueType();
33647 InNumElts = InVT.getVectorNumElements();
33648 }
33649 }
33650 if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
33651 ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
33652 SmallVector<SDValue, 16> Ops;
33653 for (unsigned i = 0; i < InNumElts; ++i)
33654 Ops.push_back(InOp.getOperand(i));
33655
33656 EVT EltVT = InOp.getOperand(0).getValueType();
33657
33658 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
33659 DAG.getUNDEF(EltVT);
33660 for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
33661 Ops.push_back(FillVal);
33662 return DAG.getBuildVector(NVT, dl, Ops);
33663 }
33664 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
33665 DAG.getUNDEF(NVT);
33666 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
33667 InOp, DAG.getIntPtrConstant(0, dl));
33668}
33669
33670static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
33671 SelectionDAG &DAG) {
33672 assert(Subtarget.hasAVX512() &&
33673        "MGATHER/MSCATTER are supported on AVX-512 arch only");
33674
33675 MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
33676 SDValue Src = N->getValue();
33677 MVT VT = Src.getSimpleValueType();
33678 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
33679 SDLoc dl(Op);
33680
33681 SDValue Scale = N->getScale();
33682 SDValue Index = N->getIndex();
33683 SDValue Mask = N->getMask();
33684 SDValue Chain = N->getChain();
33685 SDValue BasePtr = N->getBasePtr();
33686
33687 if (VT == MVT::v2f32 || VT == MVT::v2i32) {
33688 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
33689 // If the index is v2i64 and we have VLX we can use xmm for data and index.
33690 if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
33691 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33692 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
33693 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src, DAG.getUNDEF(VT));
33694 SDVTList VTs = DAG.getVTList(MVT::Other);
33695 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
33696 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
33697 N->getMemoryVT(), N->getMemOperand());
33698 }
33699 return SDValue();
33700 }
33701
33702 MVT IndexVT = Index.getSimpleValueType();
33703
33704 // If the index is v2i32, we're being called by type legalization and we
33705 // should just let the default handling take care of it.
33706 if (IndexVT == MVT::v2i32)
33707 return SDValue();
33708
 33709 // If we don't have VLX and neither the passthru nor the index is 512 bits, we
33710 // need to widen until one is.
33711 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
33712 !Index.getSimpleValueType().is512BitVector()) {
33713 // Determine how much we need to widen by to get a 512-bit type.
33714 unsigned Factor = std::min(512/VT.getSizeInBits(),
33715 512/IndexVT.getSizeInBits());
33716 unsigned NumElts = VT.getVectorNumElements() * Factor;
33717
33718 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
33719 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
33720 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
33721
33722 Src = ExtendToType(Src, VT, DAG);
33723 Index = ExtendToType(Index, IndexVT, DAG);
33724 Mask = ExtendToType(Mask, MaskVT, DAG, true);
33725 }
33726
33727 SDVTList VTs = DAG.getVTList(MVT::Other);
33728 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
33729 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
33730 N->getMemoryVT(), N->getMemOperand());
33731}
33732
33733static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
33734 SelectionDAG &DAG) {
33735
33736 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
33737 MVT VT = Op.getSimpleValueType();
33738 MVT ScalarVT = VT.getScalarType();
33739 SDValue Mask = N->getMask();
33740 MVT MaskVT = Mask.getSimpleValueType();
33741 SDValue PassThru = N->getPassThru();
33742 SDLoc dl(Op);
33743
33744 // Handle AVX masked loads which don't support passthru other than 0.
33745 if (MaskVT.getVectorElementType() != MVT::i1) {
33746 // We also allow undef in the isel pattern.
33747 if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode()))
33748 return Op;
33749
33750 SDValue NewLoad = DAG.getMaskedLoad(
33751 VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
33752 getZeroVector(VT, Subtarget, DAG, dl), N->getMemoryVT(),
33753 N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(),
33754 N->isExpandingLoad());
33755 // Emit a blend.
33756 SDValue Select = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
33757 return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl);
33758 }
33759
33760 assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
33761        "Expanding masked load is supported on AVX-512 target only!");
33762
33763   assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
33764          "Expanding masked load is supported for 32 and 64-bit types only!");
33765
33766   assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
33767          "Cannot lower masked load op.");
33768
33769   assert((ScalarVT.getSizeInBits() >= 32 ||
33770           (Subtarget.hasBWI() &&
33771            (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
33772          "Unsupported masked load op.");
33773
33774 // This operation is legal for targets with VLX, but without
33775   // VLX the vector should be widened to 512 bits.
33776 unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
33777 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
33778 PassThru = ExtendToType(PassThru, WideDataVT, DAG);
33779
33780 // Mask element has to be i1.
33781   assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
33782          "Unexpected mask type");
33783
33784 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
33785
33786 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
33787 SDValue NewLoad = DAG.getMaskedLoad(
33788 WideDataVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
33789 PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
33790 N->getExtensionType(), N->isExpandingLoad());
33791
33792 SDValue Extract =
33793 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, NewLoad.getValue(0),
33794 DAG.getIntPtrConstant(0, dl));
33795 SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
33796 return DAG.getMergeValues(RetOps, dl);
33797}
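
The non-i1-mask path at the top of LowerMLOAD relies on AVX masked loads zeroing the disabled lanes, so a non-zero passthru is recovered with a separate blend. The element-wise model below is a minimal sketch of that two-step semantics, not part of the LLVM source; maskedLoadModel is a hypothetical name used only for illustration.

#include <cstddef>
#include <vector>

std::vector<int> maskedLoadModel(const std::vector<int> &Mem,
                                 const std::vector<bool> &Mask,
                                 const std::vector<int> &PassThru) {
  // Step 1: masked load with an all-zero passthru, which is what the AVX
  // VMASKMOV-style load provides natively.
  std::vector<int> Loaded(Mem.size(), 0);
  for (std::size_t I = 0; I < Mem.size(); ++I)
    if (Mask[I])
      Loaded[I] = Mem[I];
  // Step 2: blend (the VSELECT above) to reinstate the requested passthru
  // values in the disabled lanes.
  std::vector<int> Result(Mem.size());
  for (std::size_t I = 0; I < Mem.size(); ++I)
    Result[I] = Mask[I] ? Loaded[I] : PassThru[I];
  return Result;
}
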
33798
33799static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
33800 SelectionDAG &DAG) {
33801 MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
33802 SDValue DataToStore = N->getValue();
33803 MVT VT = DataToStore.getSimpleValueType();
33804 MVT ScalarVT = VT.getScalarType();
33805 SDValue Mask = N->getMask();
33806 SDLoc dl(Op);
33807
33808   assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
33809          "Expanding masked load is supported on AVX-512 target only!");
33810
33811   assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
33812          "Expanding masked load is supported for 32 and 64-bit types only!");
33813
33814   assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
33815          "Cannot lower masked store op.");
33816
33817   assert((ScalarVT.getSizeInBits() >= 32 ||
33818           (Subtarget.hasBWI() &&
33819            (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
33820          "Unsupported masked store op.");
33821
33822 // This operation is legal for targets with VLX, but without
33823   // VLX the vector should be widened to 512 bits.
33824 unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
33825 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
33826
33827 // Mask element has to be i1.
33828   assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
33829          "Unexpected mask type");
33830
33831 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
33832
33833 DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
33834 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
33835 return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
33836 N->getOffset(), Mask, N->getMemoryVT(),
33837 N->getMemOperand(), N->getAddressingMode(),
33838 N->isTruncatingStore(), N->isCompressingStore());
33839}
33840
33841static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
33842 SelectionDAG &DAG) {
33843   assert(Subtarget.hasAVX2() &&
33844          "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
33845
33846 MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
33847 SDLoc dl(Op);
33848 MVT VT = Op.getSimpleValueType();
33849 SDValue Index = N->getIndex();
33850 SDValue Mask = N->getMask();
33851 SDValue PassThru = N->getPassThru();
33852 MVT IndexVT = Index.getSimpleValueType();
33853
33854   assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
33855
33856 // If the index is v2i32, we're being called by type legalization.
33857 if (IndexVT == MVT::v2i32)
33858 return SDValue();
33859
33860   // If we don't have VLX and neither the passthru nor the index is 512-bits, we
33861 // need to widen until one is.
33862 MVT OrigVT = VT;
33863 if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
33864 !IndexVT.is512BitVector()) {
33865 // Determine how much we need to widen by to get a 512-bit type.
33866 unsigned Factor = std::min(512/VT.getSizeInBits(),
33867 512/IndexVT.getSizeInBits());
33868
33869 unsigned NumElts = VT.getVectorNumElements() * Factor;
33870
33871 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
33872 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
33873 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
33874
33875 PassThru = ExtendToType(PassThru, VT, DAG);
33876 Index = ExtendToType(Index, IndexVT, DAG);
33877 Mask = ExtendToType(Mask, MaskVT, DAG, true);
33878 }
33879
33880 // Break dependency on the data register.
33881 if (PassThru.isUndef())
33882 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
33883
33884 SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index,
33885 N->getScale() };
33886 SDValue NewGather = DAG.getMemIntrinsicNode(
33887 X86ISD::MGATHER, dl, DAG.getVTList(VT, MVT::Other), Ops, N->getMemoryVT(),
33888 N->getMemOperand());
33889 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT,
33890 NewGather, DAG.getIntPtrConstant(0, dl));
33891 return DAG.getMergeValues({Extract, NewGather.getValue(1)}, dl);
33892}
33893
33894static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) {
33895 SDLoc dl(Op);
33896 SDValue Src = Op.getOperand(0);
33897 MVT DstVT = Op.getSimpleValueType();
33898
33899 AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op.getNode());
33900 unsigned SrcAS = N->getSrcAddressSpace();
33901
33902   assert(SrcAS != N->getDestAddressSpace() &&
33903          "addrspacecast must be between different address spaces");
33904
33905 if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64) {
33906 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src);
33907 } else if (DstVT == MVT::i64) {
33908 Op = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src);
33909 } else if (DstVT == MVT::i32) {
33910 Op = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src);
33911 } else {
33912 report_fatal_error("Bad address space in addrspacecast");
33913 }
33914 return Op;
33915}
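
The addrspacecast lowering above reduces to three integer conversions. The scalar model below is a minimal sketch under the assumption that PTR32_UPTR denotes the unsigned 32-bit pointer address space; it is not part of the LLVM source and the function names are hypothetical.

#include <cstdint>

// 32-bit pointer widened to 64 bits: the unsigned address space zero-extends,
// the remaining 32-bit address space sign-extends.
std::uint64_t extendPtr32(std::uint32_t Ptr, bool IsUnsignedPtr32) {
  if (IsUnsignedPtr32)
    return static_cast<std::uint64_t>(Ptr);                          // ZERO_EXTEND
  return static_cast<std::uint64_t>(static_cast<std::int32_t>(Ptr)); // SIGN_EXTEND
}

// 64-bit pointer narrowed to a 32-bit address space: plain truncation.
std::uint32_t truncPtr64(std::uint64_t Ptr) {
  return static_cast<std::uint32_t>(Ptr); // TRUNCATE
}
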
33916
33917SDValue X86TargetLowering::LowerGC_TRANSITION(SDValue Op,
33918 SelectionDAG &DAG) const {
33919 // TODO: Eventually, the lowering of these nodes should be informed by or
33920 // deferred to the GC strategy for the function in which they appear. For
33921 // now, however, they must be lowered to something. Since they are logically
33922 // no-ops in the case of a null GC strategy (or a GC strategy which does not
33923 // require special handling for these nodes), lower them as literal NOOPs for
33924 // the time being.
33925 SmallVector<SDValue, 2> Ops;
33926 Ops.push_back(Op.getOperand(0));
33927 if (Op->getGluedNode())
33928 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
33929
33930 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
33931 return SDValue(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
33932}
33933
33934// Custom split CVTPS2PH with wide types.
33935static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG) {
33936 SDLoc dl(Op);
33937 EVT VT = Op.getValueType();
33938 SDValue Lo, Hi;
33939 std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
33940 EVT LoVT, HiVT;
33941 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
33942 SDValue RC = Op.getOperand(1);
33943 Lo = DAG.getNode(X86ISD::CVTPS2PH, dl, LoVT, Lo, RC);
33944 Hi = DAG.getNode(X86ISD::CVTPS2PH, dl, HiVT, Hi, RC);
33945 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33946}
33947
33948static StringRef getInstrStrFromOpNo(const SmallVectorImpl<StringRef> &AsmStrs,
33949 unsigned OpNo) {
33950 const APInt Operand(32, OpNo);
33951 std::string OpNoStr = llvm::toString(Operand, 10, false);
33952 std::string Str(" $");
33953
33954 std::string OpNoStr1(Str + OpNoStr); // e.g. " $1" (OpNo=1)
33955 std::string OpNoStr2(Str + "{" + OpNoStr + ":"); // With modifier, e.g. ${1:P}
33956
33957 auto I = StringRef::npos;
33958 for (auto &AsmStr : AsmStrs) {
33959     // Match the OpNo string exactly so we don't match a sub-string,
33960     // e.g. "$12" contains "$1".
33961 if (AsmStr.endswith(OpNoStr1))
33962 I = AsmStr.size() - OpNoStr1.size();
33963
33964 // Get the index of operand in AsmStr.
33965 if (I == StringRef::npos)
33966 I = AsmStr.find(OpNoStr1 + ",");
33967 if (I == StringRef::npos)
33968 I = AsmStr.find(OpNoStr2);
33969
33970 if (I == StringRef::npos)
33971 continue;
33972
33973     assert(I > 0 && "Unexpected inline asm string!");
33974     // Remove the operand string and label (if it exists).
33975 // For example:
33976 // ".L__MSASMLABEL_.${:uid}__l:call dword ptr ${0:P}"
33977 // ==>
33978 // ".L__MSASMLABEL_.${:uid}__l:call dword ptr "
33979 // ==>
33980 // "call dword ptr "
33981 auto TmpStr = AsmStr.substr(0, I);
33982 I = TmpStr.rfind(':');
33983 if (I != StringRef::npos)
33984 TmpStr = TmpStr.substr(I + 1);
33985 return TmpStr.take_while(llvm::isAlpha);
33986 }
33987
33988 return StringRef();
33989}
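
getInstrStrFromOpNo locates the operand placeholder in the inline-asm string, strips any label up to the last ':', and keeps the leading alphabetic run as the mnemonic. The sketch below re-expresses just that extraction with std::string_view so it stays self-contained (the real code uses llvm::StringRef); the function name is hypothetical. Given the example string from the comment above and the position where the operand placeholder begins, it returns "call".

#include <cctype>
#include <cstddef>
#include <string_view>

std::string_view mnemonicBeforeOperand(std::string_view AsmStr,
                                       std::size_t OperandPos) {
  // Drop the operand placeholder and everything after it.
  std::string_view Head = AsmStr.substr(0, OperandPos);
  // Drop an optional label such as ".L__MSASMLABEL_.${:uid}__l:".
  if (std::size_t Colon = Head.rfind(':'); Colon != std::string_view::npos)
    Head = Head.substr(Colon + 1);
  // Keep the leading alphabetic run, e.g. "call" from "call dword ptr ".
  std::size_t Len = 0;
  while (Len < Head.size() &&
         std::isalpha(static_cast<unsigned char>(Head[Len])))
    ++Len;
  return Head.substr(0, Len);
}
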
33990
33991bool X86TargetLowering::isInlineAsmTargetBranch(
33992 const SmallVectorImpl<StringRef> &AsmStrs, unsigned OpNo) const {
33993 // In a __asm block, __asm inst foo where inst is CALL or JMP should be
33994 // changed from indirect TargetLowering::C_Memory to direct
33995 // TargetLowering::C_Address.
33996 // We don't need to special case LOOP* and Jcc, which cannot target a memory
33997 // location.
33998 StringRef Inst = getInstrStrFromOpNo(AsmStrs, OpNo);
33999 return Inst.equals_insensitive("call") || Inst.equals_insensitive("jmp");
34000}
34001
34002/// Provide custom lowering hooks for some operations.
34003SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
34004 switch (Op.getOpcode()) {
34005   default: llvm_unreachable("Should not custom lower this!");
34006 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
34007 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
34008 return LowerCMP_SWAP(Op, Subtarget, DAG);
34009 case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
34010 case ISD::ATOMIC_LOAD_ADD:
34011 case ISD::ATOMIC_LOAD_SUB:
34012 case ISD::ATOMIC_LOAD_OR:
34013 case ISD::ATOMIC_LOAD_XOR:
34014 case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
34015 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG, Subtarget);
34016 case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
34017 case ISD::PARITY: return LowerPARITY(Op, Subtarget, DAG);
34018 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
34019 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
34020 case ISD::VECTOR_SHUFFLE: return lowerVECTOR_SHUFFLE(Op, Subtarget, DAG);
34021 case ISD::VSELECT: return LowerVSELECT(Op, DAG);
34022 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
34023 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
34024 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
34025 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
34026 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
34027 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
34028 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
34029 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
34030 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
34031 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
34032 case ISD::SHL_PARTS:
34033 case ISD::SRA_PARTS:
34034 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
34035 case ISD::FSHL:
34036 case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG);
34037 case ISD::STRICT_SINT_TO_FP:
34038 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
34039 case ISD::STRICT_UINT_TO_FP:
34040 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
34041 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
34042 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
34043 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
34044 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
34045 case ISD::ZERO_EXTEND_VECTOR_INREG:
34046 case ISD::SIGN_EXTEND_VECTOR_INREG:
34047 return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
34048 case ISD::FP_TO_SINT:
34049 case ISD::STRICT_FP_TO_SINT:
34050 case ISD::FP_TO_UINT:
34051 case ISD::STRICT_FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
34052 case ISD::FP_TO_SINT_SAT:
34053 case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG);
34054 case ISD::FP_EXTEND:
34055 case ISD::STRICT_FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
34056 case ISD::FP_ROUND:
34057 case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG);
34058 case ISD::FP16_TO_FP:
34059 case ISD::STRICT_FP16_TO_FP: return LowerFP16_TO_FP(Op, DAG);
34060 case ISD::FP_TO_FP16:
34061 case ISD::STRICT_FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
34062 case ISD::FP_TO_BF16: return LowerFP_TO_BF16(Op, DAG);
34063 case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG);
34064 case ISD::STORE: return LowerStore(Op, Subtarget, DAG);
34065 case ISD::FADD:
34066 case ISD::FSUB: return lowerFaddFsub(Op, DAG);
34067 case ISD::FROUND: return LowerFROUND(Op, DAG);
34068 case ISD::FABS:
34069 case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
34070 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
34071 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
34072 case ISD::LRINT:
34073 case ISD::LLRINT: return LowerLRINT_LLRINT(Op, DAG);
34074 case ISD::SETCC:
34075 case ISD::STRICT_FSETCC:
34076 case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG);
34077 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
34078 case ISD::SELECT: return LowerSELECT(Op, DAG);
34079 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
34080 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
34081 case ISD::VASTART: return LowerVASTART(Op, DAG);
34082 case ISD::VAARG: return LowerVAARG(Op, DAG);
34083 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
34084 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
34085 case ISD::INTRINSIC_VOID:
34086 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
34087 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
34088 case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
34089 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
34090 case ISD::FRAME_TO_ARGS_OFFSET:
34091 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
34092 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
34093 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
34094 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
34095 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
34096 case ISD::EH_SJLJ_SETUP_DISPATCH:
34097 return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
34098 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
34099 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
34100 case ISD::GET_ROUNDING: return LowerGET_ROUNDING(Op, DAG);
34101 case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG);
34102 case ISD::CTLZ:
34103 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
34104 case ISD::CTTZ:
34105 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, Subtarget, DAG);
34106 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
34107 case ISD::MULHS:
34108 case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
34109 case ISD::ROTL:
34110 case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
34111 case ISD::SRA:
34112 case ISD::SRL:
34113 case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
34114 case ISD::SADDO:
34115 case ISD::UADDO:
34116 case ISD::SSUBO:
34117 case ISD::USUBO: return LowerXALUO(Op, DAG);
34118 case ISD::SMULO:
34119 case ISD::UMULO: return LowerMULO(Op, Subtarget, DAG);
34120 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
34121 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
34122 case ISD::SADDO_CARRY:
34123 case ISD::SSUBO_CARRY:
34124 case ISD::UADDO_CARRY:
34125 case ISD::USUBO_CARRY: return LowerADDSUBO_CARRY(Op, DAG);
34126 case ISD::ADD:
34127 case ISD::SUB: return lowerAddSub(Op, DAG, Subtarget);
34128 case ISD::UADDSAT:
34129 case ISD::SADDSAT:
34130 case ISD::USUBSAT:
34131 case ISD::SSUBSAT: return LowerADDSAT_SUBSAT(Op, DAG, Subtarget);
34132 case ISD::SMAX:
34133 case ISD::SMIN:
34134 case ISD::UMAX:
34135 case ISD::UMIN: return LowerMINMAX(Op, Subtarget, DAG);
34136 case ISD::FMINIMUM:
34137 case ISD::FMAXIMUM:
34138 return LowerFMINIMUM_FMAXIMUM(Op, Subtarget, DAG);
34139 case ISD::ABS: return LowerABS(Op, Subtarget, DAG);
34140 case ISD::ABDS:
34141 case ISD::ABDU: return LowerABD(Op, Subtarget, DAG);
34142 case ISD::AVGCEILU: return LowerAVG(Op, Subtarget, DAG);
34143 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
34144 case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
34145 case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
34146 case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
34147 case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
34148 case ISD::GC_TRANSITION_START:
34149 case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION(Op, DAG);
34150 case ISD::ADDRSPACECAST: return LowerADDRSPACECAST(Op, DAG);
34151 case X86ISD::CVTPS2PH: return LowerCVTPS2PH(Op, DAG);
34152 }
34153}
34154
34155/// Replace a node with an illegal result type with a new node built out of
34156/// custom code.
34157void X86TargetLowering::ReplaceNodeResults(SDNode *N,
34158 SmallVectorImpl<SDValue>&Results,
34159 SelectionDAG &DAG) const {
34160 SDLoc dl(N);
34161 switch (N->getOpcode()) {
34162 default:
34163#ifndef NDEBUG
34164 dbgs() << "ReplaceNodeResults: ";
34165 N->dump(&DAG);
34166#endif
34167     llvm_unreachable("Do not know how to custom type legalize this operation!");
34168 case X86ISD::CVTPH2PS: {
34169 EVT VT = N->getValueType(0);
34170 SDValue Lo, Hi;
34171 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
34172 EVT LoVT, HiVT;
34173 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
34174 Lo = DAG.getNode(X86ISD::CVTPH2PS, dl, LoVT, Lo);
34175 Hi = DAG.getNode(X86ISD::CVTPH2PS, dl, HiVT, Hi);
34176 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
34177 Results.push_back(Res);
34178 return;
34179 }
34180 case X86ISD::STRICT_CVTPH2PS: {
34181 EVT VT = N->getValueType(0);
34182 SDValue Lo, Hi;
34183 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 1);
34184 EVT LoVT, HiVT;
34185 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
34186 Lo = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {LoVT, MVT::Other},
34187 {N->getOperand(0), Lo});
34188 Hi = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {HiVT, MVT::Other},
34189 {N->getOperand(0), Hi});
34190 SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
34191 Lo.getValue(1), Hi.getValue(1));
34192 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
34193 Results.push_back(Res);
34194 Results.push_back(Chain);
34195 return;
34196 }
34197 case X86ISD::CVTPS2PH:
34198 Results.push_back(LowerCVTPS2PH(SDValue(N, 0), DAG));
34199 return;
34200 case ISD::CTPOP: {
34201     assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
34202 // Use a v2i64 if possible.
34203 bool NoImplicitFloatOps =
34204 DAG.getMachineFunction().getFunction().hasFnAttribute(
34205 Attribute::NoImplicitFloat);
34206 if (isTypeLegal(MVT::v2i64) && !NoImplicitFloatOps) {
34207 SDValue Wide =
34208 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0));
34209 Wide = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Wide);
34210 // Bit count should fit in 32-bits, extract it as that and then zero
34211 // extend to i64. Otherwise we end up extracting bits 63:32 separately.
34212 Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide);
34213 Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide,
34214 DAG.getIntPtrConstant(0, dl));
34215 Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide);
34216 Results.push_back(Wide);
34217 }
34218 return;
34219 }
34220 case ISD::MUL: {
34221 EVT VT = N->getValueType(0);
34222     assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34223            VT.getVectorElementType() == MVT::i8 && "Unexpected VT!");
34224 // Pre-promote these to vXi16 to avoid op legalization thinking all 16
34225 // elements are needed.
34226 MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
34227 SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0));
34228 SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1));
34229 SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1);
34230 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
34231 unsigned NumConcats = 16 / VT.getVectorNumElements();
34232 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
34233 ConcatOps[0] = Res;
34234 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps);
34235 Results.push_back(Res);
34236 return;
34237 }
34238 case ISD::SMULO:
34239 case ISD::UMULO: {
34240 EVT VT = N->getValueType(0);
34241     assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34242            VT == MVT::v2i32 && "Unexpected VT!");
34243 bool IsSigned = N->getOpcode() == ISD::SMULO;
34244 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
34245 SDValue Op0 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(0));
34246 SDValue Op1 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(1));
34247 SDValue Res = DAG.getNode(ISD::MUL, dl, MVT::v2i64, Op0, Op1);
34248 // Extract the high 32 bits from each result using PSHUFD.
34249 // TODO: Could use SRL+TRUNCATE but that doesn't become a PSHUFD.
34250 SDValue Hi = DAG.getBitcast(MVT::v4i32, Res);
34251 Hi = DAG.getVectorShuffle(MVT::v4i32, dl, Hi, Hi, {1, 3, -1, -1});
34252 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Hi,
34253 DAG.getIntPtrConstant(0, dl));
34254
34255 // Truncate the low bits of the result. This will become PSHUFD.
34256 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
34257
34258 SDValue HiCmp;
34259 if (IsSigned) {
34260 // SMULO overflows if the high bits don't match the sign of the low.
34261 HiCmp = DAG.getNode(ISD::SRA, dl, VT, Res, DAG.getConstant(31, dl, VT));
34262 } else {
34263 // UMULO overflows if the high bits are non-zero.
34264 HiCmp = DAG.getConstant(0, dl, VT);
34265 }
34266 SDValue Ovf = DAG.getSetCC(dl, N->getValueType(1), Hi, HiCmp, ISD::SETNE);
34267
34268     // Widen the result by padding with undef.
34269 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
34270 DAG.getUNDEF(VT));
34271 Results.push_back(Res);
34272 Results.push_back(Ovf);
34273 return;
34274 }
34275 case X86ISD::VPMADDWD: {
34276 // Legalize types for X86ISD::VPMADDWD by widening.
34277     assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
34278
34279 EVT VT = N->getValueType(0);
34280 EVT InVT = N->getOperand(0).getValueType();
34281     assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 &&
34282            "Expected a VT that divides into 128 bits.");
34283     assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34284            "Unexpected type action!");
34285 unsigned NumConcat = 128 / InVT.getSizeInBits();
34286
34287 EVT InWideVT = EVT::getVectorVT(*DAG.getContext(),
34288 InVT.getVectorElementType(),
34289 NumConcat * InVT.getVectorNumElements());
34290 EVT WideVT = EVT::getVectorVT(*DAG.getContext(),
34291 VT.getVectorElementType(),
34292 NumConcat * VT.getVectorNumElements());
34293
34294 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
34295 Ops[0] = N->getOperand(0);
34296 SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
34297 Ops[0] = N->getOperand(1);
34298 SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
34299
34300 SDValue Res = DAG.getNode(N->getOpcode(), dl, WideVT, InVec0, InVec1);
34301 Results.push_back(Res);
34302 return;
34303 }
34304 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
34305 case X86ISD::FMINC:
34306 case X86ISD::FMIN:
34307 case X86ISD::FMAXC:
34308 case X86ISD::FMAX: {
34309 EVT VT = N->getValueType(0);
34310     assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
34311 SDValue UNDEF = DAG.getUNDEF(VT);
34312 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
34313 N->getOperand(0), UNDEF);
34314 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
34315 N->getOperand(1), UNDEF);
34316 Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
34317 return;
34318 }
34319 case ISD::SDIV:
34320 case ISD::UDIV:
34321 case ISD::SREM:
34322 case ISD::UREM: {
34323 EVT VT = N->getValueType(0);
34324 if (VT.isVector()) {
34325       assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34326              "Unexpected type action!");
34327 // If this RHS is a constant splat vector we can widen this and let
34328 // division/remainder by constant optimize it.
34329 // TODO: Can we do something for non-splat?
34330 APInt SplatVal;
34331 if (ISD::isConstantSplatVector(N->getOperand(1).getNode(), SplatVal)) {
34332 unsigned NumConcats = 128 / VT.getSizeInBits();
34333 SmallVector<SDValue, 8> Ops0(NumConcats, DAG.getUNDEF(VT));
34334 Ops0[0] = N->getOperand(0);
34335 EVT ResVT = getTypeToTransformTo(*DAG.getContext(), VT);
34336 SDValue N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Ops0);
34337 SDValue N1 = DAG.getConstant(SplatVal, dl, ResVT);
34338 SDValue Res = DAG.getNode(N->getOpcode(), dl, ResVT, N0, N1);
34339 Results.push_back(Res);
34340 }
34341 return;
34342 }
34343
34344 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
34345 Results.push_back(V);
34346 return;
34347 }
34348 case ISD::TRUNCATE: {
34349 MVT VT = N->getSimpleValueType(0);
34350 if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
34351 return;
34352
34353 // The generic legalizer will try to widen the input type to the same
34354 // number of elements as the widened result type. But this isn't always
34355 // the best thing so do some custom legalization to avoid some cases.
34356 MVT WidenVT = getTypeToTransformTo(*DAG.getContext(), VT).getSimpleVT();
34357 SDValue In = N->getOperand(0);
34358 EVT InVT = In.getValueType();
34359
34360 unsigned InBits = InVT.getSizeInBits();
34361 if (128 % InBits == 0) {
34362       // 128-bit and smaller inputs should avoid truncate altogether and
34363 // just use a build_vector that will become a shuffle.
34364 // TODO: Widen and use a shuffle directly?
34365 MVT InEltVT = InVT.getSimpleVT().getVectorElementType();
34366 EVT EltVT = VT.getVectorElementType();
34367 unsigned WidenNumElts = WidenVT.getVectorNumElements();
34368 SmallVector<SDValue, 16> Ops(WidenNumElts, DAG.getUNDEF(EltVT));
34369 // Use the original element count so we don't do more scalar opts than
34370 // necessary.
34371 unsigned MinElts = VT.getVectorNumElements();
34372 for (unsigned i=0; i < MinElts; ++i) {
34373 SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, In,
34374 DAG.getIntPtrConstant(i, dl));
34375 Ops[i] = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Val);
34376 }
34377 Results.push_back(DAG.getBuildVector(WidenVT, dl, Ops));
34378 return;
34379 }
34380 // With AVX512 there are some cases that can use a target specific
34381 // truncate node to go from 256/512 to less than 128 with zeros in the
34382 // upper elements of the 128 bit result.
34383 if (Subtarget.hasAVX512() && isTypeLegal(InVT)) {
34384 // We can use VTRUNC directly if for 256 bits with VLX or for any 512.
34385 if ((InBits == 256 && Subtarget.hasVLX()) || InBits == 512) {
34386 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
34387 return;
34388 }
34389 // There's one case we can widen to 512 bits and use VTRUNC.
34390 if (InVT == MVT::v4i64 && VT == MVT::v4i8 && isTypeLegal(MVT::v8i64)) {
34391 In = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i64, In,
34392 DAG.getUNDEF(MVT::v4i64));
34393 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
34394 return;
34395 }
34396 }
34397 if (Subtarget.hasVLX() && InVT == MVT::v8i64 && VT == MVT::v8i8 &&
34398 getTypeAction(*DAG.getContext(), InVT) == TypeSplitVector &&
34399 isTypeLegal(MVT::v4i64)) {
34400 // Input needs to be split and output needs to widened. Let's use two
34401 // VTRUNCs, and shuffle their results together into the wider type.
34402 SDValue Lo, Hi;
34403 std::tie(Lo, Hi) = DAG.SplitVector(In, dl);
34404
34405 Lo = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Lo);
34406 Hi = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Hi);
34407 SDValue Res = DAG.getVectorShuffle(MVT::v16i8, dl, Lo, Hi,
34408 { 0, 1, 2, 3, 16, 17, 18, 19,
34409 -1, -1, -1, -1, -1, -1, -1, -1 });
34410 Results.push_back(Res);
34411 return;
34412 }
34413
34414 return;
34415 }
34416 case ISD::ANY_EXTEND:
34417 // Right now, only MVT::v8i8 has Custom action for an illegal type.
34418 // It's intended to custom handle the input type.
34419     assert(N->getValueType(0) == MVT::v8i8 &&
34420            "Do not know how to legalize this Node");
34421 return;
34422 case ISD::SIGN_EXTEND:
34423 case ISD::ZERO_EXTEND: {
34424 EVT VT = N->getValueType(0);
34425 SDValue In = N->getOperand(0);
34426 EVT InVT = In.getValueType();
34427 if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
34428 (InVT == MVT::v4i16 || InVT == MVT::v4i8)){
34429       assert(getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector &&
34430              "Unexpected type action!");
34431       assert(N->getOpcode() == ISD::SIGN_EXTEND && "Unexpected opcode");
34432 // Custom split this so we can extend i8/i16->i32 invec. This is better
34433 // since sign_extend_inreg i8/i16->i64 requires an extend to i32 using
34434 // sra. Then extending from i32 to i64 using pcmpgt. By custom splitting
34435 // we allow the sra from the extend to i32 to be shared by the split.
34436 In = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, In);
34437
34438 // Fill a vector with sign bits for each element.
34439 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
34440 SDValue SignBits = DAG.getSetCC(dl, MVT::v4i32, Zero, In, ISD::SETGT);
34441
34442 // Create an unpackl and unpackh to interleave the sign bits then bitcast
34443 // to v2i64.
34444 SDValue Lo = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
34445 {0, 4, 1, 5});
34446 Lo = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Lo);
34447 SDValue Hi = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
34448 {2, 6, 3, 7});
34449 Hi = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Hi);
34450
34451 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
34452 Results.push_back(Res);
34453 return;
34454 }
34455
34456 if (VT == MVT::v16i32 || VT == MVT::v8i64) {
34457 if (!InVT.is128BitVector()) {
34458 // Not a 128 bit vector, but maybe type legalization will promote
34459 // it to 128 bits.
34460 if (getTypeAction(*DAG.getContext(), InVT) != TypePromoteInteger)
34461 return;
34462 InVT = getTypeToTransformTo(*DAG.getContext(), InVT);
34463 if (!InVT.is128BitVector())
34464 return;
34465
34466 // Promote the input to 128 bits. Type legalization will turn this into
34467 // zext_inreg/sext_inreg.
34468 In = DAG.getNode(N->getOpcode(), dl, InVT, In);
34469 }
34470
34471 // Perform custom splitting instead of the two stage extend we would get
34472 // by default.
34473 EVT LoVT, HiVT;
34474 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
34475       assert(isTypeLegal(LoVT) && "Split VT not legal?");
34476
34477 SDValue Lo = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, LoVT, In, DAG);
34478
34479 // We need to shift the input over by half the number of elements.
34480 unsigned NumElts = InVT.getVectorNumElements();
34481 unsigned HalfNumElts = NumElts / 2;
34482 SmallVector<int, 16> ShufMask(NumElts, SM_SentinelUndef);
34483 for (unsigned i = 0; i != HalfNumElts; ++i)
34484 ShufMask[i] = i + HalfNumElts;
34485
34486 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
34487 Hi = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, HiVT, Hi, DAG);
34488
34489 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
34490 Results.push_back(Res);
34491 }
34492 return;
34493 }
34494 case ISD::FP_TO_SINT:
34495 case ISD::STRICT_FP_TO_SINT:
34496 case ISD::FP_TO_UINT:
34497 case ISD::STRICT_FP_TO_UINT: {
34498 bool IsStrict = N->isStrictFPOpcode();
34499 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT ||
34500 N->getOpcode() == ISD::STRICT_FP_TO_SINT;
34501 EVT VT = N->getValueType(0);
34502 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34503 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
34504 EVT SrcVT = Src.getValueType();
34505
34506 SDValue Res;
34507 if (isSoftFP16(SrcVT)) {
34508 EVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
34509 if (IsStrict) {
34510 Res =
34511 DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
34512 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
34513 {NVT, MVT::Other}, {Chain, Src})});
34514 Chain = Res.getValue(1);
34515 } else {
34516 Res = DAG.getNode(N->getOpcode(), dl, VT,
34517 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
34518 }
34519 Results.push_back(Res);
34520 if (IsStrict)
34521 Results.push_back(Chain);
34522
34523 return;
34524 }
34525
34526 if (VT.isVector() && Subtarget.hasFP16() &&
34527 SrcVT.getVectorElementType() == MVT::f16) {
34528 EVT EleVT = VT.getVectorElementType();
34529 EVT ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
34530
34531 if (SrcVT != MVT::v8f16) {
34532 SDValue Tmp =
34533 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
34534 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
34535 Ops[0] = Src;
34536 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
34537 }
34538
34539 if (IsStrict) {
34540 unsigned Opc =
34541 IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
34542 Res =
34543 DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {N->getOperand(0), Src});
34544 Chain = Res.getValue(1);
34545 } else {
34546 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
34547 Res = DAG.getNode(Opc, dl, ResVT, Src);
34548 }
34549
34550 // TODO: Need to add exception check code for strict FP.
34551 if (EleVT.getSizeInBits() < 16) {
34552 MVT TmpVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8);
34553 Res = DAG.getNode(ISD::TRUNCATE, dl, TmpVT, Res);
34554
34555 // Now widen to 128 bits.
34556 unsigned NumConcats = 128 / TmpVT.getSizeInBits();
34557 MVT ConcatVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8 * NumConcats);
34558 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(TmpVT));
34559 ConcatOps[0] = Res;
34560 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
34561 }
34562
34563 Results.push_back(Res);
34564 if (IsStrict)
34565 Results.push_back(Chain);
34566
34567 return;
34568 }
34569
34570 if (VT.isVector() && VT.getScalarSizeInBits() < 32) {
34571       assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34572              "Unexpected type action!");
34573
34574 // Try to create a 128 bit vector, but don't exceed a 32 bit element.
34575 unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U);
34576 MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth),
34577 VT.getVectorNumElements());
34578 SDValue Res;
34579 SDValue Chain;
34580 if (IsStrict) {
34581 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {PromoteVT, MVT::Other},
34582 {N->getOperand(0), Src});
34583 Chain = Res.getValue(1);
34584 } else
34585 Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src);
34586
34587 // Preserve what we know about the size of the original result. If the
34588 // result is v2i32, we have to manually widen the assert.
34589 if (PromoteVT == MVT::v2i32)
34590 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
34591 DAG.getUNDEF(MVT::v2i32));
34592
34593 Res = DAG.getNode(!IsSigned ? ISD::AssertZext : ISD::AssertSext, dl,
34594 Res.getValueType(), Res,
34595 DAG.getValueType(VT.getVectorElementType()));
34596
34597 if (PromoteVT == MVT::v2i32)
34598 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
34599 DAG.getIntPtrConstant(0, dl));
34600
34601 // Truncate back to the original width.
34602 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
34603
34604 // Now widen to 128 bits.
34605 unsigned NumConcats = 128 / VT.getSizeInBits();
34606 MVT ConcatVT = MVT::getVectorVT(VT.getSimpleVT().getVectorElementType(),
34607 VT.getVectorNumElements() * NumConcats);
34608 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
34609 ConcatOps[0] = Res;
34610 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
34611 Results.push_back(Res);
34612 if (IsStrict)
34613 Results.push_back(Chain);
34614 return;
34615 }
34616
34617
34618 if (VT == MVT::v2i32) {
34619       assert((!IsStrict || IsSigned || Subtarget.hasAVX512()) &&
34620              "Strict unsigned conversion requires AVX512");
34621       assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
34622       assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34623              "Unexpected type action!");
34624 if (Src.getValueType() == MVT::v2f64) {
34625 if (!IsSigned && !Subtarget.hasAVX512()) {
34626 SDValue Res =
34627 expandFP_TO_UINT_SSE(MVT::v4i32, Src, dl, DAG, Subtarget);
34628 Results.push_back(Res);
34629 return;
34630 }
34631
34632 unsigned Opc;
34633 if (IsStrict)
34634 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
34635 else
34636 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
34637
34638         // If we have VLX we can emit a target-specific FP_TO_UINT node.
34639 if (!IsSigned && !Subtarget.hasVLX()) {
34640 // Otherwise we can defer to the generic legalizer which will widen
34641 // the input as well. This will be further widened during op
34642 // legalization to v8i32<-v8f64.
34643 // For strict nodes we'll need to widen ourselves.
34644 // FIXME: Fix the type legalizer to safely widen strict nodes?
34645 if (!IsStrict)
34646 return;
34647 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64, Src,
34648 DAG.getConstantFP(0.0, dl, MVT::v2f64));
34649 Opc = N->getOpcode();
34650 }
34651 SDValue Res;
34652 SDValue Chain;
34653 if (IsStrict) {
34654 Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
34655 {N->getOperand(0), Src});
34656 Chain = Res.getValue(1);
34657 } else {
34658 Res = DAG.getNode(Opc, dl, MVT::v4i32, Src);
34659 }
34660 Results.push_back(Res);
34661 if (IsStrict)
34662 Results.push_back(Chain);
34663 return;
34664 }
34665
34666 // Custom widen strict v2f32->v2i32 by padding with zeros.
34667 // FIXME: Should generic type legalizer do this?
34668 if (Src.getValueType() == MVT::v2f32 && IsStrict) {
34669 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
34670 DAG.getConstantFP(0.0, dl, MVT::v2f32));
34671 SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4i32, MVT::Other},
34672 {N->getOperand(0), Src});
34673 Results.push_back(Res);
34674 Results.push_back(Res.getValue(1));
34675 return;
34676 }
34677
34678 // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
34679 // so early out here.
34680 return;
34681 }
34682
34683 assert(!VT.isVector() && "Vectors should have been handled above!");
34684
34685 if ((Subtarget.hasDQI() && VT == MVT::i64 &&
34686 (SrcVT == MVT::f32 || SrcVT == MVT::f64)) ||
34687 (Subtarget.hasFP16() && SrcVT == MVT::f16)) {
34688 assert(!Subtarget.is64Bit() && "i64 should be legal");
34689 unsigned NumElts = Subtarget.hasVLX() ? 2 : 8;
34690 // If we use a 128-bit result we might need to use a target specific node.
34691 unsigned SrcElts =
34692 std::max(NumElts, 128U / (unsigned)SrcVT.getSizeInBits());
34693 MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts);
34694 MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), SrcElts);
34695 unsigned Opc = N->getOpcode();
34696 if (NumElts != SrcElts) {
34697 if (IsStrict)
34698 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
34699 else
34700 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
34701 }
34702
34703 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
34704 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
34705 DAG.getConstantFP(0.0, dl, VecInVT), Src,
34706 ZeroIdx);
34707 SDValue Chain;
34708 if (IsStrict) {
34709 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
34710 Res = DAG.getNode(Opc, SDLoc(N), Tys, N->getOperand(0), Res);
34711 Chain = Res.getValue(1);
34712 } else
34713 Res = DAG.getNode(Opc, SDLoc(N), VecVT, Res);
34714 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx);
34715 Results.push_back(Res);
34716 if (IsStrict)
34717 Results.push_back(Chain);
34718 return;
34719 }
34720
34721 if (VT == MVT::i128 && Subtarget.isTargetWin64()) {
34722 SDValue Chain;
34723 SDValue V = LowerWin64_FP_TO_INT128(SDValue(N, 0), DAG, Chain);
34724 Results.push_back(V);
34725 if (IsStrict)
34726 Results.push_back(Chain);
34727 return;
34728 }
34729
34730 if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, Chain)) {
34731 Results.push_back(V);
34732 if (IsStrict)
34733 Results.push_back(Chain);
34734 }
34735 return;
34736 }
34737 case ISD::LRINT:
34738 case ISD::LLRINT: {
34739 if (SDValue V = LRINT_LLRINTHelper(N, DAG))
34740 Results.push_back(V);
34741 return;
34742 }
34743
34744 case ISD::SINT_TO_FP:
34745 case ISD::STRICT_SINT_TO_FP:
34746 case ISD::UINT_TO_FP:
34747 case ISD::STRICT_UINT_TO_FP: {
34748 bool IsStrict = N->isStrictFPOpcode();
34749 bool IsSigned = N->getOpcode() == ISD::SINT_TO_FP ||
34750 N->getOpcode() == ISD::STRICT_SINT_TO_FP;
34751 EVT VT = N->getValueType(0);
34752 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34753 if (VT.getVectorElementType() == MVT::f16 && Subtarget.hasFP16() &&
34754 Subtarget.hasVLX()) {
34755 if (Src.getValueType().getVectorElementType() == MVT::i16)
34756 return;
34757
34758 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2i32)
34759 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
34760 IsStrict ? DAG.getConstant(0, dl, MVT::v2i32)
34761 : DAG.getUNDEF(MVT::v2i32));
34762 if (IsStrict) {
34763 unsigned Opc =
34764 IsSigned ? X86ISD::STRICT_CVTSI2P : X86ISD::STRICT_CVTUI2P;
34765 SDValue Res = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
34766 {N->getOperand(0), Src});
34767 Results.push_back(Res);
34768 Results.push_back(Res.getValue(1));
34769 } else {
34770 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
34771 Results.push_back(DAG.getNode(Opc, dl, MVT::v8f16, Src));
34772 }
34773 return;
34774 }
34775 if (VT != MVT::v2f32)
34776 return;
34777 EVT SrcVT = Src.getValueType();
34778 if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
34779 if (IsStrict) {
34780 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTSI2P
34781 : X86ISD::STRICT_CVTUI2P;
34782 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
34783 {N->getOperand(0), Src});
34784 Results.push_back(Res);
34785 Results.push_back(Res.getValue(1));
34786 } else {
34787 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
34788 Results.push_back(DAG.getNode(Opc, dl, MVT::v4f32, Src));
34789 }
34790 return;
34791 }
34792 if (SrcVT == MVT::v2i64 && !IsSigned && Subtarget.is64Bit() &&
34793 Subtarget.hasSSE41() && !Subtarget.hasAVX512()) {
34794 SDValue Zero = DAG.getConstant(0, dl, SrcVT);
34795 SDValue One = DAG.getConstant(1, dl, SrcVT);
34796 SDValue Sign = DAG.getNode(ISD::OR, dl, SrcVT,
34797 DAG.getNode(ISD::SRL, dl, SrcVT, Src, One),
34798 DAG.getNode(ISD::AND, dl, SrcVT, Src, One));
34799 SDValue IsNeg = DAG.getSetCC(dl, MVT::v2i64, Src, Zero, ISD::SETLT);
34800 SDValue SignSrc = DAG.getSelect(dl, SrcVT, IsNeg, Sign, Src);
34801 SmallVector<SDValue, 4> SignCvts(4, DAG.getConstantFP(0.0, dl, MVT::f32));
34802 for (int i = 0; i != 2; ++i) {
34803 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
34804 SignSrc, DAG.getIntPtrConstant(i, dl));
34805 if (IsStrict)
34806 SignCvts[i] =
34807 DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {MVT::f32, MVT::Other},
34808 {N->getOperand(0), Elt});
34809 else
34810 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Elt);
34811 }
34812 SDValue SignCvt = DAG.getBuildVector(MVT::v4f32, dl, SignCvts);
34813 SDValue Slow, Chain;
34814 if (IsStrict) {
34815 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
34816 SignCvts[0].getValue(1), SignCvts[1].getValue(1));
34817 Slow = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v4f32, MVT::Other},
34818 {Chain, SignCvt, SignCvt});
34819 Chain = Slow.getValue(1);
34820 } else {
34821 Slow = DAG.getNode(ISD::FADD, dl, MVT::v4f32, SignCvt, SignCvt);
34822 }
34823 IsNeg = DAG.getBitcast(MVT::v4i32, IsNeg);
34824 IsNeg =
34825 DAG.getVectorShuffle(MVT::v4i32, dl, IsNeg, IsNeg, {1, 3, -1, -1});
34826 SDValue Cvt = DAG.getSelect(dl, MVT::v4f32, IsNeg, Slow, SignCvt);
34827 Results.push_back(Cvt);
34828 if (IsStrict)
34829 Results.push_back(Chain);
34830 return;
34831 }
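The shift/select sequence above is the usual way to convert unsigned 64-bit values when only a signed int64 -> float conversion is available. A minimal scalar sketch of the same idea, with illustrative names that are not part of this file:

  #include <cstdint>

  float U64ToF32ViaSigned(uint64_t X) {
    if ((int64_t)X >= 0)               // High bit clear: signed convert is exact.
      return (float)(int64_t)X;
    // High bit set: halve with a sticky low bit so the final rounding is
    // preserved, convert the now non-negative value, then double it.
    uint64_t Halved = (X >> 1) | (X & 1);
    return (float)(int64_t)Halved * 2.0f;
  }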
34832
34833 if (SrcVT != MVT::v2i32)
34834 return;
34835
34836 if (IsSigned || Subtarget.hasAVX512()) {
34837 if (!IsStrict)
34838 return;
34839
34840 // Custom widen strict v2i32->v2f32 to avoid scalarization.
34841 // FIXME: Should generic type legalizer do this?
34842 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
34843 DAG.getConstant(0, dl, MVT::v2i32));
34844 SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
34845 {N->getOperand(0), Src});
34846 Results.push_back(Res);
34847 Results.push_back(Res.getValue(1));
34848 return;
34849 }
34850
34851 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
34852 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
34853 SDValue VBias = DAG.getConstantFP(
34854 llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::v2f64);
34855 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
34856 DAG.getBitcast(MVT::v2i64, VBias));
34857 Or = DAG.getBitcast(MVT::v2f64, Or);
34858 if (IsStrict) {
34859 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},
34860 {N->getOperand(0), Or, VBias});
34861 SDValue Res = DAG.getNode(X86ISD::STRICT_VFPROUND, dl,
34862 {MVT::v4f32, MVT::Other},
34863 {Sub.getValue(1), Sub});
34864 Results.push_back(Res);
34865 Results.push_back(Res.getValue(1));
34866 } else {
34867 // TODO: Are there any fast-math-flags to propagate here?
34868 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
34869 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
34870 }
34871 return;
34872 }
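The ZExtIn/VBias sequence above is the classic exponent-bias trick for unsigned 32-bit to double conversion. A scalar sketch under the same assumption (names are illustrative, not from this file):

  #include <cstdint>
  #include <cstring>

  double U32ToF64ViaBias(uint32_t X) {
    // 0x4330000000000000 is the bit pattern of 2^52; its low 32 mantissa bits
    // are free, so OR-ing in X yields a double equal to 2^52 + X exactly.
    uint64_t Bits = 0x4330000000000000ULL | (uint64_t)X;
    double D;
    std::memcpy(&D, &Bits, sizeof(D));
    return D - 4503599627370496.0;     // subtract 2^52 to recover X
  }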
34873 case ISD::STRICT_FP_ROUND:
34874 case ISD::FP_ROUND: {
34875 bool IsStrict = N->isStrictFPOpcode();
34876 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
34877 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34878 SDValue Rnd = N->getOperand(IsStrict ? 2 : 1);
34879 EVT SrcVT = Src.getValueType();
34880 EVT VT = N->getValueType(0);
34881 SDValue V;
34882 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2f32) {
34883 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f32)
34884 : DAG.getUNDEF(MVT::v2f32);
34885 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, Ext);
34886 }
34887 if (!Subtarget.hasFP16() && VT.getVectorElementType() == MVT::f16) {
34888 assert(Subtarget.hasF16C() && "Cannot widen f16 without F16C");
34889 if (SrcVT.getVectorElementType() != MVT::f32)
34890 return;
34891
34892 if (IsStrict)
34893 V = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
34894 {Chain, Src, Rnd});
34895 else
34896 V = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Src, Rnd);
34897
34898 Results.push_back(DAG.getBitcast(MVT::v8f16, V));
34899 if (IsStrict)
34900 Results.push_back(V.getValue(1));
34901 return;
34902 }
34903 if (!isTypeLegal(Src.getValueType()))
34904 return;
34905 EVT NewVT = VT.getVectorElementType() == MVT::f16 ? MVT::v8f16 : MVT::v4f32;
34906 if (IsStrict)
34907 V = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {NewVT, MVT::Other},
34908 {Chain, Src});
34909 else
34910 V = DAG.getNode(X86ISD::VFPROUND, dl, NewVT, Src);
34911 Results.push_back(V);
34912 if (IsStrict)
34913 Results.push_back(V.getValue(1));
34914 return;
34915 }
34916 case ISD::FP_EXTEND:
34917 case ISD::STRICT_FP_EXTEND: {
34918 // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
34919 // No other ValueType for FP_EXTEND should reach this point.
34920 assert(N->getValueType(0) == MVT::v2f32 &&
34921 "Do not know how to legalize this Node");
34922 if (!Subtarget.hasFP16() || !Subtarget.hasVLX())
34923 return;
34924 bool IsStrict = N->isStrictFPOpcode();
34925 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34926 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f16)
34927 : DAG.getUNDEF(MVT::v2f16);
34928 SDValue V = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f16, Src, Ext);
34929 if (IsStrict)
34930 V = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::v4f32, MVT::Other},
34931 {N->getOperand(0), V});
34932 else
34933 V = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, V);
34934 Results.push_back(V);
34935 if (IsStrict)
34936 Results.push_back(V.getValue(1));
34937 return;
34938 }
34939 case ISD::INTRINSIC_W_CHAIN: {
34940 unsigned IntNo = N->getConstantOperandVal(1);
34941 switch (IntNo) {
34942 default : llvm_unreachable("Do not know how to custom type "
34943 "legalize this intrinsic operation!");
34944 case Intrinsic::x86_rdtsc:
34945 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget,
34946 Results);
34947 case Intrinsic::x86_rdtscp:
34948 return getReadTimeStampCounter(N, dl, X86::RDTSCP, DAG, Subtarget,
34949 Results);
34950 case Intrinsic::x86_rdpmc:
34951 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPMC, X86::ECX, Subtarget,
34952 Results);
34953 return;
34954 case Intrinsic::x86_rdpru:
34955 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPRU, X86::ECX, Subtarget,
34956 Results);
34957 return;
34958 case Intrinsic::x86_xgetbv:
34959 expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget,
34960 Results);
34961 return;
34962 }
34963 }
34964 case ISD::READCYCLECOUNTER: {
34965 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results);
34966 }
34967 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
34968 EVT T = N->getValueType(0);
34969 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
34970 bool Regs64bit = T == MVT::i128;
34971 assert((!Regs64bit || Subtarget.canUseCMPXCHG16B()) &&
34972 "64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B");
34973 MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
34974 SDValue cpInL, cpInH;
34975 std::tie(cpInL, cpInH) =
34976 DAG.SplitScalar(N->getOperand(2), dl, HalfT, HalfT);
34977 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
34978 Regs64bit ? X86::RAX : X86::EAX, cpInL, SDValue());
34979 cpInH =
34980 DAG.getCopyToReg(cpInL.getValue(0), dl, Regs64bit ? X86::RDX : X86::EDX,
34981 cpInH, cpInL.getValue(1));
34982 SDValue swapInL, swapInH;
34983 std::tie(swapInL, swapInH) =
34984 DAG.SplitScalar(N->getOperand(3), dl, HalfT, HalfT);
34985 swapInH =
34986 DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
34987 swapInH, cpInH.getValue(1));
34988
34989 // In 64-bit mode we might need the base pointer in RBX, but we can't know
34990 // until later. So we keep the RBX input in a vreg and use a custom
34991 // inserter.
34992 // Since RBX will be a reserved register, the register allocator will not
34993 // make sure its value is properly saved and restored around this
34994 // live-range.
34995 SDValue Result;
34996 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
34997 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
34998 if (Regs64bit) {
34999 SDValue Ops[] = {swapInH.getValue(0), N->getOperand(1), swapInL,
35000 swapInH.getValue(1)};
35001 Result =
35002 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG16_DAG, dl, Tys, Ops, T, MMO);
35003 } else {
35004 swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl, X86::EBX, swapInL,
35005 swapInH.getValue(1));
35006 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
35007 swapInL.getValue(1)};
35008 Result =
35009 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, T, MMO);
35010 }
35011
35012 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
35013 Regs64bit ? X86::RAX : X86::EAX,
35014 HalfT, Result.getValue(1));
35015 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
35016 Regs64bit ? X86::RDX : X86::EDX,
35017 HalfT, cpOutL.getValue(2));
35018 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
35019
35020 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
35021 MVT::i32, cpOutH.getValue(2));
35022 SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
35023 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
35024
35025 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
35026 Results.push_back(Success);
35027 Results.push_back(EFLAGS.getValue(1));
35028 return;
35029 }
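For orientation, the register setup above (expected value in EDX:EAX / RDX:RAX, replacement in ECX:EBX / RCX:RBX) is the CMPXCHG8B/CMPXCHG16B convention. A hedged source-level sketch of the operation being lowered, using names that are not from this file:

  #include <atomic>
  #include <cstdint>

  // On 32-bit x86 this typically compiles to LOCK CMPXCHG8B with the expected
  // value in EDX:EAX and the replacement in ECX:EBX.
  bool CAS64(std::atomic<uint64_t> &A, uint64_t &Expected, uint64_t Desired) {
    return A.compare_exchange_strong(Expected, Desired);
  }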
35030 case ISD::ATOMIC_LOAD: {
35031 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
35032 bool NoImplicitFloatOps =
35033 DAG.getMachineFunction().getFunction().hasFnAttribute(
35034 Attribute::NoImplicitFloat);
35035 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
35036 auto *Node = cast<AtomicSDNode>(N);
35037 if (Subtarget.hasSSE1()) {
35038 // Use a VZEXT_LOAD which will be selected as MOVQ or XORPS+MOVLPS.
35039 // Then extract the lower 64-bits.
35040 MVT LdVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
35041 SDVTList Tys = DAG.getVTList(LdVT, MVT::Other);
35042 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
35043 SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
35044 MVT::i64, Node->getMemOperand());
35045 if (Subtarget.hasSSE2()) {
35046 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
35047 DAG.getIntPtrConstant(0, dl));
35048 Results.push_back(Res);
35049 Results.push_back(Ld.getValue(1));
35050 return;
35051 }
35052 // We use an alternative sequence for SSE1 that extracts as v2f32 and
35053 // then casts to i64. This avoids a 128-bit stack temporary being
35054 // created by type legalization if we were to cast v4f32->v2i64.
35055 SDValue Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Ld,
35056 DAG.getIntPtrConstant(0, dl));
35057 Res = DAG.getBitcast(MVT::i64, Res);
35058 Results.push_back(Res);
35059 Results.push_back(Ld.getValue(1));
35060 return;
35061 }
35062 if (Subtarget.hasX87()) {
35063 // First load this into an 80-bit X87 register. This will put the whole
35064 // integer into the significand.
35065 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
35066 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
35067 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD,
35068 dl, Tys, Ops, MVT::i64,
35069 Node->getMemOperand());
35070 SDValue Chain = Result.getValue(1);
35071
35072 // Now store the X87 register to a stack temporary and convert to i64.
35073 // This store is not atomic and doesn't need to be.
35074 // FIXME: We don't need a stack temporary if the result of the load
35075 // is already being stored. We could just directly store there.
35076 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
35077 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
35078 MachinePointerInfo MPI =
35079 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
35080 SDValue StoreOps[] = { Chain, Result, StackPtr };
35081 Chain = DAG.getMemIntrinsicNode(
35082 X86ISD::FIST, dl, DAG.getVTList(MVT::Other), StoreOps, MVT::i64,
35083 MPI, std::nullopt /*Align*/, MachineMemOperand::MOStore);
35084
35085 // Finally load the value back from the stack temporary and return it.
35086 // This load is not atomic and doesn't need to be.
35087 // This load will be further type legalized.
35088 Result = DAG.getLoad(MVT::i64, dl, Chain, StackPtr, MPI);
35089 Results.push_back(Result);
35090 Results.push_back(Result.getValue(1));
35091 return;
35092 }
35093 }
35094 // TODO: Use MOVLPS when SSE1 is available?
35095 // Delegate to generic TypeLegalization. Situations we can really handle
35096 // should have already been dealt with by AtomicExpandPass.cpp.
35097 break;
35098 }
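A note on why the SSE and x87 paths above exist (sketch with illustrative names): on 32-bit x86 a 64-bit load done as two 32-bit moves is not atomic, so the lowering uses a single 8-byte access (MOVQ with SSE2, or FILD/FISTP through a stack slot on x87-only targets). At the source level this is roughly what sits behind:

  #include <atomic>
  #include <cstdint>

  uint64_t AtomicLoad64(const std::atomic<uint64_t> &V) {
    // Expected to become a single 8-byte load on i386-class targets rather
    // than a pair of 32-bit loads.
    return V.load(std::memory_order_seq_cst);
  }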
35099 case ISD::ATOMIC_SWAP:
35100 case ISD::ATOMIC_LOAD_ADD:
35101 case ISD::ATOMIC_LOAD_SUB:
35102 case ISD::ATOMIC_LOAD_AND:
35103 case ISD::ATOMIC_LOAD_OR:
35104 case ISD::ATOMIC_LOAD_XOR:
35105 case ISD::ATOMIC_LOAD_NAND:
35106 case ISD::ATOMIC_LOAD_MIN:
35107 case ISD::ATOMIC_LOAD_MAX:
35108 case ISD::ATOMIC_LOAD_UMIN:
35109 case ISD::ATOMIC_LOAD_UMAX:
35110 // Delegate to generic TypeLegalization. Situations we can really handle
35111 // should have already been dealt with by AtomicExpandPass.cpp.
35112 break;
35113
35114 case ISD::BITCAST: {
35115 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
35116 EVT DstVT = N->getValueType(0);
35117 EVT SrcVT = N->getOperand(0).getValueType();
35118
35119 // If this is a bitcast from a v64i1 k-register to a i64 on a 32-bit target
35120 // we can split using the k-register rather than memory.
35121 if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) {
35122 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
35123 SDValue Lo, Hi;
35124 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
35125 Lo = DAG.getBitcast(MVT::i32, Lo);
35126 Hi = DAG.getBitcast(MVT::i32, Hi);
35127 SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
35128 Results.push_back(Res);
35129 return;
35130 }
35131
35132 if (DstVT.isVector() && SrcVT == MVT::x86mmx) {
35133 // FIXME: Use v4f32 for SSE1?
35134 assert(Subtarget.hasSSE2() && "Requires SSE2");
35135 assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector &&
35136 "Unexpected type action!");
35137 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT);
35138 SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64,
35139 N->getOperand(0));
35140 Res = DAG.getBitcast(WideVT, Res);
35141 Results.push_back(Res);
35142 return;
35143 }
35144
35145 return;
35146 }
35147 case ISD::MGATHER: {
35148 EVT VT = N->getValueType(0);
35149 if ((VT == MVT::v2f32 || VT == MVT::v2i32) &&
35150 (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
35151 auto *Gather = cast<MaskedGatherSDNode>(N);
35152 SDValue Index = Gather->getIndex();
35153 if (Index.getValueType() != MVT::v2i64)
35154 return;
35155 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
35156 "Unexpected type action!");
35157 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
35158 SDValue Mask = Gather->getMask();
35159 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
35160 SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT,
35161 Gather->getPassThru(),
35162 DAG.getUNDEF(VT));
35163 if (!Subtarget.hasVLX()) {
35164 // We need to widen the mask, but the instruction will only use 2
35165 // of its elements. So we can use undef.
35166 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
35167 DAG.getUNDEF(MVT::v2i1));
35168 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
35169 }
35170 SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
35171 Gather->getBasePtr(), Index, Gather->getScale() };
35172 SDValue Res = DAG.getMemIntrinsicNode(
35173 X86ISD::MGATHER, dl, DAG.getVTList(WideVT, MVT::Other), Ops,
35174 Gather->getMemoryVT(), Gather->getMemOperand());
35175 Results.push_back(Res);
35176 Results.push_back(Res.getValue(1));
35177 return;
35178 }
35179 return;
35180 }
35181 case ISD::LOAD: {
35182 // Use an f64/i64 load and a scalar_to_vector for v2f32/v2i32 loads. This
35183 // avoids scalarizing in 32-bit mode. In 64-bit mode this avoids an int->fp
35184 // cast since type legalization will try to use an i64 load.
35185 MVT VT = N->getSimpleValueType(0);
35186 assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT");
35187 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
35188 "Unexpected type action!");
35189 if (!ISD::isNON_EXTLoad(N))
35190 return;
35191 auto *Ld = cast<LoadSDNode>(N);
35192 if (Subtarget.hasSSE2()) {
35193 MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
35194 SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
35195 Ld->getPointerInfo(), Ld->getOriginalAlign(),
35196 Ld->getMemOperand()->getFlags());
35197 SDValue Chain = Res.getValue(1);
35198 MVT VecVT = MVT::getVectorVT(LdVT, 2);
35199 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Res);
35200 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
35201 Res = DAG.getBitcast(WideVT, Res);
35202 Results.push_back(Res);
35203 Results.push_back(Chain);
35204 return;
35205 }
35206 assert(Subtarget.hasSSE1() && "Expected SSE");
35207 SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other);
35208 SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
35209 SDValue Res = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
35210 MVT::i64, Ld->getMemOperand());
35211 Results.push_back(Res);
35212 Results.push_back(Res.getValue(1));
35213 return;
35214 }
35215 case ISD::ADDRSPACECAST: {
35216 SDValue V = LowerADDRSPACECAST(SDValue(N,0), DAG);
35217 Results.push_back(V);
35218 return;
35219 }
35220 case ISD::BITREVERSE: {
35221 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
35222 assert(Subtarget.hasXOP() && "Expected XOP");
35223 // We can use VPPERM by copying to a vector register and back. We'll need
35224 // to move the scalar in two i32 pieces.
35225 Results.push_back(LowerBITREVERSE(SDValue(N, 0), Subtarget, DAG));
35226 return;
35227 }
35228 case ISD::EXTRACT_VECTOR_ELT: {
35229 // f16 = extract vXf16 %vec, i64 %idx
35230 assert(N->getSimpleValueType(0) == MVT::f16 &&
35231 "Unexpected Value type of EXTRACT_VECTOR_ELT!");
35232 assert(Subtarget.hasFP16() && "Expected FP16");
35233 SDValue VecOp = N->getOperand(0);
35234 EVT ExtVT = VecOp.getValueType().changeVectorElementTypeToInteger();
35235 SDValue Split = DAG.getBitcast(ExtVT, N->getOperand(0));
35236 Split = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Split,
35237 N->getOperand(1));
35238 Split = DAG.getBitcast(MVT::f16, Split);
35239 Results.push_back(Split);
35240 return;
35241 }
35242 }
35243}
35244
35245const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
35246 switch ((X86ISD::NodeType)Opcode) {
35247 case X86ISD::FIRST_NUMBER: break;
35248#define NODE_NAME_CASE(NODE) case X86ISD::NODE: return "X86ISD::" #NODE;
35249 NODE_NAME_CASE(BSF)
35250 NODE_NAME_CASE(BSR)
35251 NODE_NAME_CASE(FSHL)
35252 NODE_NAME_CASE(FSHR)
35253 NODE_NAME_CASE(FAND)
35254 NODE_NAME_CASE(FANDN)
35255 NODE_NAME_CASE(FOR)
35256 NODE_NAME_CASE(FXOR)
35257 NODE_NAME_CASE(FILD)
35258 NODE_NAME_CASE(FIST)
35259 NODE_NAME_CASE(FP_TO_INT_IN_MEM)
35260 NODE_NAME_CASE(FLD)
35261 NODE_NAME_CASE(FST)
35262 NODE_NAME_CASE(CALL)
35263 NODE_NAME_CASE(CALL_RVMARKER)
35264 NODE_NAME_CASE(BT)
35265 NODE_NAME_CASE(CMP)
35266 NODE_NAME_CASE(FCMP)
35267 NODE_NAME_CASE(STRICT_FCMP)
35268 NODE_NAME_CASE(STRICT_FCMPS)
35269 NODE_NAME_CASE(COMI)
35270 NODE_NAME_CASE(UCOMI)
35271 NODE_NAME_CASE(CMPM)
35272 NODE_NAME_CASE(CMPMM)
35273 NODE_NAME_CASE(STRICT_CMPM)
35274 NODE_NAME_CASE(CMPMM_SAE)
35275 NODE_NAME_CASE(SETCC)
35276 NODE_NAME_CASE(SETCC_CARRY)
35277 NODE_NAME_CASE(FSETCC)
35278 NODE_NAME_CASE(FSETCCM)
35279 NODE_NAME_CASE(FSETCCM_SAE)
35280 NODE_NAME_CASE(CMOV)
35281 NODE_NAME_CASE(BRCOND)
35282 NODE_NAME_CASE(RET_GLUE)
35283 NODE_NAME_CASE(IRET)
35284 NODE_NAME_CASE(REP_STOS)
35285 NODE_NAME_CASE(REP_MOVS)
35286 NODE_NAME_CASE(GlobalBaseReg)
35287 NODE_NAME_CASE(Wrapper)
35288 NODE_NAME_CASE(WrapperRIP)
35289 NODE_NAME_CASE(MOVQ2DQ)
35290 NODE_NAME_CASE(MOVDQ2Q)
35291 NODE_NAME_CASE(MMX_MOVD2W)
35292 NODE_NAME_CASE(MMX_MOVW2D)
35293 NODE_NAME_CASE(PEXTRB)
35294 NODE_NAME_CASE(PEXTRW)
35295 NODE_NAME_CASE(INSERTPS)
35296 NODE_NAME_CASE(PINSRB)
35297 NODE_NAME_CASE(PINSRW)
35298 NODE_NAME_CASE(PSHUFB)
35299 NODE_NAME_CASE(ANDNP)
35300 NODE_NAME_CASE(BLENDI)
35301 NODE_NAME_CASE(BLENDV)
35302 NODE_NAME_CASE(HADD)
35303 NODE_NAME_CASE(HSUB)
35304 NODE_NAME_CASE(FHADD)
35305 NODE_NAME_CASE(FHSUB)
35306 NODE_NAME_CASE(CONFLICT)
35307 NODE_NAME_CASE(FMAX)
35308 NODE_NAME_CASE(FMAXS)
35309 NODE_NAME_CASE(FMAX_SAE)
35310 NODE_NAME_CASE(FMAXS_SAE)
35311 NODE_NAME_CASE(FMIN)
35312 NODE_NAME_CASE(FMINS)
35313 NODE_NAME_CASE(FMIN_SAE)
35314 NODE_NAME_CASE(FMINS_SAE)
35315 NODE_NAME_CASE(FMAXC)
35316 NODE_NAME_CASE(FMINC)
35317 NODE_NAME_CASE(FRSQRT)
35318 NODE_NAME_CASE(FRCP)
35319 NODE_NAME_CASE(EXTRQI)
35320 NODE_NAME_CASE(INSERTQI)
35321 NODE_NAME_CASE(TLSADDR)
35322 NODE_NAME_CASE(TLSBASEADDR)
35323 NODE_NAME_CASE(TLSCALL)
35324 NODE_NAME_CASE(EH_SJLJ_SETJMP)
35325 NODE_NAME_CASE(EH_SJLJ_LONGJMP)
35326 NODE_NAME_CASE(EH_SJLJ_SETUP_DISPATCH)
35327 NODE_NAME_CASE(EH_RETURN)
35328 NODE_NAME_CASE(TC_RETURN)
35329 NODE_NAME_CASE(FNSTCW16m)
35330 NODE_NAME_CASE(FLDCW16m)
35331 NODE_NAME_CASE(LCMPXCHG_DAG)
35332 NODE_NAME_CASE(LCMPXCHG8_DAG)
35333 NODE_NAME_CASE(LCMPXCHG16_DAG)
35334 NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG)
35335 NODE_NAME_CASE(LADD)
35336 NODE_NAME_CASE(LSUB)
35337 NODE_NAME_CASE(LOR)
35338 NODE_NAME_CASE(LXOR)
35339 NODE_NAME_CASE(LAND)
35340 NODE_NAME_CASE(LBTS)
35341 NODE_NAME_CASE(LBTC)
35342 NODE_NAME_CASE(LBTR)
35343 NODE_NAME_CASE(LBTS_RM)
35344 NODE_NAME_CASE(LBTC_RM)
35345 NODE_NAME_CASE(LBTR_RM)
35346 NODE_NAME_CASE(AADD)
35347 NODE_NAME_CASE(AOR)
35348 NODE_NAME_CASE(AXOR)
35349 NODE_NAME_CASE(AAND)
35350 NODE_NAME_CASE(VZEXT_MOVL)
35351 NODE_NAME_CASE(VZEXT_LOAD)
35352 NODE_NAME_CASE(VEXTRACT_STORE)
35353 NODE_NAME_CASE(VTRUNC)
35354 NODE_NAME_CASE(VTRUNCS)
35355 NODE_NAME_CASE(VTRUNCUS)
35356 NODE_NAME_CASE(VMTRUNC)
35357 NODE_NAME_CASE(VMTRUNCS)
35358 NODE_NAME_CASE(VMTRUNCUS)
35359 NODE_NAME_CASE(VTRUNCSTORES)
35360 NODE_NAME_CASE(VTRUNCSTOREUS)
35361 NODE_NAME_CASE(VMTRUNCSTORES)
35362 NODE_NAME_CASE(VMTRUNCSTOREUS)
35363 NODE_NAME_CASE(VFPEXT)
35364 NODE_NAME_CASE(STRICT_VFPEXT)
35365 NODE_NAME_CASE(VFPEXT_SAE)
35366 NODE_NAME_CASE(VFPEXTS)
35367 NODE_NAME_CASE(VFPEXTS_SAE)
35368 NODE_NAME_CASE(VFPROUND)
35369 NODE_NAME_CASE(STRICT_VFPROUND)
35370 NODE_NAME_CASE(VMFPROUND)
35371 NODE_NAME_CASE(VFPROUND_RND)
35372 NODE_NAME_CASE(VFPROUNDS)
35373 NODE_NAME_CASE(VFPROUNDS_RND)
35374 NODE_NAME_CASE(VSHLDQ)
35375 NODE_NAME_CASE(VSRLDQ)
35376 NODE_NAME_CASE(VSHL)
35377 NODE_NAME_CASE(VSRL)
35378 NODE_NAME_CASE(VSRA)
35379 NODE_NAME_CASE(VSHLI)
35380 NODE_NAME_CASE(VSRLI)
35381 NODE_NAME_CASE(VSRAI)
35382 NODE_NAME_CASE(VSHLV)
35383 NODE_NAME_CASE(VSRLV)
35384 NODE_NAME_CASE(VSRAV)
35385 NODE_NAME_CASE(VROTLI)
35386 NODE_NAME_CASE(VROTRI)
35387 NODE_NAME_CASE(VPPERM)
35388 NODE_NAME_CASE(CMPP)
35389 NODE_NAME_CASE(STRICT_CMPP)
35390 NODE_NAME_CASE(PCMPEQ)
35391 NODE_NAME_CASE(PCMPGT)
35392 NODE_NAME_CASE(PHMINPOS)
35393 NODE_NAME_CASE(ADD)
35394 NODE_NAME_CASE(SUB)
35395 NODE_NAME_CASE(ADC)
35396 NODE_NAME_CASE(SBB)
35397 NODE_NAME_CASE(SMUL)
35398 NODE_NAME_CASE(UMUL)
35399 NODE_NAME_CASE(OR)
35400 NODE_NAME_CASE(XOR)
35401 NODE_NAME_CASE(AND)
35402 NODE_NAME_CASE(BEXTR)
35403 NODE_NAME_CASE(BEXTRI)
35404 NODE_NAME_CASE(BZHI)
35405 NODE_NAME_CASE(PDEP)
35406 NODE_NAME_CASE(PEXT)
35407 NODE_NAME_CASE(MUL_IMM)
35408 NODE_NAME_CASE(MOVMSK)
35409 NODE_NAME_CASE(PTEST)
35410 NODE_NAME_CASE(TESTP)
35411 NODE_NAME_CASE(KORTEST)
35412 NODE_NAME_CASE(KTEST)
35413 NODE_NAME_CASE(KADD)
35414 NODE_NAME_CASE(KSHIFTL)
35415 NODE_NAME_CASE(KSHIFTR)
35416 NODE_NAME_CASE(PACKSS)
35417 NODE_NAME_CASE(PACKUS)
35418 NODE_NAME_CASE(PALIGNR)
35419 NODE_NAME_CASE(VALIGN)
35420 NODE_NAME_CASE(VSHLD)
35421 NODE_NAME_CASE(VSHRD)
35422 NODE_NAME_CASE(VSHLDV)
35423 NODE_NAME_CASE(VSHRDV)
35424 NODE_NAME_CASE(PSHUFD)
35425 NODE_NAME_CASE(PSHUFHW)
35426 NODE_NAME_CASE(PSHUFLW)
35427 NODE_NAME_CASE(SHUFP)
35428 NODE_NAME_CASE(SHUF128)
35429 NODE_NAME_CASE(MOVLHPS)
35430 NODE_NAME_CASE(MOVHLPS)
35431 NODE_NAME_CASE(MOVDDUP)
35432 NODE_NAME_CASE(MOVSHDUP)
35433 NODE_NAME_CASE(MOVSLDUP)
35434 NODE_NAME_CASE(MOVSD)
35435 NODE_NAME_CASE(MOVSS)
35436 NODE_NAME_CASE(MOVSH)
35437 NODE_NAME_CASE(UNPCKL)
35438 NODE_NAME_CASE(UNPCKH)
35439 NODE_NAME_CASE(VBROADCAST)
35440 NODE_NAME_CASE(VBROADCAST_LOAD)
35441 NODE_NAME_CASE(VBROADCASTM)
35442 NODE_NAME_CASE(SUBV_BROADCAST_LOAD)
35443 NODE_NAME_CASE(VPERMILPV)
35444 NODE_NAME_CASE(VPERMILPI)
35445 NODE_NAME_CASE(VPERM2X128)
35446 NODE_NAME_CASE(VPERMV)
35447 NODE_NAME_CASE(VPERMV3)
35448 NODE_NAME_CASE(VPERMI)
35449 NODE_NAME_CASE(VPTERNLOG)
35450 NODE_NAME_CASE(VFIXUPIMM)
35451 NODE_NAME_CASE(VFIXUPIMM_SAE)
35452 NODE_NAME_CASE(VFIXUPIMMS)
35453 NODE_NAME_CASE(VFIXUPIMMS_SAE)
35454 NODE_NAME_CASE(VRANGE)
35455 NODE_NAME_CASE(VRANGE_SAE)
35456 NODE_NAME_CASE(VRANGES)
35457 NODE_NAME_CASE(VRANGES_SAE)
35458 NODE_NAME_CASE(PMULUDQ)
35459 NODE_NAME_CASE(PMULDQ)
35460 NODE_NAME_CASE(PSADBW)
35461 NODE_NAME_CASE(DBPSADBW)
35462 NODE_NAME_CASE(VASTART_SAVE_XMM_REGS)
35463 NODE_NAME_CASE(VAARG_64)
35464 NODE_NAME_CASE(VAARG_X32)
35465 NODE_NAME_CASE(DYN_ALLOCA)
35466 NODE_NAME_CASE(MFENCE)
35467 NODE_NAME_CASE(SEG_ALLOCA)
35468 NODE_NAME_CASE(PROBED_ALLOCA)
35469 NODE_NAME_CASE(RDRAND)
35470 NODE_NAME_CASE(RDSEED)
35471 NODE_NAME_CASE(RDPKRU)
35472 NODE_NAME_CASE(WRPKRU)
35473 NODE_NAME_CASE(VPMADDUBSW)
35474 NODE_NAME_CASE(VPMADDWD)
35475 NODE_NAME_CASE(VPSHA)
35476 NODE_NAME_CASE(VPSHL)
35477 NODE_NAME_CASE(VPCOM)
35478 NODE_NAME_CASE(VPCOMU)
35479 NODE_NAME_CASE(VPERMIL2)
35480 NODE_NAME_CASE(FMSUB)
35481 NODE_NAME_CASE(STRICT_FMSUB)
35482 NODE_NAME_CASE(FNMADD)
35483 NODE_NAME_CASE(STRICT_FNMADD)
35484 NODE_NAME_CASE(FNMSUB)
35485 NODE_NAME_CASE(STRICT_FNMSUB)
35486 NODE_NAME_CASE(FMADDSUB)
35487 NODE_NAME_CASE(FMSUBADD)
35488 NODE_NAME_CASE(FMADD_RND)
35489 NODE_NAME_CASE(FNMADD_RND)
35490 NODE_NAME_CASE(FMSUB_RND)
35491 NODE_NAME_CASE(FNMSUB_RND)
35492 NODE_NAME_CASE(FMADDSUB_RND)
35493 NODE_NAME_CASE(FMSUBADD_RND)
35494 NODE_NAME_CASE(VFMADDC)
35495 NODE_NAME_CASE(VFMADDC_RND)
35496 NODE_NAME_CASE(VFCMADDC)
35497 NODE_NAME_CASE(VFCMADDC_RND)
35498 NODE_NAME_CASE(VFMULC)
35499 NODE_NAME_CASE(VFMULC_RND)
35500 NODE_NAME_CASE(VFCMULC)
35501 NODE_NAME_CASE(VFCMULC_RND)
35502 NODE_NAME_CASE(VFMULCSH)
35503 NODE_NAME_CASE(VFMULCSH_RND)
35504 NODE_NAME_CASE(VFCMULCSH)
35505 NODE_NAME_CASE(VFCMULCSH_RND)
35506 NODE_NAME_CASE(VFMADDCSH)
35507 NODE_NAME_CASE(VFMADDCSH_RND)
35508 NODE_NAME_CASE(VFCMADDCSH)
35509 NODE_NAME_CASE(VFCMADDCSH_RND)
35510 NODE_NAME_CASE(VPMADD52H)
35511 NODE_NAME_CASE(VPMADD52L)
35512 NODE_NAME_CASE(VRNDSCALE)
35513 NODE_NAME_CASE(STRICT_VRNDSCALE)
35514 NODE_NAME_CASE(VRNDSCALE_SAE)
35515 NODE_NAME_CASE(VRNDSCALES)
35516 NODE_NAME_CASE(VRNDSCALES_SAE)
35517 NODE_NAME_CASE(VREDUCE)
35518 NODE_NAME_CASE(VREDUCE_SAE)
35519 NODE_NAME_CASE(VREDUCES)
35520 NODE_NAME_CASE(VREDUCES_SAE)
35521 NODE_NAME_CASE(VGETMANT)
35522 NODE_NAME_CASE(VGETMANT_SAE)
35523 NODE_NAME_CASE(VGETMANTS)
35524 NODE_NAME_CASE(VGETMANTS_SAE)
35525 NODE_NAME_CASE(PCMPESTR)
35526 NODE_NAME_CASE(PCMPISTR)
35527 NODE_NAME_CASE(XTEST)
35528 NODE_NAME_CASE(COMPRESS)
35529 NODE_NAME_CASE(EXPAND)
35530 NODE_NAME_CASE(SELECTS)
35531 NODE_NAME_CASE(ADDSUB)
35532 NODE_NAME_CASE(RCP14)
35533 NODE_NAME_CASE(RCP14S)
35534 NODE_NAME_CASE(RCP28)
35535 NODE_NAME_CASE(RCP28_SAE)
35536 NODE_NAME_CASE(RCP28S)
35537 NODE_NAME_CASE(RCP28S_SAE)
35538 NODE_NAME_CASE(EXP2)
35539 NODE_NAME_CASE(EXP2_SAE)
35540 NODE_NAME_CASE(RSQRT14)
35541 NODE_NAME_CASE(RSQRT14S)
35542 NODE_NAME_CASE(RSQRT28)
35543 NODE_NAME_CASE(RSQRT28_SAE)
35544 NODE_NAME_CASE(RSQRT28S)
35545 NODE_NAME_CASE(RSQRT28S_SAE)
35546 NODE_NAME_CASE(FADD_RND)
35547 NODE_NAME_CASE(FADDS)
35548 NODE_NAME_CASE(FADDS_RND)
35549 NODE_NAME_CASE(FSUB_RND)
35550 NODE_NAME_CASE(FSUBS)
35551 NODE_NAME_CASE(FSUBS_RND)
35552 NODE_NAME_CASE(FMUL_RND)
35553 NODE_NAME_CASE(FMULS)
35554 NODE_NAME_CASE(FMULS_RND)
35555 NODE_NAME_CASE(FDIV_RND)
35556 NODE_NAME_CASE(FDIVS)
35557 NODE_NAME_CASE(FDIVS_RND)
35558 NODE_NAME_CASE(FSQRT_RND)
35559 NODE_NAME_CASE(FSQRTS)
35560 NODE_NAME_CASE(FSQRTS_RND)
35561 NODE_NAME_CASE(FGETEXP)
35562 NODE_NAME_CASE(FGETEXP_SAE)
35563 NODE_NAME_CASE(FGETEXPS)
35564 NODE_NAME_CASE(FGETEXPS_SAE)
35565 NODE_NAME_CASE(SCALEF)
35566 NODE_NAME_CASE(SCALEF_RND)
35567 NODE_NAME_CASE(SCALEFS)
35568 NODE_NAME_CASE(SCALEFS_RND)
35569 NODE_NAME_CASE(MULHRS)
35570 NODE_NAME_CASE(SINT_TO_FP_RND)
35571 NODE_NAME_CASE(UINT_TO_FP_RND)
35572 NODE_NAME_CASE(CVTTP2SI)
35573 NODE_NAME_CASE(CVTTP2UI)
35574 NODE_NAME_CASE(STRICT_CVTTP2SI)
35575 NODE_NAME_CASE(STRICT_CVTTP2UI)
35576 NODE_NAME_CASE(MCVTTP2SI)
35577 NODE_NAME_CASE(MCVTTP2UI)
35578 NODE_NAME_CASE(CVTTP2SI_SAE)
35579 NODE_NAME_CASE(CVTTP2UI_SAE)
35580 NODE_NAME_CASE(CVTTS2SI)
35581 NODE_NAME_CASE(CVTTS2UI)
35582 NODE_NAME_CASE(CVTTS2SI_SAE)
35583 NODE_NAME_CASE(CVTTS2UI_SAE)
35584 NODE_NAME_CASE(CVTSI2P)
35585 NODE_NAME_CASE(CVTUI2P)
35586 NODE_NAME_CASE(STRICT_CVTSI2P)
35587 NODE_NAME_CASE(STRICT_CVTUI2P)
35588 NODE_NAME_CASE(MCVTSI2P)
35589 NODE_NAME_CASE(MCVTUI2P)
35590 NODE_NAME_CASE(VFPCLASS)
35591 NODE_NAME_CASE(VFPCLASSS)
35592 NODE_NAME_CASE(MULTISHIFT)
35593 NODE_NAME_CASE(SCALAR_SINT_TO_FP)
35594 NODE_NAME_CASE(SCALAR_SINT_TO_FP_RND)
35595 NODE_NAME_CASE(SCALAR_UINT_TO_FP)
35596 NODE_NAME_CASE(SCALAR_UINT_TO_FP_RND)
35597 NODE_NAME_CASE(CVTPS2PH)
35598 NODE_NAME_CASE(STRICT_CVTPS2PH)
35599 NODE_NAME_CASE(CVTPS2PH_SAE)
35600 NODE_NAME_CASE(MCVTPS2PH)
35601 NODE_NAME_CASE(MCVTPS2PH_SAE)
35602 NODE_NAME_CASE(CVTPH2PS)
35603 NODE_NAME_CASE(STRICT_CVTPH2PS)
35604 NODE_NAME_CASE(CVTPH2PS_SAE)
35605 NODE_NAME_CASE(CVTP2SI)
35606 NODE_NAME_CASE(CVTP2UI)
35607 NODE_NAME_CASE(MCVTP2SI)
35608 NODE_NAME_CASE(MCVTP2UI)
35609 NODE_NAME_CASE(CVTP2SI_RND)
35610 NODE_NAME_CASE(CVTP2UI_RND)
35611 NODE_NAME_CASE(CVTS2SI)
35612 NODE_NAME_CASE(CVTS2UI)
35613 NODE_NAME_CASE(CVTS2SI_RND)
35614 NODE_NAME_CASE(CVTS2UI_RND)
35615 NODE_NAME_CASE(CVTNE2PS2BF16)
35616 NODE_NAME_CASE(CVTNEPS2BF16)
35617 NODE_NAME_CASE(MCVTNEPS2BF16)
35618 NODE_NAME_CASE(DPBF16PS)
35619 NODE_NAME_CASE(LWPINS)
35620 NODE_NAME_CASE(MGATHER)
35621 NODE_NAME_CASE(MSCATTER)
35622 NODE_NAME_CASE(VPDPBUSD)
35623 NODE_NAME_CASE(VPDPBUSDS)
35624 NODE_NAME_CASE(VPDPWSSD)
35625 NODE_NAME_CASE(VPDPWSSDS)
35626 NODE_NAME_CASE(VPSHUFBITQMB)
35627 NODE_NAME_CASE(GF2P8MULB)
35628 NODE_NAME_CASE(GF2P8AFFINEQB)
35629 NODE_NAME_CASE(GF2P8AFFINEINVQB)
35630 NODE_NAME_CASE(NT_CALL)
35631 NODE_NAME_CASE(NT_BRIND)
35632 NODE_NAME_CASE(UMWAIT)
35633 NODE_NAME_CASE(TPAUSE)
35634 NODE_NAME_CASE(ENQCMD)
35635 NODE_NAME_CASE(ENQCMDS)
35636 NODE_NAME_CASE(VP2INTERSECT)
35637 NODE_NAME_CASE(VPDPBSUD)
35638 NODE_NAME_CASE(VPDPBSUDS)
35639 NODE_NAME_CASE(VPDPBUUD)
35640 NODE_NAME_CASE(VPDPBUUDS)
35641 NODE_NAME_CASE(VPDPBSSD)
35642 NODE_NAME_CASE(VPDPBSSDS)
35643 NODE_NAME_CASE(AESENC128KL)
35644 NODE_NAME_CASE(AESDEC128KL)
35645 NODE_NAME_CASE(AESENC256KL)
35646 NODE_NAME_CASE(AESDEC256KL)
35647 NODE_NAME_CASE(AESENCWIDE128KL)
35648 NODE_NAME_CASE(AESDECWIDE128KL)
35649 NODE_NAME_CASE(AESENCWIDE256KL)
35650 NODE_NAME_CASE(AESDECWIDE256KL)
35651 NODE_NAME_CASE(CMPCCXADD)
35652 NODE_NAME_CASE(TESTUI)
35653 NODE_NAME_CASE(FP80_ADD)
35654 NODE_NAME_CASE(STRICT_FP80_ADD)
35655 }
35656 return nullptr;
35657#undef NODE_NAME_CASE
35658}
35659
35660/// Return true if the addressing mode represented by AM is legal for this
35661/// target, for a load/store of the specified type.
35662bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
35663 const AddrMode &AM, Type *Ty,
35664 unsigned AS,
35665 Instruction *I) const {
35666 // X86 supports extremely general addressing modes.
35667 CodeModel::Model M = getTargetMachine().getCodeModel();
35668
35669 // X86 allows a sign-extended 32-bit immediate field as a displacement.
35670 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
35671 return false;
35672
35673 if (AM.BaseGV) {
35674 unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
35675
35676 // If a reference to this global requires an extra load, we can't fold it.
35677 if (isGlobalStubReference(GVFlags))
35678 return false;
35679
35680 // If BaseGV requires a register for the PIC base, we cannot also have a
35681 // BaseReg specified.
35682 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
35683 return false;
35684
35685 // If lower 4G is not available, then we must use rip-relative addressing.
35686 if ((M != CodeModel::Small || isPositionIndependent()) &&
35687 Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
35688 return false;
35689 }
35690
35691 switch (AM.Scale) {
35692 case 0:
35693 case 1:
35694 case 2:
35695 case 4:
35696 case 8:
35697 // These scales always work.
35698 break;
35699 case 3:
35700 case 5:
35701 case 9:
35702 // These scales are formed with basereg+scalereg. Only accept if there is
35703 // no basereg yet.
35704 if (AM.HasBaseReg)
35705 return false;
35706 break;
35707 default: // Other stuff never works.
35708 return false;
35709 }
35710
35711 return true;
35712}
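The scale switch above follows what an x86 SIB byte can encode (hedged summary): hardware scales are 1, 2, 4 and 8, while 3, 5 and 9 are only reachable by reusing the scaled register as the base, e.g. x*9 == x + x*8, which is why those scales are rejected once a base register is already present. Illustrative one-liner, not from this file:

  constexpr unsigned long long MulBy9(unsigned long long X) { return X + X * 8; } // LEA-style base + index*8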
35713
35714bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
35715 unsigned Bits = Ty->getScalarSizeInBits();
35716
35717 // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts.
35718 // Splitting for v32i8/v16i16 on XOP+AVX2 targets is still preferred.
35719 if (Subtarget.hasXOP() &&
35720 (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64))
35721 return false;
35722
35723 // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
35724 // shifts just as cheap as scalar ones.
35725 if (Subtarget.hasAVX2() && (Bits == 32 || Bits == 64))
35726 return false;
35727
35728 // AVX512BW has shifts such as vpsllvw.
35729 if (Subtarget.hasBWI() && Bits == 16)
35730 return false;
35731
35732 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
35733 // fully general vector.
35734 return true;
35735}
35736
35737bool X86TargetLowering::isBinOp(unsigned Opcode) const {
35738 switch (Opcode) {
35739 // These are non-commutative binops.
35740 // TODO: Add more X86ISD opcodes once we have test coverage.
35741 case X86ISD::ANDNP:
35742 case X86ISD::PCMPGT:
35743 case X86ISD::FMAX:
35744 case X86ISD::FMIN:
35745 case X86ISD::FANDN:
35746 case X86ISD::VPSHA:
35747 case X86ISD::VPSHL:
35748 case X86ISD::VSHLV:
35749 case X86ISD::VSRLV:
35750 case X86ISD::VSRAV:
35751 return true;
35752 }
35753
35754 return TargetLoweringBase::isBinOp(Opcode);
35755}
35756
35757bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const {
35758 switch (Opcode) {
35759 // TODO: Add more X86ISD opcodes once we have test coverage.
35760 case X86ISD::PCMPEQ:
35761 case X86ISD::PMULDQ:
35762 case X86ISD::PMULUDQ:
35763 case X86ISD::FMAXC:
35764 case X86ISD::FMINC:
35765 case X86ISD::FAND:
35766 case X86ISD::FOR:
35767 case X86ISD::FXOR:
35768 return true;
35769 }
35770
35771 return TargetLoweringBase::isCommutativeBinOp(Opcode);
35772}
35773
35774bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
35775 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
35776 return false;
35777 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
35778 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
35779 return NumBits1 > NumBits2;
35780}
35781
35782bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
35783 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
35784 return false;
35785
35786 if (!isTypeLegal(EVT::getEVT(Ty1)))
35787 return false;
35788
35789 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
35790
35791 // Assuming the caller doesn't have a zeroext or signext return parameter,
35792 // truncation all the way down to i1 is valid.
35793 return true;
35794}
35795
35796bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
35797 return isInt<32>(Imm);
35798}
35799
35800bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
35801 // Can also use sub to handle negated immediates.
35802 return isInt<32>(Imm);
35803}
35804
35805bool X86TargetLowering::isLegalStoreImmediate(int64_t Imm) const {
35806 return isInt<32>(Imm);
35807}
35808
35809bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
35810 if (!VT1.isScalarInteger() || !VT2.isScalarInteger())
35811 return false;
35812 unsigned NumBits1 = VT1.getSizeInBits();
35813 unsigned NumBits2 = VT2.getSizeInBits();
35814 return NumBits1 > NumBits2;
35815}
35816
35817bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
35818 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
35819 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
35820}
35821
35822bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
35823 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
35824 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
35825}
35826
35827bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
35828 EVT VT1 = Val.getValueType();
35829 if (isZExtFree(VT1, VT2))
35830 return true;
35831
35832 if (Val.getOpcode() != ISD::LOAD)
35833 return false;
35834
35835 if (!VT1.isSimple() || !VT1.isInteger() ||
35836 !VT2.isSimple() || !VT2.isInteger())
35837 return false;
35838
35839 switch (VT1.getSimpleVT().SimpleTy) {
35840 default: break;
35841 case MVT::i8:
35842 case MVT::i16:
35843 case MVT::i32:
35844 // X86 has 8, 16, and 32-bit zero-extending loads.
35845 return true;
35846 }
35847
35848 return false;
35849}
35850
35851bool X86TargetLowering::shouldSinkOperands(Instruction *I,
35852 SmallVectorImpl<Use *> &Ops) const {
35853 using namespace llvm::PatternMatch;
35854
35855 FixedVectorType *VTy = dyn_cast<FixedVectorType>(I->getType());
35856 if (!VTy)
35857 return false;
35858
35859 if (I->getOpcode() == Instruction::Mul &&
35860 VTy->getElementType()->isIntegerTy(64)) {
35861 for (auto &Op : I->operands()) {
35862 // Make sure we are not already sinking this operand
35863 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
35864 continue;
35865
35866 // Look for PMULDQ pattern where the input is a sext_inreg from vXi32 or
35867 // the PMULUDQ pattern where the input is a zext_inreg from vXi32.
35868 if (Subtarget.hasSSE41() &&
35869 match(Op.get(), m_AShr(m_Shl(m_Value(), m_SpecificInt(32)),
35870 m_SpecificInt(32)))) {
35871 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
35872 Ops.push_back(&Op);
35873 } else if (Subtarget.hasSSE2() &&
35874 match(Op.get(),
35875 m_And(m_Value(), m_SpecificInt(UINT64_C(0xffffffff))))) {
35876 Ops.push_back(&Op);
35877 }
35878 }
35879
35880 return !Ops.empty();
35881 }
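Per 64-bit lane, the two match() patterns above correspond to the scalar identities sketched below (illustrative names, not from this file); recognizing them is what lets the 64-bit multiply select PMULDQ or PMULUDQ:

  #include <cstdint>

  // m_AShr(m_Shl(x, 32), 32): keep the low 32 bits, sign-extended.
  int64_t SextInReg32(int64_t X) { return (int64_t)(int32_t)(uint32_t)X; }
  // m_And(x, 0xffffffff): keep the low 32 bits, zero-extended.
  uint64_t ZextInReg32(uint64_t X) { return X & 0xffffffffULL; }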
35882
35883 // A uniform shift amount in a vector shift or funnel shift may be much
35884 // cheaper than a generic variable vector shift, so make that pattern visible
35885 // to SDAG by sinking the shuffle instruction next to the shift.
35886 int ShiftAmountOpNum = -1;
35887 if (I->isShift())
35888 ShiftAmountOpNum = 1;
35889 else if (auto *II = dyn_cast<IntrinsicInst>(I)) {
35890 if (II->getIntrinsicID() == Intrinsic::fshl ||
35891 II->getIntrinsicID() == Intrinsic::fshr)
35892 ShiftAmountOpNum = 2;
35893 }
35894
35895 if (ShiftAmountOpNum == -1)
35896 return false;
35897
35898 auto *Shuf = dyn_cast<ShuffleVectorInst>(I->getOperand(ShiftAmountOpNum));
35899 if (Shuf && getSplatIndex(Shuf->getShuffleMask()) >= 0 &&
35900 isVectorShiftByScalarCheap(I->getType())) {
35901 Ops.push_back(&I->getOperandUse(ShiftAmountOpNum));
35902 return true;
35903 }
35904
35905 return false;
35906}
35907
35908bool X86TargetLowering::shouldConvertPhiType(Type *From, Type *To) const {
35909 if (!Subtarget.is64Bit())
35910 return false;
35911 return TargetLowering::shouldConvertPhiType(From, To);
35912}
35913
35914bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
35915 if (isa<MaskedLoadSDNode>(ExtVal.getOperand(0)))
35916 return false;
35917
35918 EVT SrcVT = ExtVal.getOperand(0).getValueType();
35919
35920 // There is no extending load for vXi1.
35921 if (SrcVT.getScalarType() == MVT::i1)
35922 return false;
35923
35924 return true;
35925}
35926
35927bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
35928 EVT VT) const {
35929 if (!Subtarget.hasAnyFMA())
35930 return false;
35931
35932 VT = VT.getScalarType();
35933
35934 if (!VT.isSimple())
35935 return false;
35936
35937 switch (VT.getSimpleVT().SimpleTy) {
35938 case MVT::f16:
35939 return Subtarget.hasFP16();
35940 case MVT::f32:
35941 case MVT::f64:
35942 return true;
35943 default:
35944 break;
35945 }
35946
35947 return false;
35948}
35949
35950bool X86TargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
35951 // i16 instructions are longer (0x66 prefix) and potentially slower.
35952 return !(SrcVT == MVT::i32 && DestVT == MVT::i16);
35953}
35954
35955bool X86TargetLowering::shouldFoldSelectWithIdentityConstant(unsigned Opcode,
35956 EVT VT) const {
35957 // TODO: This is too general. There are cases where pre-AVX512 codegen would
35958 // benefit. The transform may also be profitable for scalar code.
35959 if (!Subtarget.hasAVX512())
35960 return false;
35961 if (!Subtarget.hasVLX() && !VT.is512BitVector())
35962 return false;
35963 if (!VT.isVector() || VT.getScalarType() == MVT::i1)
35964 return false;
35965
35966 return true;
35967}
35968
35969/// Targets can use this to indicate that they only support *some*
35970/// VECTOR_SHUFFLE operations, those with specific masks.
35971/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
35972/// are assumed to be legal.
35973bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const {
35974 if (!VT.isSimple())
35975 return false;
35976
35977 // Not for i1 vectors
35978 if (VT.getSimpleVT().getScalarType() == MVT::i1)
35979 return false;
35980
35981 // Very little shuffling can be done for 64-bit vectors right now.
35982 if (VT.getSimpleVT().getSizeInBits() == 64)
35983 return false;
35984
35985 // We only care that the types being shuffled are legal. The lowering can
35986 // handle any possible shuffle mask that results.
35987 return isTypeLegal(VT.getSimpleVT());
35988}
35989
35990bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef<int> Mask,
35991 EVT VT) const {
35992 // Don't convert an 'and' into a shuffle that we don't directly support.
35993 // vpblendw and vpshufb for 256-bit vectors are not available on AVX1.
35994 if (!Subtarget.hasAVX2())
35995 if (VT == MVT::v32i8 || VT == MVT::v16i16)
35996 return false;
35997
35998 // Just delegate to the generic legality, clear masks aren't special.
35999 return isShuffleMaskLegal(Mask, VT);
36000}
36001
36002bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
36003 // If the subtarget is using thunks, we need to not generate jump tables.
36004 if (Subtarget.useIndirectThunkBranches())
36005 return false;
36006
36007 // Otherwise, fallback on the generic logic.
36008 return TargetLowering::areJTsAllowed(Fn);
36009}
36010
36011MVT X86TargetLowering::getPreferredSwitchConditionType(LLVMContext &Context,
36012 EVT ConditionVT) const {
36013 // Avoid 8 and 16 bit types because they increase the chance for unnecessary
36014 // zero-extensions.
36015 if (ConditionVT.getSizeInBits() < 32)
36016 return MVT::i32;
36017 return TargetLoweringBase::getPreferredSwitchConditionType(Context,
36018 ConditionVT);
36019}
36020
36021//===----------------------------------------------------------------------===//
36022// X86 Scheduler Hooks
36023//===----------------------------------------------------------------------===//
36024
36025// Returns true if EFLAGS is consumed after this iterator in the rest of the
36026// basic block or any successors of the basic block.
36027static bool isEFLAGSLiveAfter(MachineBasicBlock::iterator Itr,
36028 MachineBasicBlock *BB) {
36029 // Scan forward through BB for a use/def of EFLAGS.
36030 for (const MachineInstr &mi : llvm::make_range(std::next(Itr), BB->end())) {
36031 if (mi.readsRegister(X86::EFLAGS))
36032 return true;
36033 // If we found a def, we can stop searching.
36034 if (mi.definesRegister(X86::EFLAGS))
36035 return false;
36036 }
36037
36038 // If we hit the end of the block, check whether EFLAGS is live into a
36039 // successor.
36040 for (MachineBasicBlock *Succ : BB->successors())
36041 if (Succ->isLiveIn(X86::EFLAGS))
36042 return true;
36043
36044 return false;
36045}
36046
36047/// Utility function to emit xbegin specifying the start of an RTM region.
36048static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
36049 const TargetInstrInfo *TII) {
36050 const DebugLoc &DL = MI.getDebugLoc();
36051
36052 const BasicBlock *BB = MBB->getBasicBlock();
36053 MachineFunction::iterator I = ++MBB->getIterator();
36054
36055 // For the v = xbegin(), we generate
36056 //
36057 // thisMBB:
36058 // xbegin fallMBB
36059 //
36060 // mainMBB:
36061 // s0 = -1
36062 //
36063 // fallBB:
36064 // eax = # XABORT_DEF
36065 // s1 = eax
36066 //
36067 // sinkMBB:
36068 // v = phi(s0/mainBB, s1/fallBB)
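 //
 // As a rough C-level sketch (using the <immintrin.h> RTM intrinsics, which
 // are not part of this file), the generated code behaves like:
 //
 //   unsigned v = _xbegin();       // ~0u (_XBEGIN_STARTED) if the transaction
 //   if (v == _XBEGIN_STARTED) {   // started, otherwise the abort status
 //     ...                         // transactional region
 //   }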
36069
36070 MachineBasicBlock *thisMBB = MBB;
36071 MachineFunction *MF = MBB->getParent();
36072 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
36073 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
36074 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
36075 MF->insert(I, mainMBB);
36076 MF->insert(I, fallMBB);
36077 MF->insert(I, sinkMBB);
36078
36079 if (isEFLAGSLiveAfter(MI, MBB)) {
36080 mainMBB->addLiveIn(X86::EFLAGS);
36081 fallMBB->addLiveIn(X86::EFLAGS);
36082 sinkMBB->addLiveIn(X86::EFLAGS);
36083 }
36084
36085 // Transfer the remainder of BB and its successor edges to sinkMBB.
36086 sinkMBB->splice(sinkMBB->begin(), MBB,
36087 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
36088 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
36089
36090 MachineRegisterInfo &MRI = MF->getRegInfo();
36091 Register DstReg = MI.getOperand(0).getReg();
36092 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
36093 Register mainDstReg = MRI.createVirtualRegister(RC);
36094 Register fallDstReg = MRI.createVirtualRegister(RC);
36095
36096 // thisMBB:
36097 // xbegin fallMBB
36098 // # fallthrough to mainMBB
36099 // # on abort, branch to fallMBB
36100 BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
36101 thisMBB->addSuccessor(mainMBB);
36102 thisMBB->addSuccessor(fallMBB);
36103
36104 // mainMBB:
36105 // mainDstReg := -1
36106 BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
36107 BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
36108 mainMBB->addSuccessor(sinkMBB);
36109
36110 // fallMBB:
36111 // ; pseudo instruction to model hardware's definition from XABORT
36112 // EAX := XABORT_DEF
36113 // fallDstReg := EAX
36114 BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF));
36115 BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg)
36116 .addReg(X86::EAX);
36117 fallMBB->addSuccessor(sinkMBB);
36118
36119 // sinkMBB:
36120 // DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
36121 BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg)
36122 .addReg(mainDstReg).addMBB(mainMBB)
36123 .addReg(fallDstReg).addMBB(fallMBB);
36124
36125 MI.eraseFromParent();
36126 return sinkMBB;
36127}
36128
36129MachineBasicBlock *
36130X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
36131 MachineBasicBlock *MBB) const {
36132 // Emit va_arg instruction on X86-64.
36133
36134 // Operands to this pseudo-instruction:
36135 // 0 ) Output : destination address (reg)
36136 // 1-5) Input : va_list address (addr, i64mem)
36137 // 6 ) ArgSize : Size (in bytes) of vararg type
36138 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
36139 // 8 ) Align : Alignment of type
36140 // 9 ) EFLAGS (implicit-def)
36141
36142 assert(MI.getNumOperands() == 10 && "VAARG should have 10 operands!");
36143 static_assert(X86::AddrNumOperands == 5, "VAARG assumes 5 address operands");
36144
36145 Register DestReg = MI.getOperand(0).getReg();
36146 MachineOperand &Base = MI.getOperand(1);
36147 MachineOperand &Scale = MI.getOperand(2);
36148 MachineOperand &Index = MI.getOperand(3);
36149 MachineOperand &Disp = MI.getOperand(4);
36150 MachineOperand &Segment = MI.getOperand(5);
36151 unsigned ArgSize = MI.getOperand(6).getImm();
36152 unsigned ArgMode = MI.getOperand(7).getImm();
36153 Align Alignment = Align(MI.getOperand(8).getImm());
36154
36155 MachineFunction *MF = MBB->getParent();
36156
36157 // Memory Reference
36158 assert(MI.hasOneMemOperand() && "Expected VAARG to have one memoperand");
36159
36160 MachineMemOperand *OldMMO = MI.memoperands().front();
36161
36162 // Clone the MMO into two separate MMOs for loading and storing
36163 MachineMemOperand *LoadOnlyMMO = MF->getMachineMemOperand(
36164 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOStore);
36165 MachineMemOperand *StoreOnlyMMO = MF->getMachineMemOperand(
36166 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOLoad);
36167
36168 // Machine Information
36169 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36170 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
36171 const TargetRegisterClass *AddrRegClass =
36172 getRegClassFor(getPointerTy(MBB->getParent()->getDataLayout()));
36173 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
36174 const DebugLoc &DL = MI.getDebugLoc();
36175
36176 // struct va_list {
36177 // i32 gp_offset
36178 // i32 fp_offset
36179 // i64 overflow_area (address)
36180 // i64 reg_save_area (address)
36181 // }
36182 // sizeof(va_list) = 24
36183 // alignment(va_list) = 8
36184
36185 unsigned TotalNumIntRegs = 6;
36186 unsigned TotalNumXMMRegs = 8;
36187 bool UseGPOffset = (ArgMode == 1);
36188 bool UseFPOffset = (ArgMode == 2);
36189 unsigned MaxOffset = TotalNumIntRegs * 8 +
36190 (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
36191
36192 /* Align ArgSize to a multiple of 8 */
36193 unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
36194 bool NeedsAlign = (Alignment > 8);
36195
36196 MachineBasicBlock *thisMBB = MBB;
36197 MachineBasicBlock *overflowMBB;
36198 MachineBasicBlock *offsetMBB;
36199 MachineBasicBlock *endMBB;
36200
36201 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
36202 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
36203 unsigned OffsetReg = 0;
36204
36205 if (!UseGPOffset && !UseFPOffset) {
36206 // If we only pull from the overflow region, we don't create a branch.
36207 // We don't need to alter control flow.
36208 OffsetDestReg = 0; // unused
36209 OverflowDestReg = DestReg;
36210
36211 offsetMBB = nullptr;
36212 overflowMBB = thisMBB;
36213 endMBB = thisMBB;
36214 } else {
36215 // First emit code to check if gp_offset (or fp_offset) is below the bound.
36216 // If so, pull the argument from reg_save_area. (branch to offsetMBB)
36217 // If not, pull from overflow_area. (branch to overflowMBB)
36218 //
36219 // thisMBB
36220 // | .
36221 // | .
36222 // offsetMBB overflowMBB
36223 // | .
36224 // | .
36225 // endMBB
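 //
 // As a C-level sketch of the logic emitted below (ap, addr and align_up are
 // hypothetical names; the real code works on machine operands directly):
 //
 //   unsigned off = UseFPOffset ? ap->fp_offset : ap->gp_offset;
 //   if (off < MaxOffset + 8 - ArgSizeA8) {                 // offsetMBB
 //     addr = ap->reg_save_area + off;
 //     off += UseFPOffset ? 16 : 8;
 //     if (UseFPOffset) ap->fp_offset = off; else ap->gp_offset = off;
 //   } else {                                               // overflowMBB
 //     addr = align_up(ap->overflow_arg_area, Alignment);   // only if NeedsAlign
 //     ap->overflow_arg_area = addr + ArgSizeA8;
 //   }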
36226
36227 // Registers for the PHI in endMBB
36228 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
36229 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
36230
36231 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
36232 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36233 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36234 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36235
36236 MachineFunction::iterator MBBIter = ++MBB->getIterator();
36237
36238 // Insert the new basic blocks
36239 MF->insert(MBBIter, offsetMBB);
36240 MF->insert(MBBIter, overflowMBB);
36241 MF->insert(MBBIter, endMBB);
36242
36243 // Transfer the remainder of MBB and its successor edges to endMBB.
36244 endMBB->splice(endMBB->begin(), thisMBB,
36245 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
36246 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
36247
36248 // Make offsetMBB and overflowMBB successors of thisMBB
36249 thisMBB->addSuccessor(offsetMBB);
36250 thisMBB->addSuccessor(overflowMBB);
36251
36252 // endMBB is a successor of both offsetMBB and overflowMBB
36253 offsetMBB->addSuccessor(endMBB);
36254 overflowMBB->addSuccessor(endMBB);
36255
36256 // Load the offset value into a register
36257 OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
36258 BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
36259 .add(Base)
36260 .add(Scale)
36261 .add(Index)
36262 .addDisp(Disp, UseFPOffset ? 4 : 0)
36263 .add(Segment)
36264 .setMemRefs(LoadOnlyMMO);
36265
36266 // Check if there is enough room left to pull this argument.
36267 BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
36268 .addReg(OffsetReg)
36269 .addImm(MaxOffset + 8 - ArgSizeA8);
36270
36271 // Branch to "overflowMBB" if offset >= max
36272 // Fall through to "offsetMBB" otherwise
36273 BuildMI(thisMBB, DL, TII->get(X86::JCC_1))
36274 .addMBB(overflowMBB).addImm(X86::COND_AE);
36275 }
36276
36277 // In offsetMBB, emit code to use the reg_save_area.
36278 if (offsetMBB) {
36279 assert(OffsetReg != 0);
36280
36281 // Read the reg_save_area address.
36282 Register RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
36283 BuildMI(
36284 offsetMBB, DL,
36285 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
36286 RegSaveReg)
36287 .add(Base)
36288 .add(Scale)
36289 .add(Index)
36290 .addDisp(Disp, Subtarget.isTarget64BitLP64() ? 16 : 12)
36291 .add(Segment)
36292 .setMemRefs(LoadOnlyMMO);
36293
36294 if (Subtarget.isTarget64BitLP64()) {
36295 // Zero-extend the offset
36296 Register OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
36297 BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
36298 .addImm(0)
36299 .addReg(OffsetReg)
36300 .addImm(X86::sub_32bit);
36301
36302 // Add the offset to the reg_save_area to get the final address.
36303 BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
36304 .addReg(OffsetReg64)
36305 .addReg(RegSaveReg);
36306 } else {
36307 // Add the offset to the reg_save_area to get the final address.
36308 BuildMI(offsetMBB, DL, TII->get(X86::ADD32rr), OffsetDestReg)
36309 .addReg(OffsetReg)
36310 .addReg(RegSaveReg);
36311 }
36312
36313 // Compute the offset for the next argument
36314 Register NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
36315 BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
36316 .addReg(OffsetReg)
36317 .addImm(UseFPOffset ? 16 : 8);
36318
36319 // Store it back into the va_list.
36320 BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
36321 .add(Base)
36322 .add(Scale)
36323 .add(Index)
36324 .addDisp(Disp, UseFPOffset ? 4 : 0)
36325 .add(Segment)
36326 .addReg(NextOffsetReg)
36327 .setMemRefs(StoreOnlyMMO);
36328
36329 // Jump to endMBB
36330 BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
36331 .addMBB(endMBB);
36332 }
36333
36334 //
36335 // Emit code to use overflow area
36336 //
36337
36338 // Load the overflow_area address into a register.
36339 Register OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
36340 BuildMI(overflowMBB, DL,
36341 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
36342 OverflowAddrReg)
36343 .add(Base)
36344 .add(Scale)
36345 .add(Index)
36346 .addDisp(Disp, 8)
36347 .add(Segment)
36348 .setMemRefs(LoadOnlyMMO);
36349
36350 // If we need to align it, do so. Otherwise, just copy the address
36351 // to OverflowDestReg.
36352 if (NeedsAlign) {
36353 // Align the overflow address
36354 Register TmpReg = MRI.createVirtualRegister(AddrRegClass);
36355
36356 // aligned_addr = (addr + (align-1)) & ~(align-1)
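 // For example, with addr = 0x1004 and align = 16 (illustrative values only):
 //   (0x1004 + 15) & ~15 == 0x1013 & ~15 == 0x1010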
36357 BuildMI(
36358 overflowMBB, DL,
36359 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
36360 TmpReg)
36361 .addReg(OverflowAddrReg)
36362 .addImm(Alignment.value() - 1);
36363
36364 BuildMI(
36365 overflowMBB, DL,
36366 TII->get(Subtarget.isTarget64BitLP64() ? X86::AND64ri32 : X86::AND32ri),
36367 OverflowDestReg)
36368 .addReg(TmpReg)
36369 .addImm(~(uint64_t)(Alignment.value() - 1));
36370 } else {
36371 BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
36372 .addReg(OverflowAddrReg);
36373 }
36374
36375 // Compute the next overflow address after this argument.
36376 // (the overflow address should be kept 8-byte aligned)
36377 Register NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
36378 BuildMI(
36379 overflowMBB, DL,
36380 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
36381 NextAddrReg)
36382 .addReg(OverflowDestReg)
36383 .addImm(ArgSizeA8);
36384
36385 // Store the new overflow address.
36386 BuildMI(overflowMBB, DL,
36387 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64mr : X86::MOV32mr))
36388 .add(Base)
36389 .add(Scale)
36390 .add(Index)
36391 .addDisp(Disp, 8)
36392 .add(Segment)
36393 .addReg(NextAddrReg)
36394 .setMemRefs(StoreOnlyMMO);
36395
36396 // If we branched, emit the PHI to the front of endMBB.
36397 if (offsetMBB) {
36398 BuildMI(*endMBB, endMBB->begin(), DL,
36399 TII->get(X86::PHI), DestReg)
36400 .addReg(OffsetDestReg).addMBB(offsetMBB)
36401 .addReg(OverflowDestReg).addMBB(overflowMBB);
36402 }
36403
36404 // Erase the pseudo instruction
36405 MI.eraseFromParent();
36406
36407 return endMBB;
36408}
36409
36410// The EFLAGS operand of SelectItr might be missing a kill marker
36411// because there were multiple uses of EFLAGS, and ISel didn't know
36412// which to mark. Figure out whether SelectItr should have had a
36413// kill marker, and set it if it should. Returns the correct kill
36414// marker value.
36415static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
36416 MachineBasicBlock* BB,
36417 const TargetRegisterInfo* TRI) {
36418 if (isEFLAGSLiveAfter(SelectItr, BB))
36419 return false;
36420
36421 // We found a def, or hit the end of the basic block and EFLAGS wasn't live
36422 // out. SelectMI should have a kill flag on EFLAGS.
36423 SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
36424 return true;
36425}
36426
36427// Return true if it is OK for this CMOV pseudo-opcode to be cascaded
36428// together with other CMOV pseudo-opcodes into a single basic-block with
36429// conditional jump around it.
36430static bool isCMOVPseudo(MachineInstr &MI) {
36431 switch (MI.getOpcode()) {
36432 case X86::CMOV_FR16:
36433 case X86::CMOV_FR16X:
36434 case X86::CMOV_FR32:
36435 case X86::CMOV_FR32X:
36436 case X86::CMOV_FR64:
36437 case X86::CMOV_FR64X:
36438 case X86::CMOV_GR8:
36439 case X86::CMOV_GR16:
36440 case X86::CMOV_GR32:
36441 case X86::CMOV_RFP32:
36442 case X86::CMOV_RFP64:
36443 case X86::CMOV_RFP80:
36444 case X86::CMOV_VR64:
36445 case X86::CMOV_VR128:
36446 case X86::CMOV_VR128X:
36447 case X86::CMOV_VR256:
36448 case X86::CMOV_VR256X:
36449 case X86::CMOV_VR512:
36450 case X86::CMOV_VK1:
36451 case X86::CMOV_VK2:
36452 case X86::CMOV_VK4:
36453 case X86::CMOV_VK8:
36454 case X86::CMOV_VK16:
36455 case X86::CMOV_VK32:
36456 case X86::CMOV_VK64:
36457 return true;
36458
36459 default:
36460 return false;
36461 }
36462}
36463
36464// Helper function, which inserts PHI functions into SinkMBB:
36465// %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
36466// where %FalseValue(i) and %TrueValue(i) are taken from the consecutive CMOVs
36467// in the [MIItBegin, MIItEnd) range. It returns the MachineInstrBuilder for
36468// the last PHI function inserted.
36469static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
36470 MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
36471 MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
36472 MachineBasicBlock *SinkMBB) {
36473 MachineFunction *MF = TrueMBB->getParent();
36474 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
36475 const DebugLoc &DL = MIItBegin->getDebugLoc();
36476
36477 X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
36478 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
36479
36480 MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
36481
36482 // As we are creating the PHIs, we have to be careful if there is more than
36483 // one. Later CMOVs may reference the results of earlier CMOVs, but later
36484 // PHIs have to reference the individual true/false inputs from earlier PHIs.
36485 // That also means that PHI construction must work forward from earlier to
36486 // later, and that the code must maintain a mapping from each earlier PHI's
36487 // destination register to the registers that went into that PHI.
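 // For example (hypothetical vregs, both CMOVs using the same condition):
 //   %t2 = CMOV %a, %b, cc   -->   %t2 = PHI [%a, FalseMBB], [%b, TrueMBB]
 //   %t3 = CMOV %t2, %c, cc  -->   %t3 = PHI [%a, FalseMBB], [%c, TrueMBB]
 // The second PHI uses %a (RegRewriteTable[%t2].first) rather than %t2.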
36488 DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
36489 MachineInstrBuilder MIB;
36490
36491 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
36492 Register DestReg = MIIt->getOperand(0).getReg();
36493 Register Op1Reg = MIIt->getOperand(1).getReg();
36494 Register Op2Reg = MIIt->getOperand(2).getReg();
36495
36496 // If this CMOV we are generating is the opposite condition from
36497 // the jump we generated, then we have to swap the operands for the
36498 // PHI that is going to be generated.
36499 if (MIIt->getOperand(3).getImm() == OppCC)
36500 std::swap(Op1Reg, Op2Reg);
36501
36502 if (RegRewriteTable.contains(Op1Reg))
36503 Op1Reg = RegRewriteTable[Op1Reg].first;
36504
36505 if (RegRewriteTable.contains(Op2Reg))
36506 Op2Reg = RegRewriteTable[Op2Reg].second;
36507
36508 MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg)
36509 .addReg(Op1Reg)
36510 .addMBB(FalseMBB)
36511 .addReg(Op2Reg)
36512 .addMBB(TrueMBB);
36513
36514 // Add this PHI to the rewrite table.
36515 RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
36516 }
36517
36518 return MIB;
36519}
36520
36521// Lower cascaded selects of the form (SecondCMOV (FirstCMOV F, T, cc1), T, cc2).
36522MachineBasicBlock *
36523X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
36524 MachineInstr &SecondCascadedCMOV,
36525 MachineBasicBlock *ThisMBB) const {
36526 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36527 const DebugLoc &DL = FirstCMOV.getDebugLoc();
36528
36529 // We lower cascaded CMOVs such as
36530 //
36531 // (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
36532 //
36533 // to two successive branches.
36534 //
36535 // Without this, we would add a PHI between the two jumps, which ends up
36536 // creating a few copies all around. For instance, for
36537 //
36538 // (sitofp (zext (fcmp une)))
36539 //
36540 // we would generate:
36541 //
36542 // ucomiss %xmm1, %xmm0
36543 // movss <1.0f>, %xmm0
36544 // movaps %xmm0, %xmm1
36545 // jne .LBB5_2
36546 // xorps %xmm1, %xmm1
36547 // .LBB5_2:
36548 // jp .LBB5_4
36549 // movaps %xmm1, %xmm0
36550 // .LBB5_4:
36551 // retq
36552 //
36553 // because this custom-inserter would have generated:
36554 //
36555 // A
36556 // | \
36557 // | B
36558 // | /
36559 // C
36560 // | \
36561 // | D
36562 // | /
36563 // E
36564 //
36565 // A: X = ...; Y = ...
36566 // B: empty
36567 // C: Z = PHI [X, A], [Y, B]
36568 // D: empty
36569 // E: PHI [X, C], [Z, D]
36570 //
36571 // If we lower both CMOVs in a single step, we can instead generate:
36572 //
36573 // A
36574 // | \
36575 // | C
36576 // | /|
36577 // |/ |
36578 // | |
36579 // | D
36580 // | /
36581 // E
36582 //
36583 // A: X = ...; Y = ...
36584 // D: empty
36585 // E: PHI [X, A], [X, C], [Y, D]
36586 //
36587 // Which, in our sitofp/fcmp example, gives us something like:
36588 //
36589 // ucomiss %xmm1, %xmm0
36590 // movss <1.0f>, %xmm0
36591 // jne .LBB5_4
36592 // jp .LBB5_4
36593 // xorps %xmm0, %xmm0
36594 // .LBB5_4:
36595 // retq
36596 //
36597
36598 // We lower cascaded CMOV into two successive branches to the same block.
36599 // EFLAGS is used by both, so mark it as live in the second.
36600 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
36601 MachineFunction *F = ThisMBB->getParent();
36602 MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
36603 MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
36604 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
36605
36606 MachineFunction::iterator It = ++ThisMBB->getIterator();
36607 F->insert(It, FirstInsertedMBB);
36608 F->insert(It, SecondInsertedMBB);
36609 F->insert(It, SinkMBB);
36610
36611 // For a cascaded CMOV, we lower it to two successive branches to
36612 // the same block (SinkMBB). EFLAGS is used by both, so mark it as live in
36613 // the FirstInsertedMBB.
36614 FirstInsertedMBB->addLiveIn(X86::EFLAGS);
36615
36616 // If the EFLAGS register isn't dead in the terminator, then claim that it's
36617 // live into the sink and copy blocks.
36618 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
36619 if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS) &&
36620 !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
36621 SecondInsertedMBB->addLiveIn(X86::EFLAGS);
36622 SinkMBB->addLiveIn(X86::EFLAGS);
36623 }
36624
36625 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
36626 SinkMBB->splice(SinkMBB->begin(), ThisMBB,
36627 std::next(MachineBasicBlock::iterator(FirstCMOV)),
36628 ThisMBB->end());
36629 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
36630
36631 // Fallthrough block for ThisMBB.
36632 ThisMBB->addSuccessor(FirstInsertedMBB);
36633 // The true block target of the first branch is always SinkMBB.
36634 ThisMBB->addSuccessor(SinkMBB);
36635 // Fallthrough block for FirstInsertedMBB.
36636 FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
36637 // The true block for the branch of FirstInsertedMBB.
36638 FirstInsertedMBB->addSuccessor(SinkMBB);
36639 // This is fallthrough.
36640 SecondInsertedMBB->addSuccessor(SinkMBB);
36641
36642 // Create the conditional branch instructions.
36643 X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
36644 BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(FirstCC);
36645
36646 X86::CondCode SecondCC =
36647 X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
36648 BuildMI(FirstInsertedMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(SecondCC);
36649
36650 // SinkMBB:
36651 // %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
36652 Register DestReg = SecondCascadedCMOV.getOperand(0).getReg();
36653 Register Op1Reg = FirstCMOV.getOperand(1).getReg();
36654 Register Op2Reg = FirstCMOV.getOperand(2).getReg();
36655 MachineInstrBuilder MIB =
36656 BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(X86::PHI), DestReg)
36657 .addReg(Op1Reg)
36658 .addMBB(SecondInsertedMBB)
36659 .addReg(Op2Reg)
36660 .addMBB(ThisMBB);
36661
36662 // SecondInsertedMBB provides the same incoming value as FirstInsertedMBB
36663 // (the True operand of the SELECT_CC/CMOV nodes).
36664 MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
36665
36666 // Now remove the CMOVs.
36667 FirstCMOV.eraseFromParent();
36668 SecondCascadedCMOV.eraseFromParent();
36669
36670 return SinkMBB;
36671}
36672
36673MachineBasicBlock *
36674X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
36675 MachineBasicBlock *ThisMBB) const {
36676 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36677 const DebugLoc &DL = MI.getDebugLoc();
36678
36679 // To "insert" a SELECT_CC instruction, we actually have to insert the
36680 // diamond control-flow pattern. The incoming instruction knows the
36681 // destination vreg to set, the condition code register to branch on, the
36682 // true/false values to select between and a branch opcode to use.
36683
36684 // ThisMBB:
36685 // ...
36686 // TrueVal = ...
36687 // cmpTY ccX, r1, r2
36688 // bCC copy1MBB
36689 // fallthrough --> FalseMBB
36690
36691 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
36692 // as described above, by inserting a BB, and then making a PHI at the join
36693 // point to select the true and false operands of the CMOV in the PHI.
36694 //
36695 // The code also handles two different cases of multiple CMOV opcodes
36696 // in a row.
36697 //
36698 // Case 1:
36699 // In this case, there are multiple CMOVs in a row, all of which are based on
36700 // the same condition setting (or the exact opposite condition setting).
36701 // In this case we can lower all the CMOVs using a single inserted BB, and
36702 // then make a number of PHIs at the join point to model the CMOVs. The only
36703 // trickiness here is that in a case like:
36704 //
36705 // t2 = CMOV cond1 t1, f1
36706 // t3 = CMOV cond1 t2, f2
36707 //
36708 // when rewriting this into PHIs, we have to perform some renaming on the
36709 // temps since you cannot have a PHI operand refer to a PHI result earlier
36710 // in the same block. The "simple" but wrong lowering would be:
36711 //
36712 // t2 = PHI t1(BB1), f1(BB2)
36713 // t3 = PHI t2(BB1), f2(BB2)
36714 //
36715 // but clearly t2 is not defined in BB1, so that is incorrect. The proper
36716 // renaming is to note that on the path through BB1, t2 is really just a
36717 // copy of t1, and do that renaming, properly generating:
36718 //
36719 // t2 = PHI t1(BB1), f1(BB2)
36720 // t3 = PHI t1(BB1), f2(BB2)
36721 //
36722 // Case 2:
36723 // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
36724 // function - EmitLoweredCascadedSelect.
36725
36726 X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
36727 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
36728 MachineInstr *LastCMOV = &MI;
36729 MachineBasicBlock::iterator NextMIIt = MachineBasicBlock::iterator(MI);
36730
36731 // Check for case 1, where there are multiple CMOVs with the same condition
36732 // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
36733 // number of jumps the most.
36734
36735 if (isCMOVPseudo(MI)) {
36736 // See if we have a string of CMOVS with the same condition. Skip over
36737 // intervening debug insts.
36738 while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
36739 (NextMIIt->getOperand(3).getImm() == CC ||
36740 NextMIIt->getOperand(3).getImm() == OppCC)) {
36741 LastCMOV = &*NextMIIt;
36742 NextMIIt = next_nodbg(NextMIIt, ThisMBB->end());
36743 }
36744 }
36745
36746 // Check for case 2, but only if we didn't already find case 1, as
36747 // indicated by LastCMOV == &MI.
36748 if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
36749 NextMIIt->getOpcode() == MI.getOpcode() &&
36750 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
36751 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
36752 NextMIIt->getOperand(1).isKill()) {
36753 return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
36754 }
36755
36756 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
36757 MachineFunction *F = ThisMBB->getParent();
36758 MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
36759 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
36760
36761 MachineFunction::iterator It = ++ThisMBB->getIterator();
36762 F->insert(It, FalseMBB);
36763 F->insert(It, SinkMBB);
36764
36765 // If the EFLAGS register isn't dead in the terminator, then claim that it's
36766 // live into the sink and copy blocks.
36767 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
36768 if (!LastCMOV->killsRegister(X86::EFLAGS) &&
36769 !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
36770 FalseMBB->addLiveIn(X86::EFLAGS);
36771 SinkMBB->addLiveIn(X86::EFLAGS);
36772 }
36773
36774 // Transfer any debug instructions inside the CMOV sequence to the sunk block.
36775 auto DbgRange = llvm::make_range(MachineBasicBlock::iterator(MI),
36776 MachineBasicBlock::iterator(LastCMOV));
36777 for (MachineInstr &MI : llvm::make_early_inc_range(DbgRange))
36778 if (MI.isDebugInstr())
36779 SinkMBB->push_back(MI.removeFromParent());
36780
36781 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
36782 SinkMBB->splice(SinkMBB->end(), ThisMBB,
36783 std::next(MachineBasicBlock::iterator(LastCMOV)),
36784 ThisMBB->end());
36785 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
36786
36787 // Fallthrough block for ThisMBB.
36788 ThisMBB->addSuccessor(FalseMBB);
36789 // The true block target of the first (or only) branch is always SinkMBB.
36790 ThisMBB->addSuccessor(SinkMBB);
36791 // Fallthrough block for FalseMBB.
36792 FalseMBB->addSuccessor(SinkMBB);
36793
36794 // Create the conditional branch instruction.
36795 BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);
36796
36797 // SinkMBB:
36798 // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
36799 // ...
36800 MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
36801 MachineBasicBlock::iterator MIItEnd =
36802 std::next(MachineBasicBlock::iterator(LastCMOV));
36803 createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);
36804
36805 // Now remove the CMOV(s).
36806 ThisMBB->erase(MIItBegin, MIItEnd);
36807
36808 return SinkMBB;
36809}
36810
36811static unsigned getSUBriOpcode(bool IsLP64, int64_t Imm) {
36812 if (IsLP64) {
36813 if (isInt<8>(Imm))
36814 return X86::SUB64ri8;
36815 return X86::SUB64ri32;
36816 } else {
36817 if (isInt<8>(Imm))
36818 return X86::SUB32ri8;
36819 return X86::SUB32ri;
36820 }
36821}
36822
36823MachineBasicBlock *
36824X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI,
36825 MachineBasicBlock *MBB) const {
36826 MachineFunction *MF = MBB->getParent();
36827 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36828 const X86FrameLowering &TFI = *Subtarget.getFrameLowering();
36829 const DebugLoc &DL = MI.getDebugLoc();
36830 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
36831
36832 const unsigned ProbeSize = getStackProbeSize(*MF);
36833
36834 MachineRegisterInfo &MRI = MF->getRegInfo();
36835 MachineBasicBlock *testMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36836 MachineBasicBlock *tailMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36837 MachineBasicBlock *blockMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36838
36839 MachineFunction::iterator MBBIter = ++MBB->getIterator();
36840 MF->insert(MBBIter, testMBB);
36841 MF->insert(MBBIter, blockMBB);
36842 MF->insert(MBBIter, tailMBB);
36843
36844 Register sizeVReg = MI.getOperand(1).getReg();
36845
36846 Register physSPReg = TFI.Uses64BitFramePtr ? X86::RSP : X86::ESP;
36847
36848 Register TmpStackPtr = MRI.createVirtualRegister(
36849 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
36850 Register FinalStackPtr = MRI.createVirtualRegister(
36851 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
36852
36853 BuildMI(*MBB, {MI}, DL, TII->get(TargetOpcode::COPY), TmpStackPtr)
36854 .addReg(physSPReg);
36855 {
36856 const unsigned Opc = TFI.Uses64BitFramePtr ? X86::SUB64rr : X86::SUB32rr;
36857 BuildMI(*MBB, {MI}, DL, TII->get(Opc), FinalStackPtr)
36858 .addReg(TmpStackPtr)
36859 .addReg(sizeVReg);
36860 }
36861
36862 // testMBB: compare the current stack pointer against the final stack pointer.
36863
36864 BuildMI(testMBB, DL,
36865 TII->get(TFI.Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
36866 .addReg(FinalStackPtr)
36867 .addReg(physSPReg);
36868
36869 BuildMI(testMBB, DL, TII->get(X86::JCC_1))
36870 .addMBB(tailMBB)
36871 .addImm(X86::COND_GE);
36872 testMBB->addSuccessor(blockMBB);
36873 testMBB->addSuccessor(tailMBB);
36874
36875 // Touch the block, then extend it. This is the opposite order from a static
36876 // probe, where we allocate and then touch, and it avoids having to probe the
36877 // tail of the static alloca. Possible scenarios are:
36878 //
36879 // + ---- <- ------------ <- ------------- <- ------------ +
36880 // | |
36881 // [free probe] -> [page alloc] -> [alloc probe] -> [tail alloc] + -> [dyn probe] -> [page alloc] -> [dyn probe] -> [tail alloc] +
36882 // | |
36883 // + <- ----------- <- ------------ <- ----------- <- ------------ +
36884 //
36885 // The property we want to enforce is to never have more than [page alloc] between two probes.
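 //
 // As a rough C-level sketch of the loop built below (hypothetical names):
 //
 //   char *sp = current_sp, *final_sp = sp - alloc_size; // set up before testMBB
 //   while (sp > final_sp) {          // testMBB: CMP + JCC to tailMBB
 //     *(volatile char *)sp ^= 0;     // blockMBB: touch the current page,
 //     sp -= ProbeSize;               //           then move down one page
 //   }
 //   result = final_sp;               // tailMBB: the expected stack pointer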
36886
36887 const unsigned XORMIOpc =
36888 TFI.Uses64BitFramePtr ? X86::XOR64mi8 : X86::XOR32mi8;
36889 addRegOffset(BuildMI(blockMBB, DL, TII->get(XORMIOpc)), physSPReg, false, 0)
36890 .addImm(0);
36891
36892 BuildMI(blockMBB, DL,
36893 TII->get(getSUBriOpcode(TFI.Uses64BitFramePtr, ProbeSize)), physSPReg)
36894 .addReg(physSPReg)
36895 .addImm(ProbeSize);
36896
36897
36898 BuildMI(blockMBB, DL, TII->get(X86::JMP_1)).addMBB(testMBB);
36899 blockMBB->addSuccessor(testMBB);
36900
36901 // Replace original instruction by the expected stack ptr
36902 BuildMI(tailMBB, DL, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
36903 .addReg(FinalStackPtr);
36904
36905 tailMBB->splice(tailMBB->end(), MBB,
36906 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
36907 tailMBB->transferSuccessorsAndUpdatePHIs(MBB);
36908 MBB->addSuccessor(testMBB);
36909
36910 // Delete the original pseudo instruction.
36911 MI.eraseFromParent();
36912
36913 // And we're done.
36914 return tailMBB;
36915}
36916
36917MachineBasicBlock *
36918X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
36919 MachineBasicBlock *BB) const {
36920 MachineFunction *MF = BB->getParent();
36921 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36922 const DebugLoc &DL = MI.getDebugLoc();
36923 const BasicBlock *LLVM_BB = BB->getBasicBlock();
36924
36925 assert(MF->shouldSplitStack());
36926
36927 const bool Is64Bit = Subtarget.is64Bit();
36928 const bool IsLP64 = Subtarget.isTarget64BitLP64();
36929
36930 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
36931 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
36932
36933 // BB:
36934 // ... [Till the alloca]
36935 // If the stacklet is not large enough, jump to mallocMBB
36936 //
36937 // bumpMBB:
36938 // Allocate by subtracting from RSP
36939 // Jump to continueMBB
36940 //
36941 // mallocMBB:
36942 // Allocate by call to runtime
36943 //
36944 // continueMBB:
36945 // ...
36946 // [rest of original BB]
36947 //
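 // As a rough C-level sketch (hypothetical names; the stacklet limit actually
 // lives at a fixed TLS offset, see TlsOffset above):
 //
 //   char *sp_after = stack_ptr - size;
 //   if (stacklet_limit > sp_after)                       // CMP + JG
 //     result = __morestack_allocate_stack_space(size);   // mallocMBB
 //   else
 //     result = stack_ptr = sp_after;                     // bumpMBB
 //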
36948
36949 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36950 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36951 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36952
36953 MachineRegisterInfo &MRI = MF->getRegInfo();
36954 const TargetRegisterClass *AddrRegClass =
36955 getRegClassFor(getPointerTy(MF->getDataLayout()));
36956
36957 Register mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
36958 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
36959 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
36960 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
36961 sizeVReg = MI.getOperand(1).getReg(),
36962 physSPReg =
36963 IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
36964
36965 MachineFunction::iterator MBBIter = ++BB->getIterator();
36966
36967 MF->insert(MBBIter, bumpMBB);
36968 MF->insert(MBBIter, mallocMBB);
36969 MF->insert(MBBIter, continueMBB);
36970
36971 continueMBB->splice(continueMBB->begin(), BB,
36972 std::next(MachineBasicBlock::iterator(MI)), BB->end());
36973 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
36974
36975 // Add code to the main basic block to check if the stack limit has been hit,
36976 // and if so, jump to mallocMBB otherwise to bumpMBB.
36977 BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
36978 BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
36979 .addReg(tmpSPVReg).addReg(sizeVReg);
36980 BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
36981 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
36982 .addReg(SPLimitVReg);
36983 BuildMI(BB, DL, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G);
36984
36985 // bumpMBB simply decreases the stack pointer, since we know the current
36986 // stacklet has enough space.
36987 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
36988 .addReg(SPLimitVReg);
36989 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
36990 .addReg(SPLimitVReg);
36991 BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
36992
36993 // Calls into a routine in libgcc to allocate more space from the heap.
36994 const uint32_t *RegMask =
36995 Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
36996 if (IsLP64) {
36997 BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
36998 .addReg(sizeVReg);
36999 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
37000 .addExternalSymbol("__morestack_allocate_stack_space")
37001 .addRegMask(RegMask)
37002 .addReg(X86::RDI, RegState::Implicit)
37003 .addReg(X86::RAX, RegState::ImplicitDefine);
37004 } else if (Is64Bit) {
37005 BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
37006 .addReg(sizeVReg);
37007 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
37008 .addExternalSymbol("__morestack_allocate_stack_space")
37009 .addRegMask(RegMask)
37010 .addReg(X86::EDI, RegState::Implicit)
37011 .addReg(X86::EAX, RegState::ImplicitDefine);
37012 } else {
37013 BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
37014 .addImm(12);
37015 BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
37016 BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
37017 .addExternalSymbol("__morestack_allocate_stack_space")
37018 .addRegMask(RegMask)
37019 .addReg(X86::EAX, RegState::ImplicitDefine);
37020 }
37021
37022 if (!Is64Bit)
37023 BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
37024 .addImm(16);
37025
37026 BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
37027 .addReg(IsLP64 ? X86::RAX : X86::EAX);
37028 BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
37029
37030 // Set up the CFG correctly.
37031 BB->addSuccessor(bumpMBB);
37032 BB->addSuccessor(mallocMBB);
37033 mallocMBB->addSuccessor(continueMBB);
37034 bumpMBB->addSuccessor(continueMBB);
37035
37036 // Take care of the PHI nodes.
37037 BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
37038 MI.getOperand(0).getReg())
37039 .addReg(mallocPtrVReg)
37040 .addMBB(mallocMBB)
37041 .addReg(bumpSPPtrVReg)
37042 .addMBB(bumpMBB);
37043
37044 // Delete the original pseudo instruction.
37045 MI.eraseFromParent();
37046
37047 // And we're done.
37048 return continueMBB;
37049}
37050
37051MachineBasicBlock *
37052X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
37053 MachineBasicBlock *BB) const {
37054 MachineFunction *MF = BB->getParent();
37055 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
37056 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
37057 const DebugLoc &DL = MI.getDebugLoc();
37058
37059 assert(!isAsynchronousEHPersonality(
37060 classifyEHPersonality(MF->getFunction().getPersonalityFn())) &&
37061 "SEH does not use catchret!");
37062
37063 // Only 32-bit EH needs to worry about manually restoring stack pointers.
37064 if (!Subtarget.is32Bit())
37065 return BB;
37066
37067 // C++ EH creates a new target block to hold the restore code, and wires up
37068 // the new block to the return destination with a normal JMP_4.
37069 MachineBasicBlock *RestoreMBB =
37070 MF->CreateMachineBasicBlock(BB->getBasicBlock());
37071 assert(BB->succ_size() == 1);
37072 MF->insert(std::next(BB->getIterator()), RestoreMBB);
37073 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
37074 BB->addSuccessor(RestoreMBB);
37075 MI.getOperand(0).setMBB(RestoreMBB);
37076
37077 // Marking this as an EH pad but not a funclet entry block causes PEI to
37078 // restore stack pointers in the block.
37079 RestoreMBB->setIsEHPad(true);
37080
37081 auto RestoreMBBI = RestoreMBB->begin();
37082 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
37083 return BB;
37084}
37085
37086MachineBasicBlock *
37087X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
37088 MachineBasicBlock *BB) const {
37089 // So, here we replace TLSADDR with the sequence:
37090 // adjust_stackdown -> TLSADDR -> adjust_stackup.
37091 // We need this because TLSADDR is lowered into a call
37092 // inside MC; without the two markers, shrink-wrapping
37093 // may push the prologue/epilogue past them.
37094 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
37095 const DebugLoc &DL = MI.getDebugLoc();
37096 MachineFunction &MF = *BB->getParent();
37097
37098 // Emit CALLSEQ_START right before the instruction.
37099 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
37100 MachineInstrBuilder CallseqStart =
37101 BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
37102 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
37103
37104 // Emit CALLSEQ_END right after the instruction.
37105 // We don't call erase from parent because we want to keep the
37106 // original instruction around.
37107 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
37108 MachineInstrBuilder CallseqEnd =
37109 BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0);
37110 BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
37111
37112 return BB;
37113}
37114
37115MachineBasicBlock *
37116X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
37117 MachineBasicBlock *BB) const {
37118 // This is pretty easy. We're taking the value that we received from
37119 // our load from the relocation, sticking it in either RDI (x86-64)
37120 // or EAX and doing an indirect call. The return value will then
37121 // be in the normal return register.
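 // Roughly, for the 64-bit case this becomes (AT&T syntax; _var is a
 // hypothetical TLS global, shown only for illustration):
 //
 //   movq  _var@TLVP(%rip), %rdi   ## load the TLV descriptor address
 //   callq *(%rdi)                 ## call its handler; result lands in %rax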
37122 MachineFunction *F = BB->getParent();
37123 const X86InstrInfo *TII = Subtarget.getInstrInfo();
37124 const DebugLoc &DL = MI.getDebugLoc();
37125
37126 assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
37127 assert(MI.getOperand(3).isGlobal() && "This should be a global");
37128
37129 // Get a register mask for the lowered call.
37130 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
37131 // proper register mask.
37132 const uint32_t *RegMask =
37133 Subtarget.is64Bit() ?
37134 Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
37135 Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
37136 if (Subtarget.is64Bit()) {
37137 MachineInstrBuilder MIB =
37138 BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI)
37139 .addReg(X86::RIP)
37140 .addImm(0)
37141 .addReg(0)
37142 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
37143 MI.getOperand(3).getTargetFlags())
37144 .addReg(0);
37145 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
37146 addDirectMem(MIB, X86::RDI);
37147 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
37148 } else if (!isPositionIndependent()) {
37149 MachineInstrBuilder MIB =
37150 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
37151 .addReg(0)
37152 .addImm(0)
37153 .addReg(0)
37154 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
37155 MI.getOperand(3).getTargetFlags())
37156 .addReg(0);
37157 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
37158 addDirectMem(MIB, X86::EAX);
37159 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
37160 } else {
37161 MachineInstrBuilder MIB =
37162 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
37163 .addReg(TII->getGlobalBaseReg(F))
37164 .addImm(0)
37165 .addReg(0)
37166 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
37167 MI.getOperand(3).getTargetFlags())
37168 .addReg(0);
37169 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
37170 addDirectMem(MIB, X86::EAX);
37171 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
37172 }
37173
37174 MI.eraseFromParent(); // The pseudo instruction is gone now.
37175 return BB;
37176}
37177
37178static unsigned getOpcodeForIndirectThunk(unsigned RPOpc) {
37179 switch (RPOpc) {
37180 case X86::INDIRECT_THUNK_CALL32:
37181 return X86::CALLpcrel32;
37182 case X86::INDIRECT_THUNK_CALL64:
37183 return X86::CALL64pcrel32;
37184 case X86::INDIRECT_THUNK_TCRETURN32:
37185 return X86::TCRETURNdi;
37186 case X86::INDIRECT_THUNK_TCRETURN64:
37187 return X86::TCRETURNdi64;
37188 }
37189 llvm_unreachable("not indirect thunk opcode");
37190}
37191
37192static const char *getIndirectThunkSymbol(const X86Subtarget &Subtarget,
37193 unsigned Reg) {
37194 if (Subtarget.useRetpolineExternalThunk()) {
37195 // When using an external thunk for retpolines, we pick names that match the
37196 // names GCC happens to use as well. This helps simplify the implementation
37197 // of the thunks for kernels where they have no easy ability to create
37198 // aliases and are doing non-trivial configuration of the thunk's body. For
37199 // example, the Linux kernel will do boot-time hot patching of the thunk
37200 // bodies and cannot easily export aliases of these to loaded modules.
37201 //
37202 // Note that at any point in the future, we may need to change the semantics
37203 // of how we implement retpolines and at that time will likely change the
37204 // name of the called thunk. Essentially, there is no hard guarantee that
37205 // LLVM will generate calls to specific thunks, we merely make a best-effort
37206 // attempt to help out kernels and other systems where duplicating the
37207 // thunks is costly.
37208 switch (Reg) {
37209 case X86::EAX:
37210 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
37211 return "__x86_indirect_thunk_eax";
37212 case X86::ECX:
37213 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
37214 return "__x86_indirect_thunk_ecx";
37215 case X86::EDX:
37216 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
37217 return "__x86_indirect_thunk_edx";
37218 case X86::EDI:
37219 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
37220 return "__x86_indirect_thunk_edi";
37221 case X86::R11:
37222 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
37223 return "__x86_indirect_thunk_r11";
37224 }
37225 llvm_unreachable("unexpected reg for external indirect thunk");
37226 }
37227
37228 if (Subtarget.useRetpolineIndirectCalls() ||
37229 Subtarget.useRetpolineIndirectBranches()) {
37230 // When targeting an internal COMDAT thunk use an LLVM-specific name.
37231 switch (Reg) {
37232 case X86::EAX:
37233 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
37234 return "__llvm_retpoline_eax";
37235 case X86::ECX:
37236 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
37237 return "__llvm_retpoline_ecx";
37238 case X86::EDX:
37239 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
37240 return "__llvm_retpoline_edx";
37241 case X86::EDI:
37242 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
37243 return "__llvm_retpoline_edi";
37244 case X86::R11:
37245 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
37246 return "__llvm_retpoline_r11";
37247 }
37248 llvm_unreachable("unexpected reg for retpoline");
37249 }
37250
37251 if (Subtarget.useLVIControlFlowIntegrity()) {
37252 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
37253 return "__llvm_lvi_thunk_r11";
37254 }
37255 llvm_unreachable("getIndirectThunkSymbol() invoked without thunk feature");
37256}
37257
37258MachineBasicBlock *
37259X86TargetLowering::EmitLoweredIndirectThunk(MachineInstr &MI,
37260 MachineBasicBlock *BB) const {
37261 // Copy the virtual register into the R11 physical register and
37262 // call the retpoline thunk.
37263 const DebugLoc &DL = MI.getDebugLoc();
37264 const X86InstrInfo *TII = Subtarget.getInstrInfo();
37265 Register CalleeVReg = MI.getOperand(0).getReg();
37266 unsigned Opc = getOpcodeForIndirectThunk(MI.getOpcode());
37267
37268 // Find an available scratch register to hold the callee. On 64-bit, we can
37269 // just use R11, but we scan for uses anyway to ensure we don't generate
37270 // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
37271 // already a register use operand to the call to hold the callee. If none
37272 // are available, use EDI instead. EDI is chosen because EBX is the PIC base
37273 // register and ESI is the base pointer to realigned stack frames with VLAs.
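 // For example, in the 64-bit case the rewritten call ends up looking like
 // (illustrative assembly; the exact symbol comes from getIndirectThunkSymbol):
 //
 //   movq  %<callee>, %r11
 //   callq __x86_indirect_thunk_r11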
37274 SmallVector<unsigned, 3> AvailableRegs;
37275 if (Subtarget.is64Bit())
37276 AvailableRegs.push_back(X86::R11);
37277 else
37278 AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});
37279
37280 // Zero out any registers that are already used.
37281 for (const auto &MO : MI.operands()) {
37282 if (MO.isReg() && MO.isUse())
37283 for (unsigned &Reg : AvailableRegs)
37284 if (Reg == MO.getReg())
37285 Reg = 0;
37286 }
37287
37288 // Choose the first remaining non-zero available register.
37289 unsigned AvailableReg = 0;
37290 for (unsigned MaybeReg : AvailableRegs) {
37291 if (MaybeReg) {
37292 AvailableReg = MaybeReg;
37293 break;
37294 }
37295 }
37296 if (!AvailableReg)
37297 report_fatal_error("calling convention incompatible with retpoline, no "
37298 "available registers");
37299
37300 const char *Symbol = getIndirectThunkSymbol(Subtarget, AvailableReg);
37301
37302 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg)
37303 .addReg(CalleeVReg);
37304 MI.getOperand(0).ChangeToES(Symbol);
37305 MI.setDesc(TII->get(Opc));
37306 MachineInstrBuilder(*BB->getParent(), &MI)
37307 .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
37308 return BB;
37309}
37310
37311/// SetJmp implies future control flow change upon calling the corresponding
37312/// LongJmp.
37313/// Instead of using the 'return' instruction, the long jump fixes the stack and
37314/// performs an indirect branch. To do so it uses the registers that were stored
37315/// in the jump buffer (when calling SetJmp).
37316/// In case the shadow stack is enabled we need to fix it as well, because some
37317/// return addresses will be skipped.
37318/// The function will save the SSP for future fixing in the function
37319/// emitLongJmpShadowStackFix.
37320/// \sa emitLongJmpShadowStackFix
37321/// \param [in] MI The temporary Machine Instruction for the builtin.
37322/// \param [in] MBB The Machine Basic Block that will be modified.
37323void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
37324 MachineBasicBlock *MBB) const {
37325 const DebugLoc &DL = MI.getDebugLoc();
37326 MachineFunction *MF = MBB->getParent();
37327 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37328 MachineRegisterInfo &MRI = MF->getRegInfo();
37329 MachineInstrBuilder MIB;
37330
37331 // Memory Reference.
37332 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
37333 MI.memoperands_end());
37334
37335 // Initialize a register with zero.
37336 MVT PVT = getPointerTy(MF->getDataLayout());
37337 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
37338 Register ZReg = MRI.createVirtualRegister(PtrRC);
37339 unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
37340 BuildMI(*MBB, MI, DL, TII->get(XorRROpc))
37341 .addDef(ZReg)
37342 .addReg(ZReg, RegState::Undef)
37343 .addReg(ZReg, RegState::Undef);
37344
37345 // Read the current SSP Register value to the zeroed register.
37346 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
37347 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
37348 BuildMI(*MBB, MI, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
37349
37350 // Write the SSP register value to offset 3 in input memory buffer.
37351 unsigned PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
37352 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrStoreOpc));
37353 const int64_t SSPOffset = 3 * PVT.getStoreSize();
37354 const unsigned MemOpndSlot = 1;
37355 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37356 if (i == X86::AddrDisp)
37357 MIB.addDisp(MI.getOperand(MemOpndSlot + i), SSPOffset);
37358 else
37359 MIB.add(MI.getOperand(MemOpndSlot + i));
37360 }
37361 MIB.addReg(SSPCopyReg);
37362 MIB.setMemRefs(MMOs);
37363}
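// Editor's note: a hedged sketch (not part of X86ISelLowering.cpp) of the
// pointer-sized buffer layout implied by the offsets used here and in the
// longjmp expansion below (slot 0 is reloaded as the frame pointer,
// LabelOffset = 1, SPOffset = 2 and SSPOffset = 3 store sizes):
//
//   struct SjLjBuffer {      // name and field names are illustrative only
//     void *FramePtr;        // slot 0: frame pointer
//     void *Label;           // slot 1: resume address (restoreMBB / IP)
//     void *StackPtr;        // slot 2: stack pointer
//     void *ShadowStackPtr;  // slot 3: SSP written by the code above
//   };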
37364
37365MachineBasicBlock *
37366X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
37367 MachineBasicBlock *MBB) const {
37368 const DebugLoc &DL = MI.getDebugLoc();
37369 MachineFunction *MF = MBB->getParent();
37370 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37371 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
37372 MachineRegisterInfo &MRI = MF->getRegInfo();
37373
37374 const BasicBlock *BB = MBB->getBasicBlock();
37375 MachineFunction::iterator I = ++MBB->getIterator();
37376
37377 // Memory Reference
37378 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
37379 MI.memoperands_end());
37380
37381 unsigned DstReg;
37382 unsigned MemOpndSlot = 0;
37383
37384 unsigned CurOp = 0;
37385
37386 DstReg = MI.getOperand(CurOp++).getReg();
37387 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
37388 assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!")(static_cast <bool> (TRI->isTypeLegalForClass(*RC, MVT
::i32) && "Invalid destination!") ? void (0) : __assert_fail
("TRI->isTypeLegalForClass(*RC, MVT::i32) && \"Invalid destination!\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 37388, __extension__
__PRETTY_FUNCTION__))
;
37389 (void)TRI;
37390 Register mainDstReg = MRI.createVirtualRegister(RC);
37391 Register restoreDstReg = MRI.createVirtualRegister(RC);
37392
37393 MemOpndSlot = CurOp;
37394
37395 MVT PVT = getPointerTy(MF->getDataLayout());
37396 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
37397 "Invalid Pointer Size!");
37398
37399 // For v = setjmp(buf), we generate
37400 //
37401 // thisMBB:
37402 // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
37403 // SjLjSetup restoreMBB
37404 //
37405 // mainMBB:
37406 // v_main = 0
37407 //
37408 // sinkMBB:
37409 // v = phi(main, restore)
37410 //
37411 // restoreMBB:
37412 // if base pointer being used, load it from frame
37413 // v_restore = 1
37414
37415 MachineBasicBlock *thisMBB = MBB;
37416 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
37417 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
37418 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
37419 MF->insert(I, mainMBB);
37420 MF->insert(I, sinkMBB);
37421 MF->push_back(restoreMBB);
37422 restoreMBB->setMachineBlockAddressTaken();
37423
37424 MachineInstrBuilder MIB;
37425
37426 // Transfer the remainder of BB and its successor edges to sinkMBB.
37427 sinkMBB->splice(sinkMBB->begin(), MBB,
37428 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
37429 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
37430
37431 // thisMBB:
37432 unsigned PtrStoreOpc = 0;
37433 unsigned LabelReg = 0;
37434 const int64_t LabelOffset = 1 * PVT.getStoreSize();
37435 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
37436 !isPositionIndependent();
37437
37438 // Prepare IP either in reg or imm.
37439 if (!UseImmLabel) {
37440 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
37441 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
37442 LabelReg = MRI.createVirtualRegister(PtrRC);
37443 if (Subtarget.is64Bit()) {
37444 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
37445 .addReg(X86::RIP)
37446 .addImm(0)
37447 .addReg(0)
37448 .addMBB(restoreMBB)
37449 .addReg(0);
37450 } else {
37451 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
37452 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
37453 .addReg(XII->getGlobalBaseReg(MF))
37454 .addImm(0)
37455 .addReg(0)
37456 .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
37457 .addReg(0);
37458 }
37459 } else
37460 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
37461 // Store IP
37462 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
37463 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37464 if (i == X86::AddrDisp)
37465 MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
37466 else
37467 MIB.add(MI.getOperand(MemOpndSlot + i));
37468 }
37469 if (!UseImmLabel)
37470 MIB.addReg(LabelReg);
37471 else
37472 MIB.addMBB(restoreMBB);
37473 MIB.setMemRefs(MMOs);
37474
37475 if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
37476 emitSetJmpShadowStackFix(MI, thisMBB);
37477 }
37478
37479 // Setup
37480 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
37481 .addMBB(restoreMBB);
37482
37483 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
37484 MIB.addRegMask(RegInfo->getNoPreservedMask());
37485 thisMBB->addSuccessor(mainMBB);
37486 thisMBB->addSuccessor(restoreMBB);
37487
37488 // mainMBB:
37489 // EAX = 0
37490 BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
37491 mainMBB->addSuccessor(sinkMBB);
37492
37493 // sinkMBB:
37494 BuildMI(*sinkMBB, sinkMBB->begin(), DL,
37495 TII->get(X86::PHI), DstReg)
37496 .addReg(mainDstReg).addMBB(mainMBB)
37497 .addReg(restoreDstReg).addMBB(restoreMBB);
37498
37499 // restoreMBB:
37500 if (RegInfo->hasBasePointer(*MF)) {
37501 const bool Uses64BitFramePtr =
37502 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
37503 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
37504 X86FI->setRestoreBasePointer(MF);
37505 Register FramePtr = RegInfo->getFrameRegister(*MF);
37506 Register BasePtr = RegInfo->getBaseRegister();
37507 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
37508 addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
37509 FramePtr, true, X86FI->getRestoreBasePointerOffset())
37510 .setMIFlag(MachineInstr::FrameSetup);
37511 }
37512 BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
37513 BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
37514 restoreMBB->addSuccessor(sinkMBB);
37515
37516 MI.eraseFromParent();
37517 return sinkMBB;
37518}
37519
37520/// Fix the shadow stack using the previously saved SSP pointer.
37521/// \sa emitSetJmpShadowStackFix
37522/// \param [in] MI The temporary Machine Instruction for the builtin.
37523/// \param [in] MBB The Machine Basic Block that will be modified.
37524/// \return The sink MBB that will perform the future indirect branch.
37525MachineBasicBlock *
37526X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
37527 MachineBasicBlock *MBB) const {
37528 const DebugLoc &DL = MI.getDebugLoc();
37529 MachineFunction *MF = MBB->getParent();
37530 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37531 MachineRegisterInfo &MRI = MF->getRegInfo();
37532
37533 // Memory Reference
37534 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
37535 MI.memoperands_end());
37536
37537 MVT PVT = getPointerTy(MF->getDataLayout());
37538 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
37539
37540 // checkSspMBB:
37541 // xor vreg1, vreg1
37542 // rdssp vreg1
37543 // test vreg1, vreg1
37544 // je sinkMBB # Jump if Shadow Stack is not supported
37545 // fallMBB:
37546 // mov buf+24/12(%rip), vreg2
37547 // sub vreg1, vreg2
37548 // jbe sinkMBB # No need to fix the Shadow Stack
37549 // fixShadowMBB:
37550 // shr 3/2, vreg2
37551 // incssp vreg2 # fix the SSP according to the lower 8 bits
37552 // shr 8, vreg2
37553 // je sinkMBB
37554 // fixShadowLoopPrepareMBB:
37555 // shl vreg2
37556 // mov 128, vreg3
37557 // fixShadowLoopMBB:
37558 // incssp vreg3
37559 // dec vreg2
37560 // jne fixShadowLoopMBB # Iterate until you finish fixing
37561 // # the Shadow Stack
37562 // sinkMBB:
37563
37564 MachineFunction::iterator I = ++MBB->getIterator();
37565 const BasicBlock *BB = MBB->getBasicBlock();
37566
37567 MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB);
37568 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
37569 MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB);
37570 MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB);
37571 MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB);
37572 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
37573 MF->insert(I, checkSspMBB);
37574 MF->insert(I, fallMBB);
37575 MF->insert(I, fixShadowMBB);
37576 MF->insert(I, fixShadowLoopPrepareMBB);
37577 MF->insert(I, fixShadowLoopMBB);
37578 MF->insert(I, sinkMBB);
37579
37580 // Transfer the remainder of BB and its successor edges to sinkMBB.
37581 sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI),
37582 MBB->end());
37583 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
37584
37585 MBB->addSuccessor(checkSspMBB);
37586
37587 // Initialize a register with zero.
37588 Register ZReg = MRI.createVirtualRegister(&X86::GR32RegClass);
37589 BuildMI(checkSspMBB, DL, TII->get(X86::MOV32r0), ZReg);
37590
37591 if (PVT == MVT::i64) {
37592 Register TmpZReg = MRI.createVirtualRegister(PtrRC);
37593 BuildMI(checkSspMBB, DL, TII->get(X86::SUBREG_TO_REG), TmpZReg)
37594 .addImm(0)
37595 .addReg(ZReg)
37596 .addImm(X86::sub_32bit);
37597 ZReg = TmpZReg;
37598 }
37599
37600 // Read the current SSP Register value to the zeroed register.
37601 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
37602 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
37603 BuildMI(checkSspMBB, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
37604
37605 // Check whether the result of the SSP register is zero and jump directly
37606 // to the sink.
37607 unsigned TestRROpc = (PVT == MVT::i64) ? X86::TEST64rr : X86::TEST32rr;
37608 BuildMI(checkSspMBB, DL, TII->get(TestRROpc))
37609 .addReg(SSPCopyReg)
37610 .addReg(SSPCopyReg);
37611 BuildMI(checkSspMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E);
37612 checkSspMBB->addSuccessor(sinkMBB);
37613 checkSspMBB->addSuccessor(fallMBB);
37614
37615 // Reload the previously saved SSP register value.
37616 Register PrevSSPReg = MRI.createVirtualRegister(PtrRC);
37617 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
37618 const int64_t SPPOffset = 3 * PVT.getStoreSize();
37619 MachineInstrBuilder MIB =
37620 BuildMI(fallMBB, DL, TII->get(PtrLoadOpc), PrevSSPReg);
37621 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37622 const MachineOperand &MO = MI.getOperand(i);
37623 if (i == X86::AddrDisp)
37624 MIB.addDisp(MO, SPPOffset);
37625 else if (MO.isReg()) // Don't add the whole operand, we don't want to
37626 // preserve kill flags.
37627 MIB.addReg(MO.getReg());
37628 else
37629 MIB.add(MO);
37630 }
37631 MIB.setMemRefs(MMOs);
37632
37633 // Subtract the current SSP from the previous SSP.
37634 Register SspSubReg = MRI.createVirtualRegister(PtrRC);
37635 unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr;
37636 BuildMI(fallMBB, DL, TII->get(SubRROpc), SspSubReg)
37637 .addReg(PrevSSPReg)
37638 .addReg(SSPCopyReg);
37639
37640 // Jump to sink in case PrevSSPReg <= SSPCopyReg.
37641 BuildMI(fallMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_BE);
37642 fallMBB->addSuccessor(sinkMBB);
37643 fallMBB->addSuccessor(fixShadowMBB);
37644
37645 // Shift right by 2/3 for 32/64 because incssp multiplies the argument by 4/8.
37646 unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri;
37647 unsigned Offset = (PVT == MVT::i64) ? 3 : 2;
37648 Register SspFirstShrReg = MRI.createVirtualRegister(PtrRC);
37649 BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspFirstShrReg)
37650 .addReg(SspSubReg)
37651 .addImm(Offset);
37652
37653 // Increase the SSP using only the lower 8 bits of the delta.
37654 unsigned IncsspOpc = (PVT == MVT::i64) ? X86::INCSSPQ : X86::INCSSPD;
37655 BuildMI(fixShadowMBB, DL, TII->get(IncsspOpc)).addReg(SspFirstShrReg);
37656
37657 // Reset the lower 8 bits.
37658 Register SspSecondShrReg = MRI.createVirtualRegister(PtrRC);
37659 BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspSecondShrReg)
37660 .addReg(SspFirstShrReg)
37661 .addImm(8);
37662
37663 // Jump if the result of the shift is zero.
37664 BuildMI(fixShadowMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E);
37665 fixShadowMBB->addSuccessor(sinkMBB);
37666 fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB);
37667
37668 // Do a single shift left.
37669 unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64r1 : X86::SHL32r1;
37670 Register SspAfterShlReg = MRI.createVirtualRegister(PtrRC);
37671 BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(ShlR1Opc), SspAfterShlReg)
37672 .addReg(SspSecondShrReg);
37673
37674 // Save the value 128 to a register (will be used next with incssp).
37675 Register Value128InReg = MRI.createVirtualRegister(PtrRC);
37676 unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri;
37677 BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(MovRIOpc), Value128InReg)
37678 .addImm(128);
37679 fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB);
37680
37681 // Since incssp only looks at the lower 8 bits, we might need to do several
37682 // iterations of incssp until we finish fixing the shadow stack.
37683 Register DecReg = MRI.createVirtualRegister(PtrRC);
37684 Register CounterReg = MRI.createVirtualRegister(PtrRC);
37685 BuildMI(fixShadowLoopMBB, DL, TII->get(X86::PHI), CounterReg)
37686 .addReg(SspAfterShlReg)
37687 .addMBB(fixShadowLoopPrepareMBB)
37688 .addReg(DecReg)
37689 .addMBB(fixShadowLoopMBB);
37690
37691 // Every iteration we increase the SSP by 128.
37692 BuildMI(fixShadowLoopMBB, DL, TII->get(IncsspOpc)).addReg(Value128InReg);
37693
37694 // Every iteration we decrement the counter by 1.
37695 unsigned DecROpc = (PVT == MVT::i64) ? X86::DEC64r : X86::DEC32r;
37696 BuildMI(fixShadowLoopMBB, DL, TII->get(DecROpc), DecReg).addReg(CounterReg);
37697
37698 // Jump if the counter is not zero yet.
37699 BuildMI(fixShadowLoopMBB, DL, TII->get(X86::JCC_1)).addMBB(fixShadowLoopMBB).addImm(X86::COND_NE);
37700 fixShadowLoopMBB->addSuccessor(sinkMBB);
37701 fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB);
37702
37703 return sinkMBB;
37704}
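// Editor's note: a hedged sketch (not part of X86ISelLowering.cpp) of the
// arithmetic the blocks above implement, shown for the 64-bit case where each
// shadow-stack entry is 8 bytes and INCSSP consumes only its low 8 bits:
//
//   uint64_t Delta   = PrevSSP - CurSSP;     // bytes of shadow stack to skip
//   uint64_t Entries = Delta >> 3;           // entries of 8 bytes each
//   incssp(Entries & 0xff);                  // first INCSSP: low 8 bits
//   uint64_t Rounds  = (Entries >> 8) << 1;  // remaining entries / 128
//   while (Rounds--)
//     incssp(128);                           // loop: 128 entries per round
//
// "incssp" stands in for the INCSSPQ instruction emitted above; the 32-bit
// path uses a shift of 2 and INCSSPD instead.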
37705
37706MachineBasicBlock *
37707X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
37708 MachineBasicBlock *MBB) const {
37709 const DebugLoc &DL = MI.getDebugLoc();
37710 MachineFunction *MF = MBB->getParent();
37711 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37712 MachineRegisterInfo &MRI = MF->getRegInfo();
37713
37714 // Memory Reference
37715 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
37716 MI.memoperands_end());
37717
37718 MVT PVT = getPointerTy(MF->getDataLayout());
37719 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
37720 "Invalid Pointer Size!");
37721
37722 const TargetRegisterClass *RC =
37723 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
37724 Register Tmp = MRI.createVirtualRegister(RC);
37725 // Since FP is only updated here but NOT referenced, it's treated as GPR.
37726 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
37727 Register FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
37728 Register SP = RegInfo->getStackRegister();
37729
37730 MachineInstrBuilder MIB;
37731
37732 const int64_t LabelOffset = 1 * PVT.getStoreSize();
37733 const int64_t SPOffset = 2 * PVT.getStoreSize();
37734
37735 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
37736 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
37737
37738 MachineBasicBlock *thisMBB = MBB;
37739
37740 // When CET and the shadow stack are enabled, we need to fix the Shadow Stack.
37741 if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
37742 thisMBB = emitLongJmpShadowStackFix(MI, thisMBB);
37743 }
37744
37745 // Reload FP
37746 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), FP);
37747 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37748 const MachineOperand &MO = MI.getOperand(i);
37749 if (MO.isReg()) // Don't add the whole operand, we don't want to
37750 // preserve kill flags.
37751 MIB.addReg(MO.getReg());
37752 else
37753 MIB.add(MO);
37754 }
37755 MIB.setMemRefs(MMOs);
37756
37757 // Reload IP
37758 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
37759 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37760 const MachineOperand &MO = MI.getOperand(i);
37761 if (i == X86::AddrDisp)
37762 MIB.addDisp(MO, LabelOffset);
37763 else if (MO.isReg()) // Don't add the whole operand, we don't want to
37764 // preserve kill flags.
37765 MIB.addReg(MO.getReg());
37766 else
37767 MIB.add(MO);
37768 }
37769 MIB.setMemRefs(MMOs);
37770
37771 // Reload SP
37772 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), SP);
37773 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37774 if (i == X86::AddrDisp)
37775 MIB.addDisp(MI.getOperand(i), SPOffset);
37776 else
37777 MIB.add(MI.getOperand(i)); // We can preserve the kill flags here, it's
37778 // the last instruction of the expansion.
37779 }
37780 MIB.setMemRefs(MMOs);
37781
37782 // Jump
37783 BuildMI(*thisMBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
37784
37785 MI.eraseFromParent();
37786 return thisMBB;
37787}
37788
37789void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
37790 MachineBasicBlock *MBB,
37791 MachineBasicBlock *DispatchBB,
37792 int FI) const {
37793 const DebugLoc &DL = MI.getDebugLoc();
37794 MachineFunction *MF = MBB->getParent();
37795 MachineRegisterInfo *MRI = &MF->getRegInfo();
37796 const X86InstrInfo *TII = Subtarget.getInstrInfo();
37797
37798 MVT PVT = getPointerTy(MF->getDataLayout());
37799 assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
37800
37801 unsigned Op = 0;
37802 unsigned VR = 0;
37803
37804 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
37805 !isPositionIndependent();
37806
37807 if (UseImmLabel) {
37808 Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
37809 } else {
37810 const TargetRegisterClass *TRC =
37811 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
37812 VR = MRI->createVirtualRegister(TRC);
37813 Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
37814
37815 if (Subtarget.is64Bit())
37816 BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
37817 .addReg(X86::RIP)
37818 .addImm(1)
37819 .addReg(0)
37820 .addMBB(DispatchBB)
37821 .addReg(0);
37822 else
37823 BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
37824 .addReg(0) /* TII->getGlobalBaseReg(MF) */
37825 .addImm(1)
37826 .addReg(0)
37827 .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
37828 .addReg(0);
37829 }
37830
37831 MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
37832 addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
37833 if (UseImmLabel)
37834 MIB.addMBB(DispatchBB);
37835 else
37836 MIB.addReg(VR);
37837}
37838
37839MachineBasicBlock *
37840X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
37841 MachineBasicBlock *BB) const {
37842 const DebugLoc &DL = MI.getDebugLoc();
37843 MachineFunction *MF = BB->getParent();
37844 MachineRegisterInfo *MRI = &MF->getRegInfo();
37845 const X86InstrInfo *TII = Subtarget.getInstrInfo();
37846 int FI = MF->getFrameInfo().getFunctionContextIndex();
37847
37848 // Get a mapping of the call site numbers to all of the landing pads they're
37849 // associated with.
37850 DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
37851 unsigned MaxCSNum = 0;
37852 for (auto &MBB : *MF) {
37853 if (!MBB.isEHPad())
37854 continue;
37855
37856 MCSymbol *Sym = nullptr;
37857 for (const auto &MI : MBB) {
37858 if (MI.isDebugInstr())
37859 continue;
37860
37861 assert(MI.isEHLabel() && "expected EH_LABEL");
37862 Sym = MI.getOperand(0).getMCSymbol();
37863 break;
37864 }
37865
37866 if (!MF->hasCallSiteLandingPad(Sym))
37867 continue;
37868
37869 for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
37870 CallSiteNumToLPad[CSI].push_back(&MBB);
37871 MaxCSNum = std::max(MaxCSNum, CSI);
37872 }
37873 }
37874
37875 // Get an ordered list of the machine basic blocks for the jump table.
37876 std::vector<MachineBasicBlock *> LPadList;
37877 SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
37878 LPadList.reserve(CallSiteNumToLPad.size());
37879
37880 for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
37881 for (auto &LP : CallSiteNumToLPad[CSI]) {
37882 LPadList.push_back(LP);
37883 InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
37884 }
37885 }
37886
37887 assert(!LPadList.empty() &&
37888 "No landing pad destinations for the dispatch jump table!");
37889
37890 // Create the MBBs for the dispatch code.
37891
37892 // Shove the dispatch's address into the return slot in the function context.
37893 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
37894 DispatchBB->setIsEHPad(true);
37895
37896 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
37897 BuildMI(TrapBB, DL, TII->get(X86::TRAP));
37898 DispatchBB->addSuccessor(TrapBB);
37899
37900 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
37901 DispatchBB->addSuccessor(DispContBB);
37902
37903 // Insert MBBs.
37904 MF->push_back(DispatchBB);
37905 MF->push_back(DispContBB);
37906 MF->push_back(TrapBB);
37907
37908 // Insert code into the entry block that creates and registers the function
37909 // context.
37910 SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
37911
37912 // Create the jump table and associated information
37913 unsigned JTE = getJumpTableEncoding();
37914 MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
37915 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
37916
37917 const X86RegisterInfo &RI = TII->getRegisterInfo();
37918 // Add a register mask with no preserved registers. This results in all
37919 // registers being marked as clobbered.
37920 if (RI.hasBasePointer(*MF)) {
37921 const bool FPIs64Bit =
37922 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
37923 X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
37924 MFI->setRestoreBasePointer(MF);
37925
37926 Register FP = RI.getFrameRegister(*MF);
37927 Register BP = RI.getBaseRegister();
37928 unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
37929 addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
37930 MFI->getRestoreBasePointerOffset())
37931 .addRegMask(RI.getNoPreservedMask());
37932 } else {
37933 BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
37934 .addRegMask(RI.getNoPreservedMask());
37935 }
37936
37937 // IReg is used as an index in a memory operand and therefore can't be SP
37938 Register IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
37939 addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
37940 Subtarget.is64Bit() ? 8 : 4);
37941 BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
37942 .addReg(IReg)
37943 .addImm(LPadList.size());
37944 BuildMI(DispatchBB, DL, TII->get(X86::JCC_1)).addMBB(TrapBB).addImm(X86::COND_AE);
37945
37946 if (Subtarget.is64Bit()) {
37947 Register BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
37948 Register IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
37949
37950 // leaq .LJTI0_0(%rip), BReg
37951 BuildMI(DispContBB, DL, TII->get(X86::LEA64r), BReg)
37952 .addReg(X86::RIP)
37953 .addImm(1)
37954 .addReg(0)
37955 .addJumpTableIndex(MJTI)
37956 .addReg(0);
37957 // movzx IReg64, IReg
37958 BuildMI(DispContBB, DL, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
37959 .addImm(0)
37960 .addReg(IReg)
37961 .addImm(X86::sub_32bit);
37962
37963 switch (JTE) {
37964 case MachineJumpTableInfo::EK_BlockAddress:
37965 // jmpq *(BReg,IReg64,8)
37966 BuildMI(DispContBB, DL, TII->get(X86::JMP64m))
37967 .addReg(BReg)
37968 .addImm(8)
37969 .addReg(IReg64)
37970 .addImm(0)
37971 .addReg(0);
37972 break;
37973 case MachineJumpTableInfo::EK_LabelDifference32: {
37974 Register OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
37975 Register OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
37976 Register TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
37977
37978 // movl (BReg,IReg64,4), OReg
37979 BuildMI(DispContBB, DL, TII->get(X86::MOV32rm), OReg)
37980 .addReg(BReg)
37981 .addImm(4)
37982 .addReg(IReg64)
37983 .addImm(0)
37984 .addReg(0);
37985 // movsx OReg64, OReg
37986 BuildMI(DispContBB, DL, TII->get(X86::MOVSX64rr32), OReg64).addReg(OReg);
37987 // addq BReg, OReg64, TReg
37988 BuildMI(DispContBB, DL, TII->get(X86::ADD64rr), TReg)
37989 .addReg(OReg64)
37990 .addReg(BReg);
37991 // jmpq *TReg
37992 BuildMI(DispContBB, DL, TII->get(X86::JMP64r)).addReg(TReg);
37993 break;
37994 }
37995 default:
37996 llvm_unreachable("Unexpected jump table encoding")::llvm::llvm_unreachable_internal("Unexpected jump table encoding"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 37996)
;
37997 }
37998 } else {
37999 // jmpl *.LJTI0_0(,IReg,4)
38000 BuildMI(DispContBB, DL, TII->get(X86::JMP32m))
38001 .addReg(0)
38002 .addImm(4)
38003 .addReg(IReg)
38004 .addJumpTableIndex(MJTI)
38005 .addReg(0);
38006 }
38007
38008 // Add the jump table entries as successors to the MBB.
38009 SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
38010 for (auto &LP : LPadList)
38011 if (SeenMBBs.insert(LP).second)
38012 DispContBB->addSuccessor(LP);
38013
38014 // N.B. the order the invoke BBs are processed in doesn't matter here.
38015 SmallVector<MachineBasicBlock *, 64> MBBLPads;
38016 const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
38017 for (MachineBasicBlock *MBB : InvokeBBs) {
38018 // Remove the landing pad successor from the invoke block and replace it
38019 // with the new dispatch block.
38020 // Keep a copy of Successors since it's modified inside the loop.
38021 SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
38022 MBB->succ_rend());
38023 // FIXME: Avoid quadratic complexity.
38024 for (auto *MBBS : Successors) {
38025 if (MBBS->isEHPad()) {
38026 MBB->removeSuccessor(MBBS);
38027 MBBLPads.push_back(MBBS);
38028 }
38029 }
38030
38031 MBB->addSuccessor(DispatchBB);
38032
38033 // Find the invoke call and mark all of the callee-saved registers as
38034 // 'implicit defined' so that they're spilled. This prevents code from
38035 // moving instructions to before the EH block, where they will never be
38036 // executed.
38037 for (auto &II : reverse(*MBB)) {
38038 if (!II.isCall())
38039 continue;
38040
38041 DenseMap<unsigned, bool> DefRegs;
38042 for (auto &MOp : II.operands())
38043 if (MOp.isReg())
38044 DefRegs[MOp.getReg()] = true;
38045
38046 MachineInstrBuilder MIB(*MF, &II);
38047 for (unsigned RegIdx = 0; SavedRegs[RegIdx]; ++RegIdx) {
38048 unsigned Reg = SavedRegs[RegIdx];
38049 if (!DefRegs[Reg])
38050 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
38051 }
38052
38053 break;
38054 }
38055 }
38056
38057 // Mark all former landing pads as non-landing pads. The dispatch is the only
38058 // landing pad now.
38059 for (auto &LP : MBBLPads)
38060 LP->setIsEHPad(false);
38061
38062 // The instruction is gone now.
38063 MI.eraseFromParent();
38064 return BB;
38065}
38066
38067MachineBasicBlock *
38068X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
38069 MachineBasicBlock *BB) const {
38070 MachineFunction *MF = BB->getParent();
38071 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
38072 const DebugLoc &DL = MI.getDebugLoc();
38073
38074 auto TMMImmToTMMReg = [](unsigned Imm) {
38075 assert (Imm < 8 && "Illegal tmm index");
38076 return X86::TMM0 + Imm;
38077 };
38078 switch (MI.getOpcode()) {
38079 default: llvm_unreachable("Unexpected instr type to insert");
38080 case X86::TLS_addr32:
38081 case X86::TLS_addr64:
38082 case X86::TLS_addrX32:
38083 case X86::TLS_base_addr32:
38084 case X86::TLS_base_addr64:
38085 case X86::TLS_base_addrX32:
38086 return EmitLoweredTLSAddr(MI, BB);
38087 case X86::INDIRECT_THUNK_CALL32:
38088 case X86::INDIRECT_THUNK_CALL64:
38089 case X86::INDIRECT_THUNK_TCRETURN32:
38090 case X86::INDIRECT_THUNK_TCRETURN64:
38091 return EmitLoweredIndirectThunk(MI, BB);
38092 case X86::CATCHRET:
38093 return EmitLoweredCatchRet(MI, BB);
38094 case X86::SEG_ALLOCA_32:
38095 case X86::SEG_ALLOCA_64:
38096 return EmitLoweredSegAlloca(MI, BB);
38097 case X86::PROBED_ALLOCA_32:
38098 case X86::PROBED_ALLOCA_64:
38099 return EmitLoweredProbedAlloca(MI, BB);
38100 case X86::TLSCall_32:
38101 case X86::TLSCall_64:
38102 return EmitLoweredTLSCall(MI, BB);
38103 case X86::CMOV_FR16:
38104 case X86::CMOV_FR16X:
38105 case X86::CMOV_FR32:
38106 case X86::CMOV_FR32X:
38107 case X86::CMOV_FR64:
38108 case X86::CMOV_FR64X:
38109 case X86::CMOV_GR8:
38110 case X86::CMOV_GR16:
38111 case X86::CMOV_GR32:
38112 case X86::CMOV_RFP32:
38113 case X86::CMOV_RFP64:
38114 case X86::CMOV_RFP80:
38115 case X86::CMOV_VR64:
38116 case X86::CMOV_VR128:
38117 case X86::CMOV_VR128X:
38118 case X86::CMOV_VR256:
38119 case X86::CMOV_VR256X:
38120 case X86::CMOV_VR512:
38121 case X86::CMOV_VK1:
38122 case X86::CMOV_VK2:
38123 case X86::CMOV_VK4:
38124 case X86::CMOV_VK8:
38125 case X86::CMOV_VK16:
38126 case X86::CMOV_VK32:
38127 case X86::CMOV_VK64:
38128 return EmitLoweredSelect(MI, BB);
38129
38130 case X86::FP80_ADDr:
38131 case X86::FP80_ADDm32: {
38132 // Change the floating point control register to use double extended
38133 // precision when performing the addition.
38134 int OrigCWFrameIdx =
38135 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
38136 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FNSTCW16m)),
38137 OrigCWFrameIdx);
38138
38139 // Load the old value of the control word...
38140 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
38141 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOVZX32rm16), OldCW),
38142 OrigCWFrameIdx);
38143
38144 // OR 0b11 into bits 8 and 9. 0b11 is the encoding for double extended
38145 // precision.
38146 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
38147 BuildMI(*BB, MI, DL, TII->get(X86::OR32ri), NewCW)
38148 .addReg(OldCW, RegState::Kill)
38149 .addImm(0x300);
38150
38151 // Extract to 16 bits.
38152 Register NewCW16 =
38153 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
38154 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), NewCW16)
38155 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
38156
38157 // Prepare memory for FLDCW.
38158 int NewCWFrameIdx =
38159 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
38160 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)),
38161 NewCWFrameIdx)
38162 .addReg(NewCW16, RegState::Kill);
38163
38164 // Reload the modified control word now...
38165 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FLDCW16m)),
38166 NewCWFrameIdx);
38167
38168 // Do the addition.
38169 if (MI.getOpcode() == X86::FP80_ADDr) {
38170 BuildMI(*BB, MI, DL, TII->get(X86::ADD_Fp80))
38171 .add(MI.getOperand(0))
38172 .add(MI.getOperand(1))
38173 .add(MI.getOperand(2));
38174 } else {
38175 BuildMI(*BB, MI, DL, TII->get(X86::ADD_Fp80m32))
38176 .add(MI.getOperand(0))
38177 .add(MI.getOperand(1))
38178 .add(MI.getOperand(2))
38179 .add(MI.getOperand(3))
38180 .add(MI.getOperand(4))
38181 .add(MI.getOperand(5))
38182 .add(MI.getOperand(6));
38183 }
38184
38185 // Reload the original control word now.
38186 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FLDCW16m)),
38187 OrigCWFrameIdx);
38188
38189 MI.eraseFromParent(); // The pseudo instruction is gone now.
38190 return BB;
38191 }
38192
38193 case X86::FP32_TO_INT16_IN_MEM:
38194 case X86::FP32_TO_INT32_IN_MEM:
38195 case X86::FP32_TO_INT64_IN_MEM:
38196 case X86::FP64_TO_INT16_IN_MEM:
38197 case X86::FP64_TO_INT32_IN_MEM:
38198 case X86::FP64_TO_INT64_IN_MEM:
38199 case X86::FP80_TO_INT16_IN_MEM:
38200 case X86::FP80_TO_INT32_IN_MEM:
38201 case X86::FP80_TO_INT64_IN_MEM: {
38202 // Change the floating point control register to use "round towards zero"
38203 // mode when truncating to an integer value.
38204 int OrigCWFrameIdx =
38205 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
38206 addFrameReference(BuildMI(*BB, MI, DL,
38207 TII->get(X86::FNSTCW16m)), OrigCWFrameIdx);
38208
38209 // Load the old value of the control word...
38210 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
38211 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOVZX32rm16), OldCW),
38212 OrigCWFrameIdx);
38213
38214 // OR 0b11 into bits 10 and 11. 0b11 is the encoding for round toward zero.
38215 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
38216 BuildMI(*BB, MI, DL, TII->get(X86::OR32ri), NewCW)
38217 .addReg(OldCW, RegState::Kill).addImm(0xC00);
38218
38219 // Extract to 16 bits.
38220 Register NewCW16 =
38221 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
38222 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), NewCW16)
38223 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
38224
38225 // Prepare memory for FLDCW.
38226 int NewCWFrameIdx =
38227 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
38228 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)),
38229 NewCWFrameIdx)
38230 .addReg(NewCW16, RegState::Kill);
38231
38232 // Reload the modified control word now...
38233 addFrameReference(BuildMI(*BB, MI, DL,
38234 TII->get(X86::FLDCW16m)), NewCWFrameIdx);
38235
38236 // Get the X86 opcode to use.
38237 unsigned Opc;
38238 switch (MI.getOpcode()) {
38239 default: llvm_unreachable("illegal opcode!");
38240 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
38241 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
38242 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
38243 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
38244 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
38245 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
38246 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
38247 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
38248 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
38249 }
38250
38251 X86AddressMode AM = getAddressFromInstr(&MI, 0);
38252 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
38253 .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
38254
38255 // Reload the original control word now.
38256 addFrameReference(BuildMI(*BB, MI, DL,
38257 TII->get(X86::FLDCW16m)), OrigCWFrameIdx);
38258
38259 MI.eraseFromParent(); // The pseudo instruction is gone now.
38260 return BB;
38261 }
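// Editor's note (hedged, not part of X86ISelLowering.cpp): both control-word
// rewrites above operate on the x87 FPU control word saved by FNSTCW:
//
//   unsigned short CW;   // as stored to the stack slot by FNSTCW
//   CW |= 0x0300;        // bits 8-9  = 11b: 64-bit (double extended) precision,
//                        //             used by the FP80_ADD expansion
//   CW |= 0x0C00;        // bits 10-11 = 11b: round toward zero (truncate),
//                        //             used by the FP*_TO_INT*_IN_MEM expansion
//
// The modified word is reloaded with FLDCW and the original word is restored
// afterwards, as both cases above do.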
38262
38263 // xbegin
38264 case X86::XBEGIN:
38265 return emitXBegin(MI, BB, Subtarget.getInstrInfo());
38266
38267 case X86::VAARG_64:
38268 case X86::VAARG_X32:
38269 return EmitVAARGWithCustomInserter(MI, BB);
38270
38271 case X86::EH_SjLj_SetJmp32:
38272 case X86::EH_SjLj_SetJmp64:
38273 return emitEHSjLjSetJmp(MI, BB);
38274
38275 case X86::EH_SjLj_LongJmp32:
38276 case X86::EH_SjLj_LongJmp64:
38277 return emitEHSjLjLongJmp(MI, BB);
38278
38279 case X86::Int_eh_sjlj_setup_dispatch:
38280 return EmitSjLjDispatchBlock(MI, BB);
38281
38282 case TargetOpcode::STATEPOINT:
38283 // As an implementation detail, STATEPOINT shares the STACKMAP format at
38284 // this point in the process. We diverge later.
38285 return emitPatchPoint(MI, BB);
38286
38287 case TargetOpcode::STACKMAP:
38288 case TargetOpcode::PATCHPOINT:
38289 return emitPatchPoint(MI, BB);
38290
38291 case TargetOpcode::PATCHABLE_EVENT_CALL:
38292 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
38293 return BB;
38294
38295 case X86::LCMPXCHG8B: {
38296 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
38297 // In addition to the 4 E[ABCD] registers implied by the encoding, CMPXCHG8B
38298 // requires a memory operand. If the current architecture happens to be
38299 // i686 and the current function needs a base pointer
38300 // - which is ESI for i686 - the register allocator would not be able to
38301 // allocate registers for an address of the form X(%reg, %reg, Y):
38302 // there would never be enough unreserved registers during regalloc
38303 // (without the need for a base ptr the only option would be X(%edi, %esi, Y)).
38304 // We give the register allocator a hand by precomputing the address in
38305 // a new vreg using LEA.
38306
38307 // If it is not i686 or there is no base pointer - nothing to do here.
38308 if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
38309 return BB;
38310
38311 // Even though this code does not necessarily need the base pointer to
38312 // be ESI, we check for that. The reason: if this assert fails, something
38313 // has changed in the compiler's base pointer handling, which most
38314 // probably has to be addressed somehow here.
38315 assert(TRI->getBaseRegister() == X86::ESI &&(static_cast <bool> (TRI->getBaseRegister() == X86::
ESI && "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
"base pointer in mind") ? void (0) : __assert_fail ("TRI->getBaseRegister() == X86::ESI && \"LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a \" \"base pointer in mind\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 38317, __extension__
__PRETTY_FUNCTION__))
38316 "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "(static_cast <bool> (TRI->getBaseRegister() == X86::
ESI && "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
"base pointer in mind") ? void (0) : __assert_fail ("TRI->getBaseRegister() == X86::ESI && \"LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a \" \"base pointer in mind\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 38317, __extension__
__PRETTY_FUNCTION__))
38317 "base pointer in mind")(static_cast <bool> (TRI->getBaseRegister() == X86::
ESI && "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
"base pointer in mind") ? void (0) : __assert_fail ("TRI->getBaseRegister() == X86::ESI && \"LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a \" \"base pointer in mind\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 38317, __extension__
__PRETTY_FUNCTION__))
;
38318
38319 MachineRegisterInfo &MRI = MF->getRegInfo();
38320 MVT SPTy = getPointerTy(MF->getDataLayout());
38321 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
38322 Register computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
38323
38324 X86AddressMode AM = getAddressFromInstr(&MI, 0);
38325 // Regalloc does not need any help when the memory operand of CMPXCHG8B
38326 // does not use index register.
38327 if (AM.IndexReg == X86::NoRegister)
38328 return BB;
38329
38330 // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
38331 // four operand definitions that are E[ABCD] registers. We skip them and
38332 // then insert the LEA.
38333 MachineBasicBlock::reverse_iterator RMBBI(MI.getReverseIterator());
38334 while (RMBBI != BB->rend() && (RMBBI->definesRegister(X86::EAX) ||
38335 RMBBI->definesRegister(X86::EBX) ||
38336 RMBBI->definesRegister(X86::ECX) ||
38337 RMBBI->definesRegister(X86::EDX))) {
38338 ++RMBBI;
38339 }
38340 MachineBasicBlock::iterator MBBI(RMBBI);
38341 addFullAddress(
38342 BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);
38343
38344 setDirectAddressInInstr(&MI, 0, computedAddrVReg);
38345
38346 return BB;
38347 }
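// Editor's note: a hedged sketch (not part of X86ISelLowering.cpp) of the
// address precomputation above. A CMPXCHG8B whose memory operand needs both
// the reserved base pointer and an index register, e.g.
//
//   cmpxchg8b X(%esi,%ecx,4)    // %esi is the reserved base pointer on i686
//
// is rewritten so the address is formed first in a fresh virtual register,
// leaving regalloc only one register to find for the memory operand:
//
//   leal X(%esi,%ecx,4), %vreg
//   cmpxchg8b (%vreg)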
38348 case X86::LCMPXCHG16B_NO_RBX: {
38349 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
38350 Register BasePtr = TRI->getBaseRegister();
38351 if (TRI->hasBasePointer(*MF) &&
38352 (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
38353 if (!BB->isLiveIn(BasePtr))
38354 BB->addLiveIn(BasePtr);
38355 // Save RBX into a virtual register.
38356 Register SaveRBX =
38357 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38358 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), SaveRBX)
38359 .addReg(X86::RBX);
38360 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38361 MachineInstrBuilder MIB =
38362 BuildMI(*BB, MI, DL, TII->get(X86::LCMPXCHG16B_SAVE_RBX), Dst);
38363 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
38364 MIB.add(MI.getOperand(Idx));
38365 MIB.add(MI.getOperand(X86::AddrNumOperands));
38366 MIB.addReg(SaveRBX);
38367 } else {
38368 // Simple case, just copy the virtual register to RBX.
38369 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::RBX)
38370 .add(MI.getOperand(X86::AddrNumOperands));
38371 MachineInstrBuilder MIB =
38372 BuildMI(*BB, MI, DL, TII->get(X86::LCMPXCHG16B));
38373 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
38374 MIB.add(MI.getOperand(Idx));
38375 }
38376 MI.eraseFromParent();
38377 return BB;
38378 }
38379 case X86::MWAITX: {
38380 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
38381 Register BasePtr = TRI->getBaseRegister();
38382 bool IsRBX = (BasePtr == X86::RBX || BasePtr == X86::EBX);
38383 // If there is no need to save the base pointer, we generate MWAITXrrr;
38384 // otherwise we generate the pseudo MWAITX_SAVE_RBX.
38385 if (!IsRBX || !TRI->hasBasePointer(*MF)) {
38386 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::ECX)
38387 .addReg(MI.getOperand(0).getReg());
38388 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EAX)
38389 .addReg(MI.getOperand(1).getReg());
38390 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EBX)
38391 .addReg(MI.getOperand(2).getReg());
38392 BuildMI(*BB, MI, DL, TII->get(X86::MWAITXrrr));
38393 MI.eraseFromParent();
38394 } else {
38395 if (!BB->isLiveIn(BasePtr)) {
38396 BB->addLiveIn(BasePtr);
38397 }
38398 // Parameters can be copied into ECX and EAX but not EBX yet.
38399 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::ECX)
38400 .addReg(MI.getOperand(0).getReg());
38401 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EAX)
38402 .addReg(MI.getOperand(1).getReg());
38403 assert(Subtarget.is64Bit() && "Expected 64-bit mode!");
38404 // Save RBX into a virtual register.
38405 Register SaveRBX =
38406 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38407 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), SaveRBX)
38408 .addReg(X86::RBX);
38409 // Generate mwaitx pseudo.
38410 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38411 BuildMI(*BB, MI, DL, TII->get(X86::MWAITX_SAVE_RBX))
38412 .addDef(Dst) // Destination tied in with SaveRBX.
38413 .addReg(MI.getOperand(2).getReg()) // input value of EBX.
38414 .addUse(SaveRBX); // Save of base pointer.
38415 MI.eraseFromParent();
38416 }
38417 return BB;
38418 }
38419 case TargetOpcode::PREALLOCATED_SETUP: {
38420 assert(Subtarget.is32Bit() && "preallocated only used in 32-bit");
38421 auto MFI = MF->getInfo<X86MachineFunctionInfo>();
38422 MFI->setHasPreallocatedCall(true);
38423 int64_t PreallocatedId = MI.getOperand(0).getImm();
38424 size_t StackAdjustment = MFI->getPreallocatedStackSize(PreallocatedId);
38425 assert(StackAdjustment != 0 && "0 stack adjustment");
38426 LLVM_DEBUG(dbgs() << "PREALLOCATED_SETUP stack adjustment "
38427 << StackAdjustment << "\n");
38428 BuildMI(*BB, MI, DL, TII->get(X86::SUB32ri), X86::ESP)
38429 .addReg(X86::ESP)
38430 .addImm(StackAdjustment);
38431 MI.eraseFromParent();
38432 return BB;
38433 }
38434 case TargetOpcode::PREALLOCATED_ARG: {
38435 assert(Subtarget.is32Bit() && "preallocated calls only used in 32-bit");
38436 int64_t PreallocatedId = MI.getOperand(1).getImm();
38437 int64_t ArgIdx = MI.getOperand(2).getImm();
38438 auto MFI = MF->getInfo<X86MachineFunctionInfo>();
38439 size_t ArgOffset = MFI->getPreallocatedArgOffsets(PreallocatedId)[ArgIdx];
38440 LLVM_DEBUG(dbgs() << "PREALLOCATED_ARG arg index " << ArgIdx
38441 << ", arg offset " << ArgOffset << "\n");
38442 // stack pointer + offset
38443 addRegOffset(
38444 BuildMI(*BB, MI, DL, TII->get(X86::LEA32r), MI.getOperand(0).getReg()),
38445 X86::ESP, false, ArgOffset);
38446 MI.eraseFromParent();
38447 return BB;
38448 }
38449 case X86::PTDPBSSD:
38450 case X86::PTDPBSUD:
38451 case X86::PTDPBUSD:
38452 case X86::PTDPBUUD:
38453 case X86::PTDPBF16PS:
38454 case X86::PTDPFP16PS: {
38455 unsigned Opc;
38456 switch (MI.getOpcode()) {
38457 default: llvm_unreachable("illegal opcode!");
38458 case X86::PTDPBSSD: Opc = X86::TDPBSSD; break;
38459 case X86::PTDPBSUD: Opc = X86::TDPBSUD; break;
38460 case X86::PTDPBUSD: Opc = X86::TDPBUSD; break;
38461 case X86::PTDPBUUD: Opc = X86::TDPBUUD; break;
38462 case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break;
38463 case X86::PTDPFP16PS: Opc = X86::TDPFP16PS; break;
38464 }
38465
38466 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38467 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
38468 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
38469 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
38470 MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
38471
38472 MI.eraseFromParent(); // The pseudo is gone now.
38473 return BB;
38474 }
38475 case X86::PTILEZERO: {
38476 unsigned Imm = MI.getOperand(0).getImm();
38477 BuildMI(*BB, MI, DL, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm));
38478 MI.eraseFromParent(); // The pseudo is gone now.
38479 return BB;
38480 }
38481 case X86::PTILELOADD:
38482 case X86::PTILELOADDT1:
38483 case X86::PTILESTORED: {
38484 unsigned Opc;
38485 switch (MI.getOpcode()) {
38486 default: llvm_unreachable("illegal opcode!");
38487 case X86::PTILELOADD: Opc = X86::TILELOADD; break;
38488 case X86::PTILELOADDT1: Opc = X86::TILELOADDT1; break;
38489 case X86::PTILESTORED: Opc = X86::TILESTORED; break;
38490 }
38491
38492 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38493 unsigned CurOp = 0;
38494 if (Opc != X86::TILESTORED)
38495 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
38496 RegState::Define);
38497
38498 MIB.add(MI.getOperand(CurOp++)); // base
38499 MIB.add(MI.getOperand(CurOp++)); // scale
38500 MIB.add(MI.getOperand(CurOp++)); // index -- stride
38501 MIB.add(MI.getOperand(CurOp++)); // displacement
38502 MIB.add(MI.getOperand(CurOp++)); // segment
38503
38504 if (Opc == X86::TILESTORED)
38505 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
38506 RegState::Undef);
38507
38508 MI.eraseFromParent(); // The pseudo is gone now.
38509 return BB;
38510 }
38511 case X86::PTCMMIMFP16PS:
38512 case X86::PTCMMRLFP16PS: {
38513 const DebugLoc &DL = MI.getDebugLoc();
38514 unsigned Opc;
38515 switch (MI.getOpcode()) {
38516 default: llvm_unreachable("Unexpected instruction!");
38517 case X86::PTCMMIMFP16PS: Opc = X86::TCMMIMFP16PS; break;
38518 case X86::PTCMMRLFP16PS: Opc = X86::TCMMRLFP16PS; break;
38519 }
38520 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38521 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
38522 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
38523 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
38524 MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
38525 MI.eraseFromParent(); // The pseudo is gone now.
38526 return BB;
38527 }
38528 }
38529}
38530
38531//===----------------------------------------------------------------------===//
38532// X86 Optimization Hooks
38533//===----------------------------------------------------------------------===//
38534
38535bool
38536X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
38537 const APInt &DemandedBits,
38538 const APInt &DemandedElts,
38539 TargetLoweringOpt &TLO) const {
38540 EVT VT = Op.getValueType();
38541 unsigned Opcode = Op.getOpcode();
38542 unsigned EltSize = VT.getScalarSizeInBits();
38543
38544 if (VT.isVector()) {
38545 // If the constant is only all signbits in the active bits, then we should
38546 // extend it to the entire constant to allow it to act as a boolean constant
38547 // vector.
38548 auto NeedsSignExtension = [&](SDValue V, unsigned ActiveBits) {
38549 if (!ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
38550 return false;
38551 for (unsigned i = 0, e = V.getNumOperands(); i != e; ++i) {
38552 if (!DemandedElts[i] || V.getOperand(i).isUndef())
38553 continue;
38554 const APInt &Val = V.getConstantOperandAPInt(i);
38555 if (Val.getBitWidth() > Val.getNumSignBits() &&
38556 Val.trunc(ActiveBits).getNumSignBits() == ActiveBits)
38557 return true;
38558 }
38559 return false;
38560 };
38561 // For vectors - if we have a constant, then try to sign extend.
38562 // TODO: Handle AND/ANDN cases.
38563 unsigned ActiveBits = DemandedBits.getActiveBits();
38564 if (EltSize > ActiveBits && EltSize > 1 && isTypeLegal(VT) &&
38565 (Opcode == ISD::OR || Opcode == ISD::XOR) &&
38566 NeedsSignExtension(Op.getOperand(1), ActiveBits)) {
38567 EVT ExtSVT = EVT::getIntegerVT(*TLO.DAG.getContext(), ActiveBits);
38568 EVT ExtVT = EVT::getVectorVT(*TLO.DAG.getContext(), ExtSVT,
38569 VT.getVectorNumElements());
38570 SDValue NewC =
38571 TLO.DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(Op), VT,
38572 Op.getOperand(1), TLO.DAG.getValueType(ExtVT));
38573 SDValue NewOp =
38574 TLO.DAG.getNode(Opcode, SDLoc(Op), VT, Op.getOperand(0), NewC);
38575 return TLO.CombineTo(Op, NewOp);
38576 }
38577 return false;
38578 }
38579
38580 // Only optimize Ands to prevent shrinking a constant that could be
38581 // matched by movzx.
38582 if (Opcode != ISD::AND)
38583 return false;
38584
38585 // Make sure the RHS really is a constant.
38586 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
38587 if (!C)
38588 return false;
38589
38590 const APInt &Mask = C->getAPIntValue();
38591
38592 // Clear all non-demanded bits initially.
38593 APInt ShrunkMask = Mask & DemandedBits;
38594
38595 // Find the width of the shrunk mask.
38596 unsigned Width = ShrunkMask.getActiveBits();
38597
38598 // If the mask is all 0s there's nothing to do here.
38599 if (Width == 0)
38600 return false;
38601
38602 // Find the next power of 2 width, rounding up to a byte.
38603 Width = llvm::bit_ceil(std::max(Width, 8U));
38604 // Truncate the width to size to handle illegal types.
38605 Width = std::min(Width, EltSize);
38606
38607 // Calculate a possible zero extend mask for this constant.
38608 APInt ZeroExtendMask = APInt::getLowBitsSet(EltSize, Width);
38609
38610 // If we aren't changing the mask, just return true to keep it and prevent
38611 // the caller from optimizing.
38612 if (ZeroExtendMask == Mask)
38613 return true;
38614
38615 // Make sure the new mask can be represented by a combination of mask bits
38616 // and non-demanded bits.
38617 if (!ZeroExtendMask.isSubsetOf(Mask | ~DemandedBits))
38618 return false;
38619
38620 // Replace the constant with the zero extend mask.
38621 SDLoc DL(Op);
38622 SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT);
38623 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
38624 return TLO.CombineTo(Op, NewOp);
38625}
38626
38627void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
38628 KnownBits &Known,
38629 const APInt &DemandedElts,
38630 const SelectionDAG &DAG,
38631 unsigned Depth) const {
38632 unsigned BitWidth = Known.getBitWidth();
38633 unsigned NumElts = DemandedElts.getBitWidth();
38634 unsigned Opc = Op.getOpcode();
38635 EVT VT = Op.getValueType();
38636 assert((Opc >= ISD::BUILTIN_OP_END ||
38637 Opc == ISD::INTRINSIC_WO_CHAIN ||
38638 Opc == ISD::INTRINSIC_W_CHAIN ||
38639 Opc == ISD::INTRINSIC_VOID) &&
38640 "Should use MaskedValueIsZero if you don't know whether Op"
38641 " is a target node!");
38642
38643 Known.resetAll();
38644 switch (Opc) {
38645 default: break;
38646 case X86ISD::MUL_IMM: {
38647 KnownBits Known2;
38648 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38649 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38650 Known = KnownBits::mul(Known, Known2);
38651 break;
38652 }
38653 case X86ISD::SETCC:
38654 Known.Zero.setBitsFrom(1);
38655 break;
38656 case X86ISD::MOVMSK: {
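// MOVMSK packs one sign bit per source element into the low bits of the
// scalar result, e.g. a v4f32 source sets at most bits [3:0].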
38657 unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
38658 Known.Zero.setBitsFrom(NumLoBits);
38659 break;
38660 }
38661 case X86ISD::PEXTRB:
38662 case X86ISD::PEXTRW: {
38663 SDValue Src = Op.getOperand(0);
38664 EVT SrcVT = Src.getValueType();
38665 APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
38666 Op.getConstantOperandVal(1));
38667 Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1);
38668 Known = Known.anyextOrTrunc(BitWidth);
38669 Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
38670 break;
38671 }
38672 case X86ISD::VSRAI:
38673 case X86ISD::VSHLI:
38674 case X86ISD::VSRLI: {
38675 unsigned ShAmt = Op.getConstantOperandVal(1);
38676 if (ShAmt >= VT.getScalarSizeInBits()) {
38677 // Out of range logical bit shifts are guaranteed to be zero.
38678 // Out of range arithmetic bit shifts splat the sign bit.
38679 if (Opc != X86ISD::VSRAI) {
38680 Known.setAllZero();
38681 break;
38682 }
38683
38684 ShAmt = VT.getScalarSizeInBits() - 1;
38685 }
38686
38687 Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38688 if (Opc == X86ISD::VSHLI) {
38689 Known.Zero <<= ShAmt;
38690 Known.One <<= ShAmt;
38691 // Low bits are known zero.
38692 Known.Zero.setLowBits(ShAmt);
38693 } else if (Opc == X86ISD::VSRLI) {
38694 Known.Zero.lshrInPlace(ShAmt);
38695 Known.One.lshrInPlace(ShAmt);
38696 // High bits are known zero.
38697 Known.Zero.setHighBits(ShAmt);
38698 } else {
38699 Known.Zero.ashrInPlace(ShAmt);
38700 Known.One.ashrInPlace(ShAmt);
38701 }
38702 break;
38703 }
38704 case X86ISD::PACKUS: {
38705 // PACKUS is just a truncation if the upper half is zero.
38706 APInt DemandedLHS, DemandedRHS;
38707 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
38708
38709 Known.One = APInt::getAllOnes(BitWidth * 2);
38710 Known.Zero = APInt::getAllOnes(BitWidth * 2);
38711
38712 KnownBits Known2;
38713 if (!!DemandedLHS) {
38714 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedLHS, Depth + 1);
38715 Known = KnownBits::commonBits(Known, Known2);
38716 }
38717 if (!!DemandedRHS) {
38718 Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedRHS, Depth + 1);
38719 Known = KnownBits::commonBits(Known, Known2);
38720 }
38721
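// PACKUS saturates rather than truncates when the upper half is nonzero, so
// the result is only usable if the top BitWidth bits are known zero.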
38722 if (Known.countMinLeadingZeros() < BitWidth)
38723 Known.resetAll();
38724 Known = Known.trunc(BitWidth);
38725 break;
38726 }
38727 case X86ISD::VBROADCAST: {
38728 SDValue Src = Op.getOperand(0);
38729 if (!Src.getSimpleValueType().isVector()) {
38730 Known = DAG.computeKnownBits(Src, Depth + 1);
38731 return;
38732 }
38733 break;
38734 }
38735 case X86ISD::AND: {
38736 if (Op.getResNo() == 0) {
38737 KnownBits Known2;
38738 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38739 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38740 Known &= Known2;
38741 }
38742 break;
38743 }
38744 case X86ISD::ANDNP: {
38745 KnownBits Known2;
38746 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38747 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38748
38749 // ANDNP = (~X & Y);
38750 Known.One &= Known2.Zero;
38751 Known.Zero |= Known2.One;
38752 break;
38753 }
38754 case X86ISD::FOR: {
38755 KnownBits Known2;
38756 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38757 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38758
38759 Known |= Known2;
38760 break;
38761 }
38762 case X86ISD::PSADBW: {
38763 assert(VT.getScalarType() == MVT::i64 &&
38764 Op.getOperand(0).getValueType().getScalarType() == MVT::i8 &&
38765 "Unexpected PSADBW types");
38766
38767 // PSADBW - fills low 16 bits and zeros upper 48 bits of each i64 result.
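// Each i64 lane is the sum of 8 absolute byte differences, so its maximum
// value is 8 * 255 = 2040, which fits comfortably in the low 16 bits.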
38768 Known.Zero.setBitsFrom(16);
38769 break;
38770 }
38771 case X86ISD::PCMPGT:
38772 case X86ISD::PCMPEQ: {
38773 KnownBits KnownLhs =
38774 DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38775 KnownBits KnownRhs =
38776 DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38777 std::optional<bool> Res = Opc == X86ISD::PCMPEQ
38778 ? KnownBits::eq(KnownLhs, KnownRhs)
38779 : KnownBits::sgt(KnownLhs, KnownRhs);
38780 if (Res) {
38781 if (*Res)
38782 Known.setAllOnes();
38783 else
38784 Known.setAllZero();
38785 }
38786 break;
38787 }
38788 case X86ISD::PMULUDQ: {
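// PMULUDQ multiplies the low 32 bits of each 64-bit lane as unsigned values,
// so model both operands as 32-bit values zero-extended back to 64 bits.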
38789 KnownBits Known2;
38790 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38791 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38792
38793 Known = Known.trunc(BitWidth / 2).zext(BitWidth);
38794 Known2 = Known2.trunc(BitWidth / 2).zext(BitWidth);
38795 Known = KnownBits::mul(Known, Known2);
38796 break;
38797 }
38798 case X86ISD::CMOV: {
38799 Known = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
38800 // If we don't know any bits, early out.
38801 if (Known.isUnknown())
38802 break;
38803 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
38804
38805 // Only known if known in both the LHS and RHS.
38806 Known = KnownBits::commonBits(Known, Known2);
38807 break;
38808 }
38809 case X86ISD::BEXTR:
38810 case X86ISD::BEXTRI: {
38811 SDValue Op0 = Op.getOperand(0);
38812 SDValue Op1 = Op.getOperand(1);
38813
38814 if (auto* Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
38815 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
38816 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
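// e.g. a control value of 0x0804 encodes Shift = 4 and Length = 8, so the
// result is bits [11:4] of the source, zero-extended to the full width.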
38817
38818 // If the length is 0, the result is 0.
38819 if (Length == 0) {
38820 Known.setAllZero();
38821 break;
38822 }
38823
38824 if ((Shift + Length) <= BitWidth) {
38825 Known = DAG.computeKnownBits(Op0, Depth + 1);
38826 Known = Known.extractBits(Length, Shift);
38827 Known = Known.zextOrTrunc(BitWidth);
38828 }
38829 }
38830 break;
38831 }
38832 case X86ISD::PDEP: {
38833 KnownBits Known2;
38834 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38835 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38836 // Zeros are retained from the mask operand. But not ones.
38837 Known.One.clearAllBits();
38838 // The result will have at least as many trailing zeros as the non-mask
38839 // operand since bits can only map to the same or higher bit position.
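// e.g. if the source ends in two known-zero bits, the two lowest set bits of
// the mask can only receive zeros, so the result also has at least two
// trailing zeros.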
38840 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
38841 break;
38842 }
38843 case X86ISD::PEXT: {
38844 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38845 // The result has at least as many leading zeros as the mask has zero bits.
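// e.g. for a 32-bit PEXT whose mask has 27 known-zero bits, at most 5 bits
// can be gathered into the low end, so the upper 27 result bits are zero.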
38846 unsigned Count = Known.Zero.popcount();
38847 Known.Zero = APInt::getHighBitsSet(BitWidth, Count);
38848 Known.One.clearAllBits();
38849 break;
38850 }
38851 case X86ISD::VTRUNC:
38852 case X86ISD::VTRUNCS:
38853 case X86ISD::VTRUNCUS:
38854 case X86ISD::CVTSI2P:
38855 case X86ISD::CVTUI2P:
38856 case X86ISD::CVTP2SI:
38857 case X86ISD::CVTP2UI:
38858 case X86ISD::MCVTP2SI:
38859 case X86ISD::MCVTP2UI:
38860 case X86ISD::CVTTP2SI:
38861 case X86ISD::CVTTP2UI:
38862 case X86ISD::MCVTTP2SI:
38863 case X86ISD::MCVTTP2UI:
38864 case X86ISD::MCVTSI2P:
38865 case X86ISD::MCVTUI2P:
38866 case X86ISD::VFPROUND:
38867 case X86ISD::VMFPROUND:
38868 case X86ISD::CVTPS2PH:
38869 case X86ISD::MCVTPS2PH: {
38870 // Truncations/Conversions - upper elements are known zero.
38871 EVT SrcVT = Op.getOperand(0).getValueType();
38872 if (SrcVT.isVector()) {
38873 unsigned NumSrcElts = SrcVT.getVectorNumElements();
38874 if (NumElts > NumSrcElts && DemandedElts.countr_zero() >= NumSrcElts)
38875 Known.setAllZero();
38876 }
38877 break;
38878 }
38879 case X86ISD::STRICT_CVTTP2SI:
38880 case X86ISD::STRICT_CVTTP2UI:
38881 case X86ISD::STRICT_CVTSI2P:
38882 case X86ISD::STRICT_CVTUI2P:
38883 case X86ISD::STRICT_VFPROUND:
38884 case X86ISD::STRICT_CVTPS2PH: {
38885 // Strict Conversions - upper elements are known zero.
38886 EVT SrcVT = Op.getOperand(1).getValueType();
38887 if (SrcVT.isVector()) {
38888 unsigned NumSrcElts = SrcVT.getVectorNumElements();
38889 if (NumElts > NumSrcElts && DemandedElts.countr_zero() >= NumSrcElts)
38890 Known.setAllZero();
38891 }
38892 break;
38893 }
38894 case X86ISD::MOVQ2DQ: {
38895 // Move from MMX to XMM. Upper half of XMM should be 0.
38896 if (DemandedElts.countr_zero() >= (NumElts / 2))
38897 Known.setAllZero();
38898 break;
38899 }
38900 case X86ISD::VBROADCAST_LOAD: {
38901 APInt UndefElts;
38902 SmallVector<APInt, 16> EltBits;
38903 if (getTargetConstantBitsFromNode(Op, BitWidth, UndefElts, EltBits,
38904 /*AllowWholeUndefs*/ false,
38905 /*AllowPartialUndefs*/ false)) {
38906 Known.Zero.setAllBits();
38907 Known.One.setAllBits();
38908 for (unsigned I = 0; I != NumElts; ++I) {
38909 if (!DemandedElts[I])
38910 continue;
38911 if (UndefElts[I]) {
38912 Known.resetAll();
38913 break;
38914 }
38915 KnownBits Known2 = KnownBits::makeConstant(EltBits[I]);
38916 Known = KnownBits::commonBits(Known, Known2);
38917 }
38918 return;
38919 }
38920 break;
38921 }
38922 }
38923
38924 // Handle target shuffles.
38925 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
38926 if (isTargetShuffle(Opc)) {
38927 SmallVector<int, 64> Mask;
38928 SmallVector<SDValue, 2> Ops;
38929 if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask)) {
38930 unsigned NumOps = Ops.size();
38931 unsigned NumElts = VT.getVectorNumElements();
38932 if (Mask.size() == NumElts) {
38933 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
38934 Known.Zero.setAllBits(); Known.One.setAllBits();
38935 for (unsigned i = 0; i != NumElts; ++i) {
38936 if (!DemandedElts[i])
38937 continue;
38938 int M = Mask[i];
38939 if (M == SM_SentinelUndef) {
38940 // For UNDEF elements, we don't know anything about the common state
38941 // of the shuffle result.
38942 Known.resetAll();
38943 break;
38944 }
38945 if (M == SM_SentinelZero) {
38946 Known.One.clearAllBits();
38947 continue;
38948 }
38949 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
38950 "Shuffle index out of range");
38951
38952 unsigned OpIdx = (unsigned)M / NumElts;
38953 unsigned EltIdx = (unsigned)M % NumElts;
38954 if (Ops[OpIdx].getValueType() != VT) {
38955 // TODO - handle target shuffle ops with different value types.
38956 Known.resetAll();
38957 break;
38958 }
38959 DemandedOps[OpIdx].setBit(EltIdx);
38960 }
38961 // Known bits are the values that are shared by every demanded element.
38962 for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) {
38963 if (!DemandedOps[i])
38964 continue;
38965 KnownBits Known2 =
38966 DAG.computeKnownBits(Ops[i], DemandedOps[i], Depth + 1);
38967 Known = KnownBits::commonBits(Known, Known2);
38968 }
38969 }
38970 }
38971 }
38972}
38973
38974unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
38975 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
38976 unsigned Depth) const {
38977 EVT VT = Op.getValueType();
38978 unsigned VTBits = VT.getScalarSizeInBits();
38979 unsigned Opcode = Op.getOpcode();
38980 switch (Opcode) {
38981 case X86ISD::SETCC_CARRY:
38982 // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
38983 return VTBits;
38984
38985 case X86ISD::VTRUNC: {
38986 SDValue Src = Op.getOperand(0);
38987 MVT SrcVT = Src.getSimpleValueType();
38988 unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
38989 assert(VTBits < NumSrcBits && "Illegal truncation input type");
38990 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
38991 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedSrc, Depth + 1);
38992 if (Tmp > (NumSrcBits - VTBits))
38993 return Tmp - (NumSrcBits - VTBits);
38994 return 1;
38995 }
38996
38997 case X86ISD::PACKSS: {
38998 // PACKSS is just a truncation if the sign bits extend to the packed size.
38999 APInt DemandedLHS, DemandedRHS;
39000 getPackDemandedElts(Op.getValueType(), DemandedElts, DemandedLHS,
39001 DemandedRHS);
39002
39003 unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
39004 unsigned Tmp0 = SrcBits, Tmp1 = SrcBits;
39005 if (!!DemandedLHS)
39006 Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), DemandedLHS, Depth + 1);
39007 if (!!DemandedRHS)
39008 Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), DemandedRHS, Depth + 1);
39009 unsigned Tmp = std::min(Tmp0, Tmp1);
39010 if (Tmp > (SrcBits - VTBits))
39011 return Tmp - (SrcBits - VTBits);
39012 return 1;
39013 }
39014
39015 case X86ISD::VBROADCAST: {
39016 SDValue Src = Op.getOperand(0);
39017 if (!Src.getSimpleValueType().isVector())
39018 return DAG.ComputeNumSignBits(Src, Depth + 1);
39019 break;
39020 }
39021
39022 case X86ISD::VSHLI: {
39023 SDValue Src = Op.getOperand(0);
39024 const APInt &ShiftVal = Op.getConstantOperandAPInt(1);
39025 if (ShiftVal.uge(VTBits))
39026 return VTBits; // Shifted all bits out --> zero.
39027 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
39028 if (ShiftVal.uge(Tmp))
39029 return 1; // Shifted all sign bits out --> unknown.
39030 return Tmp - ShiftVal.getZExtValue();
39031 }
39032
39033 case X86ISD::VSRAI: {
39034 SDValue Src = Op.getOperand(0);
39035 APInt ShiftVal = Op.getConstantOperandAPInt(1);
39036 if (ShiftVal.uge(VTBits - 1))
39037 return VTBits; // Sign splat.
39038 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
39039 ShiftVal += Tmp;
39040 return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
39041 }
39042
39043 case X86ISD::FSETCC:
39044 // cmpss/cmpsd return zero/all-bits result values in the bottom element.
39045 if (VT == MVT::f32 || VT == MVT::f64 ||
39046 ((VT == MVT::v4f32 || VT == MVT::v2f64) && DemandedElts == 1))
39047 return VTBits;
39048 break;
39049
39050 case X86ISD::PCMPGT:
39051 case X86ISD::PCMPEQ:
39052 case X86ISD::CMPP:
39053 case X86ISD::VPCOM:
39054 case X86ISD::VPCOMU:
39055 // Vector compares return zero/all-bits result values.
39056 return VTBits;
39057
39058 case X86ISD::ANDNP: {
39059 unsigned Tmp0 =
39060 DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
39061 if (Tmp0 == 1) return 1; // Early out.
39062 unsigned Tmp1 =
39063 DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
39064 return std::min(Tmp0, Tmp1);
39065 }
39066
39067 case X86ISD::CMOV: {
39068 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
39069 if (Tmp0 == 1) return 1; // Early out.
39070 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
39071 return std::min(Tmp0, Tmp1);
39072 }
39073 }
39074
39075 // Handle target shuffles.
39076 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
39077 if (isTargetShuffle(Opcode)) {
39078 SmallVector<int, 64> Mask;
39079 SmallVector<SDValue, 2> Ops;
39080 if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask)) {
39081 unsigned NumOps = Ops.size();
39082 unsigned NumElts = VT.getVectorNumElements();
39083 if (Mask.size() == NumElts) {
39084 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
39085 for (unsigned i = 0; i != NumElts; ++i) {
39086 if (!DemandedElts[i])
39087 continue;
39088 int M = Mask[i];
39089 if (M == SM_SentinelUndef) {
39090 // For UNDEF elements, we don't know anything about the common state
39091 // of the shuffle result.
39092 return 1;
39093 } else if (M == SM_SentinelZero) {
39094 // Zero = all sign bits.
39095 continue;
39096 }
39097 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
39098 "Shuffle index out of range");
39099
39100 unsigned OpIdx = (unsigned)M / NumElts;
39101 unsigned EltIdx = (unsigned)M % NumElts;
39102 if (Ops[OpIdx].getValueType() != VT) {
39103 // TODO - handle target shuffle ops with different value types.
39104 return 1;
39105 }
39106 DemandedOps[OpIdx].setBit(EltIdx);
39107 }
39108 unsigned Tmp0 = VTBits;
39109 for (unsigned i = 0; i != NumOps && Tmp0 > 1; ++i) {
39110 if (!DemandedOps[i])
39111 continue;
39112 unsigned Tmp1 =
39113 DAG.ComputeNumSignBits(Ops[i], DemandedOps[i], Depth + 1);
39114 Tmp0 = std::min(Tmp0, Tmp1);
39115 }
39116 return Tmp0;
39117 }
39118 }
39119 }
39120
39121 // Fallback case.
39122 return 1;
39123}
39124
39125SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
39126 if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
39127 return N->getOperand(0);
39128 return N;
39129}
39130
39131// Helper to look for a normal load that can be narrowed into a vzload with the
39132// specified VT and memory VT. Returns SDValue() on failure.
39133static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT,
39134 SelectionDAG &DAG) {
39135 // Can't if the load is volatile or atomic.
39136 if (!LN->isSimple())
39137 return SDValue();
39138
39139 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
39140 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
39141 return DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, SDLoc(LN), Tys, Ops, MemVT,
39142 LN->getPointerInfo(), LN->getOriginalAlign(),
39143 LN->getMemOperand()->getFlags());
39144}
39145
39146// Attempt to match a combined shuffle mask against supported unary shuffle
39147// instructions.
39148// TODO: Investigate sharing more of this with shuffle lowering.
39149static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
39150 bool AllowFloatDomain, bool AllowIntDomain,
39151 SDValue V1, const SelectionDAG &DAG,
39152 const X86Subtarget &Subtarget, unsigned &Shuffle,
39153 MVT &SrcVT, MVT &DstVT) {
39154 unsigned NumMaskElts = Mask.size();
39155 unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
39156
39157 // Match against a VZEXT_MOVL vXi32 and vXi16 zero-extending instruction.
39158 if (Mask[0] == 0 &&
39159 (MaskEltSize == 32 || (MaskEltSize == 16 && Subtarget.hasFP16()))) {
39160 if ((isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) ||
39161 (V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
39162 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1))) {
39163 Shuffle = X86ISD::VZEXT_MOVL;
39164 if (MaskEltSize == 16)
39165 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
39166 else
39167 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
39168 return true;
39169 }
39170 }
39171
39172 // Match against a ANY/ZERO_EXTEND_VECTOR_INREG instruction.
39173 // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
39174 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
39175 (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
39176 unsigned MaxScale = 64 / MaskEltSize;
39177 for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
39178 bool MatchAny = true;
39179 bool MatchZero = true;
39180 unsigned NumDstElts = NumMaskElts / Scale;
39181 for (unsigned i = 0; i != NumDstElts && (MatchAny || MatchZero); ++i) {
39182 if (!isUndefOrEqual(Mask[i * Scale], (int)i)) {
39183 MatchAny = MatchZero = false;
39184 break;
39185 }
39186 MatchAny &= isUndefInRange(Mask, (i * Scale) + 1, Scale - 1);
39187 MatchZero &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
39188 }
39189 if (MatchAny || MatchZero) {
39190 assert(MatchZero && "Failed to match zext but matched aext?");
39191 unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
39192 MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType() :
39193 MVT::getIntegerVT(MaskEltSize);
39194 SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);
39195
39196 Shuffle = unsigned(MatchAny ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND);
39197 if (SrcVT.getVectorNumElements() != NumDstElts)
39198 Shuffle = DAG.getOpcode_EXTEND_VECTOR_INREG(Shuffle);
39199
39200 DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
39201 DstVT = MVT::getVectorVT(DstVT, NumDstElts);
39202 return true;
39203 }
39204 }
39205 }
39206
39207 // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
39208 if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2()) ||
39209 (MaskEltSize == 16 && Subtarget.hasFP16())) &&
39210 isUndefOrEqual(Mask[0], 0) &&
39211 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
39212 Shuffle = X86ISD::VZEXT_MOVL;
39213 if (MaskEltSize == 16)
39214 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
39215 else
39216 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
39217 return true;
39218 }
39219
39220 // Check if we have SSE3 which will let us use MOVDDUP etc. The
39221 // instructions are no slower than UNPCKLPD but have the option to
39222 // fold the input operand into even an unaligned memory load.
39223 if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
39224 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG, V1)) {
39225 Shuffle = X86ISD::MOVDDUP;
39226 SrcVT = DstVT = MVT::v2f64;
39227 return true;
39228 }
39229 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
39230 Shuffle = X86ISD::MOVSLDUP;
39231 SrcVT = DstVT = MVT::v4f32;
39232 return true;
39233 }
39234 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3}, DAG, V1)) {
39235 Shuffle = X86ISD::MOVSHDUP;
39236 SrcVT = DstVT = MVT::v4f32;
39237 return true;
39238 }
39239 }
39240
39241 if (MaskVT.is256BitVector() && AllowFloatDomain) {
39242 assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
39243 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
39244 Shuffle = X86ISD::MOVDDUP;
39245 SrcVT = DstVT = MVT::v4f64;
39246 return true;
39247 }
39248 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
39249 V1)) {
39250 Shuffle = X86ISD::MOVSLDUP;
39251 SrcVT = DstVT = MVT::v8f32;
39252 return true;
39253 }
39254 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3, 5, 5, 7, 7}, DAG,
39255 V1)) {
39256 Shuffle = X86ISD::MOVSHDUP;
39257 SrcVT = DstVT = MVT::v8f32;
39258 return true;
39259 }
39260 }
39261
39262 if (MaskVT.is512BitVector() && AllowFloatDomain) {
39263 assert(Subtarget.hasAVX512() &&
39264 "AVX512 required for 512-bit vector shuffles");
39265 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
39266 V1)) {
39267 Shuffle = X86ISD::MOVDDUP;
39268 SrcVT = DstVT = MVT::v8f64;
39269 return true;
39270 }
39271 if (isTargetShuffleEquivalent(
39272 MaskVT, Mask,
39273 {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}, DAG, V1)) {
39274 Shuffle = X86ISD::MOVSLDUP;
39275 SrcVT = DstVT = MVT::v16f32;
39276 return true;
39277 }
39278 if (isTargetShuffleEquivalent(
39279 MaskVT, Mask,
39280 {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}, DAG, V1)) {
39281 Shuffle = X86ISD::MOVSHDUP;
39282 SrcVT = DstVT = MVT::v16f32;
39283 return true;
39284 }
39285 }
39286
39287 return false;
39288}
39289
39290// Attempt to match a combined shuffle mask against supported unary immediate
39291// permute instructions.
39292// TODO: Investigate sharing more of this with shuffle lowering.
39293static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
39294 const APInt &Zeroable,
39295 bool AllowFloatDomain, bool AllowIntDomain,
39296 const SelectionDAG &DAG,
39297 const X86Subtarget &Subtarget,
39298 unsigned &Shuffle, MVT &ShuffleVT,
39299 unsigned &PermuteImm) {
39300 unsigned NumMaskElts = Mask.size();
39301 unsigned InputSizeInBits = MaskVT.getSizeInBits();
39302 unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
39303 MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
39304 bool ContainsZeros = isAnyZero(Mask);
39305
39306 // Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
39307 if (!ContainsZeros && MaskScalarSizeInBits == 64) {
39308 // Check for lane crossing permutes.
39309 if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
39310 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
39311 if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
39312 Shuffle = X86ISD::VPERMI;
39313 ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
39314 PermuteImm = getV4X86ShuffleImm(Mask);
39315 return true;
39316 }
39317 if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
39318 SmallVector<int, 4> RepeatedMask;
39319 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
39320 Shuffle = X86ISD::VPERMI;
39321 ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
39322 PermuteImm = getV4X86ShuffleImm(RepeatedMask);
39323 return true;
39324 }
39325 }
39326 } else if (AllowFloatDomain && Subtarget.hasAVX()) {
39327 // VPERMILPD can permute with a non-repeating shuffle.
39328 Shuffle = X86ISD::VPERMILPI;
39329 ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
39330 PermuteImm = 0;
39331 for (int i = 0, e = Mask.size(); i != e; ++i) {
39332 int M = Mask[i];
39333 if (M == SM_SentinelUndef)
39334 continue;
39335 assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
39336 PermuteImm |= (M & 1) << i;
39337 }
39338 return true;
39339 }
39340 }
39341
39342 // We are checking for shuffle match or shift match. Loop twice so we can
39343 // choose which we try to match first depending on target preference.
39344 for (unsigned Order = 0; Order < 2; ++Order) {
39345 if (Subtarget.preferLowerShuffleAsShift() ? (Order == 1) : (Order == 0)) {
39346 // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
39347 // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
39348 // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
39349 if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
39350 !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
39351 SmallVector<int, 4> RepeatedMask;
39352 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
39353 // Narrow the repeated mask to create 32-bit element permutes.
39354 SmallVector<int, 4> WordMask = RepeatedMask;
39355 if (MaskScalarSizeInBits == 64)
39356 narrowShuffleMaskElts(2, RepeatedMask, WordMask);
39357
39358 Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
39359 ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
39360 ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
39361 PermuteImm = getV4X86ShuffleImm(WordMask);
39362 return true;
39363 }
39364 }
39365
39366 // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
39367 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16 &&
39368 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39369 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39370 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
39371 SmallVector<int, 4> RepeatedMask;
39372 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
39373 ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4);
39374 ArrayRef<int> HiMask(RepeatedMask.data() + 4, 4);
39375
39376 // PSHUFLW: permute lower 4 elements only.
39377 if (isUndefOrInRange(LoMask, 0, 4) &&
39378 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
39379 Shuffle = X86ISD::PSHUFLW;
39380 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
39381 PermuteImm = getV4X86ShuffleImm(LoMask);
39382 return true;
39383 }
39384
39385 // PSHUFHW: permute upper 4 elements only.
39386 if (isUndefOrInRange(HiMask, 4, 8) &&
39387 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
39388 // Offset the HiMask so that we can create the shuffle immediate.
39389 int OffsetHiMask[4];
39390 for (int i = 0; i != 4; ++i)
39391 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
39392
39393 Shuffle = X86ISD::PSHUFHW;
39394 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
39395 PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
39396 return true;
39397 }
39398 }
39399 }
39400 } else {
39401 // Attempt to match against bit rotates.
39402 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits < 64 &&
39403 ((MaskVT.is128BitVector() && Subtarget.hasXOP()) ||
39404 Subtarget.hasAVX512())) {
39405 int RotateAmt = matchShuffleAsBitRotate(ShuffleVT, MaskScalarSizeInBits,
39406 Subtarget, Mask);
39407 if (0 < RotateAmt) {
39408 Shuffle = X86ISD::VROTLI;
39409 PermuteImm = (unsigned)RotateAmt;
39410 return true;
39411 }
39412 }
39413 }
39414 // Attempt to match against byte/bit shifts.
39415 if (AllowIntDomain &&
39416 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39417 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39418 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39419 int ShiftAmt =
39420 matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits, Mask, 0,
39421 Zeroable, Subtarget);
39422 if (0 < ShiftAmt && (!ShuffleVT.is512BitVector() || Subtarget.hasBWI() ||
39423 32 <= ShuffleVT.getScalarSizeInBits())) {
39424 // Byte shifts can be slower so only match them on second attempt.
39425 if (Order == 0 &&
39426 (Shuffle == X86ISD::VSHLDQ || Shuffle == X86ISD::VSRLDQ))
39427 continue;
39428
39429 PermuteImm = (unsigned)ShiftAmt;
39430 return true;
39431 }
39432
39433 }
39434 }
39435
39436 return false;
39437}
39438
39439// Attempt to match a combined unary shuffle mask against supported binary
39440// shuffle instructions.
39441// TODO: Investigate sharing more of this with shuffle lowering.
39442static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
39443 bool AllowFloatDomain, bool AllowIntDomain,
39444 SDValue &V1, SDValue &V2, const SDLoc &DL,
39445 SelectionDAG &DAG, const X86Subtarget &Subtarget,
39446 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
39447 bool IsUnary) {
39448 unsigned NumMaskElts = Mask.size();
39449 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
39450 unsigned SizeInBits = MaskVT.getSizeInBits();
39451
39452 if (MaskVT.is128BitVector()) {
39453 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG) &&
39454 AllowFloatDomain) {
39455 V2 = V1;
39456 V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1);
39457 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS;
39458 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
39459 return true;
39460 }
39461 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1}, DAG) &&
39462 AllowFloatDomain) {
39463 V2 = V1;
39464 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS;
39465 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
39466 return true;
39467 }
39468 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 3}, DAG) &&
39469 Subtarget.hasSSE2() && (AllowFloatDomain || !Subtarget.hasSSE41())) {
39470 std::swap(V1, V2);
39471 Shuffle = X86ISD::MOVSD;
39472 SrcVT = DstVT = MVT::v2f64;
39473 return true;
39474 }
39475 if (isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG) &&
39476 (AllowFloatDomain || !Subtarget.hasSSE41())) {
39477 Shuffle = X86ISD::MOVSS;
39478 SrcVT = DstVT = MVT::v4f32;
39479 return true;
39480 }
39481 if (isTargetShuffleEquivalent(MaskVT, Mask, {8, 1, 2, 3, 4, 5, 6, 7},
39482 DAG) &&
39483 Subtarget.hasFP16()) {
39484 Shuffle = X86ISD::MOVSH;
39485 SrcVT = DstVT = MVT::v8f16;
39486 return true;
39487 }
39488 }
39489
39490 // Attempt to match against either an unary or binary PACKSS/PACKUS shuffle.
39491 if (((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) ||
39492 ((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) && Subtarget.hasInt256()) ||
39493 ((MaskVT == MVT::v32i16 || MaskVT == MVT::v64i8) && Subtarget.hasBWI())) {
39494 if (matchShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
39495 Subtarget)) {
39496 DstVT = MaskVT;
39497 return true;
39498 }
39499 }
39500
39501 // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
39502 if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
39503 (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39504 (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
39505 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39506 (MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
39507 if (matchShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL, DAG,
39508 Subtarget)) {
39509 SrcVT = DstVT = MaskVT;
39510 if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
39511 SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
39512 return true;
39513 }
39514 }
39515
39516 // Attempt to match against an OR if we're performing a blend shuffle and the
39517 // non-blended source element is zero in each case.
39518 // TODO: Handle cases where V1/V2 sizes don't match SizeInBits.
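// e.g. a blend taking lanes {0, 2} from V1 and lanes {1, 3} from V2 is just
// (V1 | V2) when each operand is known zero in the lanes it does not supply.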
39519 if (SizeInBits == V1.getValueSizeInBits() &&
39520 SizeInBits == V2.getValueSizeInBits() &&
39521 (EltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
39522 (EltSizeInBits % V2.getScalarValueSizeInBits()) == 0) {
39523 bool IsBlend = true;
39524 unsigned NumV1Elts = V1.getValueType().getVectorNumElements();
39525 unsigned NumV2Elts = V2.getValueType().getVectorNumElements();
39526 unsigned Scale1 = NumV1Elts / NumMaskElts;
39527 unsigned Scale2 = NumV2Elts / NumMaskElts;
39528 APInt DemandedZeroV1 = APInt::getZero(NumV1Elts);
39529 APInt DemandedZeroV2 = APInt::getZero(NumV2Elts);
39530 for (unsigned i = 0; i != NumMaskElts; ++i) {
39531 int M = Mask[i];
39532 if (M == SM_SentinelUndef)
39533 continue;
39534 if (M == SM_SentinelZero) {
39535 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
39536 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
39537 continue;
39538 }
39539 if (M == (int)i) {
39540 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
39541 continue;
39542 }
39543 if (M == (int)(i + NumMaskElts)) {
39544 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
39545 continue;
39546 }
39547 IsBlend = false;
39548 break;
39549 }
39550 if (IsBlend) {
39551 if (DAG.MaskedVectorIsZero(V1, DemandedZeroV1) &&
39552 DAG.MaskedVectorIsZero(V2, DemandedZeroV2)) {
39553 Shuffle = ISD::OR;
39554 SrcVT = DstVT = MaskVT.changeTypeToInteger();
39555 return true;
39556 }
39557 if (NumV1Elts == NumV2Elts && NumV1Elts == NumMaskElts) {
39558 // FIXME: handle mismatched sizes?
39559 // TODO: investigate if `ISD::OR` handling in
39560 // `TargetLowering::SimplifyDemandedVectorElts` can be improved instead.
39561 auto computeKnownBitsElementWise = [&DAG](SDValue V) {
39562 unsigned NumElts = V.getValueType().getVectorNumElements();
39563 KnownBits Known(NumElts);
39564 for (unsigned EltIdx = 0; EltIdx != NumElts; ++EltIdx) {
39565 APInt Mask = APInt::getOneBitSet(NumElts, EltIdx);
39566 KnownBits PeepholeKnown = DAG.computeKnownBits(V, Mask);
39567 if (PeepholeKnown.isZero())
39568 Known.Zero.setBit(EltIdx);
39569 if (PeepholeKnown.isAllOnes())
39570 Known.One.setBit(EltIdx);
39571 }
39572 return Known;
39573 };
39574
39575 KnownBits V1Known = computeKnownBitsElementWise(V1);
39576 KnownBits V2Known = computeKnownBitsElementWise(V2);
39577
39578 for (unsigned i = 0; i != NumMaskElts && IsBlend; ++i) {
39579 int M = Mask[i];
39580 if (M == SM_SentinelUndef)
39581 continue;
39582 if (M == SM_SentinelZero) {
39583 IsBlend &= V1Known.Zero[i] && V2Known.Zero[i];
39584 continue;
39585 }
39586 if (M == (int)i) {
39587 IsBlend &= V2Known.Zero[i] || V1Known.One[i];
39588 continue;
39589 }
39590 if (M == (int)(i + NumMaskElts)) {
39591 IsBlend &= V1Known.Zero[i] || V2Known.One[i];
39592 continue;
39593 }
39594 llvm_unreachable("will not get here.");
39595 }
39596 if (IsBlend) {
39597 Shuffle = ISD::OR;
39598 SrcVT = DstVT = MaskVT.changeTypeToInteger();
39599 return true;
39600 }
39601 }
39602 }
39603 }
39604
39605 return false;
39606}
39607
39608static bool matchBinaryPermuteShuffle(
39609 MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable,
39610 bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2,
39611 const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget,
39612 unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) {
39613 unsigned NumMaskElts = Mask.size();
39614 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
39615
39616 // Attempt to match against VALIGND/VALIGNQ rotate.
39617 if (AllowIntDomain && (EltSizeInBits == 64 || EltSizeInBits == 32) &&
39618 ((MaskVT.is128BitVector() && Subtarget.hasVLX()) ||
39619 (MaskVT.is256BitVector() && Subtarget.hasVLX()) ||
39620 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39621 if (!isAnyZero(Mask)) {
39622 int Rotation = matchShuffleAsElementRotate(V1, V2, Mask);
39623 if (0 < Rotation) {
39624 Shuffle = X86ISD::VALIGN;
39625 if (EltSizeInBits == 64)
39626 ShuffleVT = MVT::getVectorVT(MVT::i64, MaskVT.getSizeInBits() / 64);
39627 else
39628 ShuffleVT = MVT::getVectorVT(MVT::i32, MaskVT.getSizeInBits() / 32);
39629 PermuteImm = Rotation;
39630 return true;
39631 }
39632 }
39633 }
39634
39635 // Attempt to match against PALIGNR byte rotate.
39636 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
39637 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39638 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
39639 int ByteRotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask);
39640 if (0 < ByteRotation) {
39641 Shuffle = X86ISD::PALIGNR;
39642 ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
39643 PermuteImm = ByteRotation;
39644 return true;
39645 }
39646 }
39647
39648 // Attempt to combine to X86ISD::BLENDI.
39649 if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
39650 (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
39651 (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
39652 uint64_t BlendMask = 0;
39653 bool ForceV1Zero = false, ForceV2Zero = false;
39654 SmallVector<int, 8> TargetMask(Mask);
39655 if (matchShuffleAsBlend(MaskVT, V1, V2, TargetMask, Zeroable, ForceV1Zero,
39656 ForceV2Zero, BlendMask)) {
39657 if (MaskVT == MVT::v16i16) {
39658 // We can only use v16i16 PBLENDW if the lanes are repeated.
39659 SmallVector<int, 8> RepeatedMask;
39660 if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
39661 RepeatedMask)) {
39662 assert(RepeatedMask.size() == 8 &&
39663 "Repeated mask size doesn't match!");
39664 PermuteImm = 0;
39665 for (int i = 0; i < 8; ++i)
39666 if (RepeatedMask[i] >= 8)
39667 PermuteImm |= 1 << i;
39668 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
39669 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
39670 Shuffle = X86ISD::BLENDI;
39671 ShuffleVT = MaskVT;
39672 return true;
39673 }
39674 } else {
39675 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
39676 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
39677 PermuteImm = (unsigned)BlendMask;
39678 Shuffle = X86ISD::BLENDI;
39679 ShuffleVT = MaskVT;
39680 return true;
39681 }
39682 }
39683 }
39684
39685 // Attempt to combine to INSERTPS, but only if it has elements that need to
39686 // be set to zero.
39687 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
39688 MaskVT.is128BitVector() && isAnyZero(Mask) &&
39689 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
39690 Shuffle = X86ISD::INSERTPS;
39691 ShuffleVT = MVT::v4f32;
39692 return true;
39693 }
39694
39695 // Attempt to combine to SHUFPD.
39696 if (AllowFloatDomain && EltSizeInBits == 64 &&
39697 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39698 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
39699 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39700 bool ForceV1Zero = false, ForceV2Zero = false;
39701 if (matchShuffleWithSHUFPD(MaskVT, V1, V2, ForceV1Zero, ForceV2Zero,
39702 PermuteImm, Mask, Zeroable)) {
39703 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
39704 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
39705 Shuffle = X86ISD::SHUFP;
39706 ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
39707 return true;
39708 }
39709 }
39710
39711 // Attempt to combine to SHUFPS.
39712 if (AllowFloatDomain && EltSizeInBits == 32 &&
39713 ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
39714 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
39715 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39716 SmallVector<int, 4> RepeatedMask;
39717 if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
39718 // Match each half of the repeated mask to determine if it's just
39719 // referencing one of the vectors, is zeroable or entirely undef.
39720 auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
39721 int M0 = RepeatedMask[Offset];
39722 int M1 = RepeatedMask[Offset + 1];
39723
39724 if (isUndefInRange(RepeatedMask, Offset, 2)) {
39725 return DAG.getUNDEF(MaskVT);
39726 } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
39727 S0 = (SM_SentinelUndef == M0 ? -1 : 0);
39728 S1 = (SM_SentinelUndef == M1 ? -1 : 1);
39729 return getZeroVector(MaskVT, Subtarget, DAG, DL);
39730 } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
39731 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
39732 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
39733 return V1;
39734 } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
39735 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
39736 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
39737 return V2;
39738 }
39739
39740 return SDValue();
39741 };
39742
39743 int ShufMask[4] = {-1, -1, -1, -1};
39744 SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
39745 SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
39746
39747 if (Lo && Hi) {
39748 V1 = Lo;
39749 V2 = Hi;
39750 Shuffle = X86ISD::SHUFP;
39751 ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
39752 PermuteImm = getV4X86ShuffleImm(ShufMask);
39753 return true;
39754 }
39755 }
39756 }
39757
39758 // Attempt to combine to INSERTPS more generally if X86ISD::SHUFP failed.
39759 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
39760 MaskVT.is128BitVector() &&
39761 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
39762 Shuffle = X86ISD::INSERTPS;
39763 ShuffleVT = MVT::v4f32;
39764 return true;
39765 }
39766
39767 return false;
39768}
39769
39770static SDValue combineX86ShuffleChainWithExtract(
39771 ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
39772 bool HasVariableMask, bool AllowVariableCrossLaneMask,
39773 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
39774 const X86Subtarget &Subtarget);
39775
39776/// Combine an arbitrary chain of shuffles into a single instruction if
39777/// possible.
39778///
39779/// This is the leaf of the recursive combine below. When we have found some
39780/// chain of single-use x86 shuffle instructions and accumulated the combined
39781/// shuffle mask represented by them, this will try to pattern match that mask
39782/// into either a single instruction if there is a special purpose instruction
39783/// for this operation, or into a PSHUFB instruction which is a fully general
39784/// instruction but should only be used to replace chains over a certain depth.
39785static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
39786 ArrayRef<int> BaseMask, int Depth,
39787 bool HasVariableMask,
39788 bool AllowVariableCrossLaneMask,
39789 bool AllowVariablePerLaneMask,
39790 SelectionDAG &DAG,
39791 const X86Subtarget &Subtarget) {
39792 assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
39793 assert((Inputs.size() == 1 || Inputs.size() == 2) &&
39794 "Unexpected number of shuffle inputs!");
39795
39796 SDLoc DL(Root);
39797 MVT RootVT = Root.getSimpleValueType();
39798 unsigned RootSizeInBits = RootVT.getSizeInBits();
39799 unsigned NumRootElts = RootVT.getVectorNumElements();
39800
39801 // Canonicalize shuffle input op to the requested type.
39802 auto CanonicalizeShuffleInput = [&](MVT VT, SDValue Op) {
39803 if (VT.getSizeInBits() > Op.getValueSizeInBits())
39804 Op = widenSubVector(Op, false, Subtarget, DAG, DL, VT.getSizeInBits());
39805 else if (VT.getSizeInBits() < Op.getValueSizeInBits())
39806 Op = extractSubVector(Op, 0, DAG, DL, VT.getSizeInBits());
39807 return DAG.getBitcast(VT, Op);
39808 };
39809
39810 // Find the inputs that enter the chain. Note that multiple uses are OK
39811 // here, we're not going to remove the operands we find.
39812 bool UnaryShuffle = (Inputs.size() == 1);
39813 SDValue V1 = peekThroughBitcasts(Inputs[0]);
39814 SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
39815 : peekThroughBitcasts(Inputs[1]));
39816
39817 MVT VT1 = V1.getSimpleValueType();
39818 MVT VT2 = V2.getSimpleValueType();
39819 assert((RootSizeInBits % VT1.getSizeInBits()) == 0 &&
39820 (RootSizeInBits % VT2.getSizeInBits()) == 0 && "Vector size mismatch");
39821
39822 SDValue Res;
39823
39824 unsigned NumBaseMaskElts = BaseMask.size();
39825 if (NumBaseMaskElts == 1) {
39826 assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
39827 return CanonicalizeShuffleInput(RootVT, V1);
39828 }
39829
39830 bool OptForSize = DAG.shouldOptForSize();
39831 unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
39832 bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
39833 (RootVT.isFloatingPoint() && Depth >= 1) ||
39834 (RootVT.is256BitVector() && !Subtarget.hasAVX2());
39835
39836 // Don't combine if we are an AVX512/EVEX target and the mask element size
39837 // is different from the root element size - this would prevent writemasks
39838 // from being reused.
39839 bool IsMaskedShuffle = false;
39840 if (RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128)) {
39841 if (Root.hasOneUse() && Root->use_begin()->getOpcode() == ISD::VSELECT &&
39842 Root->use_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {
39843 IsMaskedShuffle = true;
39844 }
39845 }
39846
39847 // If we are shuffling a splat (and not introducing zeros) then we can just
39848 // use it directly. This also works for smaller elements, as they already
39849 // repeat across each mask element.
39850 if (UnaryShuffle && !isAnyZero(BaseMask) &&
39851 V1.getValueSizeInBits() >= RootSizeInBits &&
39852 (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
39853 DAG.isSplatValue(V1, /*AllowUndefs*/ false)) {
39854 return CanonicalizeShuffleInput(RootVT, V1);
39855 }
39856
39857 SmallVector<int, 64> Mask(BaseMask);
39858
39859 // See if the shuffle is a hidden identity shuffle - repeated args in HOPs
39860 // etc. can be simplified.
39861 if (VT1 == VT2 && VT1.getSizeInBits() == RootSizeInBits && VT1.isVector()) {
39862 SmallVector<int> ScaledMask, IdentityMask;
39863 unsigned NumElts = VT1.getVectorNumElements();
39864 if (Mask.size() <= NumElts &&
39865 scaleShuffleElements(Mask, NumElts, ScaledMask)) {
39866 for (unsigned i = 0; i != NumElts; ++i)
39867 IdentityMask.push_back(i);
39868 if (isTargetShuffleEquivalent(RootVT, ScaledMask, IdentityMask, DAG, V1,
39869 V2))
39870 return CanonicalizeShuffleInput(RootVT, V1);
39871 }
39872 }
39873
39874 // Handle 128/256-bit lane shuffles of 512-bit vectors.
39875 if (RootVT.is512BitVector() &&
39876 (NumBaseMaskElts == 2 || NumBaseMaskElts == 4)) {
39877 // If the upper subvectors are zeroable, then an extract+insert is more
39878 // optimal than using X86ISD::SHUF128. The insertion is free, even if it has
39879 // to zero the upper subvectors.
39880 if (isUndefOrZeroInRange(Mask, 1, NumBaseMaskElts - 1)) {
39881 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
39882 return SDValue(); // Nothing to do!
39883      assert(isInRange(Mask[0], 0, NumBaseMaskElts) &&
39884             "Unexpected lane shuffle");
39885 Res = CanonicalizeShuffleInput(RootVT, V1);
39886 unsigned SubIdx = Mask[0] * (NumRootElts / NumBaseMaskElts);
39887 bool UseZero = isAnyZero(Mask);
39888 Res = extractSubVector(Res, SubIdx, DAG, DL, BaseMaskEltSizeInBits);
39889 return widenSubVector(Res, UseZero, Subtarget, DAG, DL, RootSizeInBits);
39890 }
39891
39892 // Narrow shuffle mask to v4x128.
39893 SmallVector<int, 4> ScaledMask;
39894    assert((BaseMaskEltSizeInBits % 128) == 0 && "Illegal mask size");
39895 narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, Mask, ScaledMask);
39896
39897 // Try to lower to vshuf64x2/vshuf32x4.
39898 auto MatchSHUF128 = [&](MVT ShuffleVT, const SDLoc &DL,
39899 ArrayRef<int> ScaledMask, SDValue V1, SDValue V2,
39900 SelectionDAG &DAG) {
39901 unsigned PermMask = 0;
39902      // Ensure elements came from the same Op.
39903 SDValue Ops[2] = {DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT)};
39904 for (int i = 0; i < 4; ++i) {
39905        assert(ScaledMask[i] >= -1 && "Illegal shuffle sentinel value");
39906 if (ScaledMask[i] < 0)
39907 continue;
39908
39909 SDValue Op = ScaledMask[i] >= 4 ? V2 : V1;
39910 unsigned OpIndex = i / 2;
39911 if (Ops[OpIndex].isUndef())
39912 Ops[OpIndex] = Op;
39913 else if (Ops[OpIndex] != Op)
39914 return SDValue();
39915
39916 // Convert the 128-bit shuffle mask selection values into 128-bit
39917 // selection bits defined by a vshuf64x2 instruction's immediate control
39918 // byte.
39919 PermMask |= (ScaledMask[i] % 4) << (i * 2);
39920 }
39921
39922 return DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
39923 CanonicalizeShuffleInput(ShuffleVT, Ops[0]),
39924 CanonicalizeShuffleInput(ShuffleVT, Ops[1]),
39925 DAG.getTargetConstant(PermMask, DL, MVT::i8));
39926 };
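    // For example (illustrative values, assuming a 512-bit root): for
    // ScaledMask = {0, 1, 4, 5}, i.e. the two low 128-bit lanes of V1 followed
    // by the two low lanes of V2, the loop above selects Ops = {V1, V2} and
    // computes
    //   PermMask = (0 << 0) | (1 << 2) | (0 << 4) | (1 << 6) = 0x44,
    // which is the control byte vshuf64x2/vshuf32x4 uses for that lane
    // selection.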
39927
39928 // FIXME: Is there a better way to do this? is256BitLaneRepeatedShuffleMask
39929 // doesn't work because our mask is for 128 bits and we don't have an MVT
39930 // to match that.
39931 bool PreferPERMQ = UnaryShuffle && isUndefOrInRange(ScaledMask[0], 0, 2) &&
39932 isUndefOrInRange(ScaledMask[1], 0, 2) &&
39933 isUndefOrInRange(ScaledMask[2], 2, 4) &&
39934 isUndefOrInRange(ScaledMask[3], 2, 4) &&
39935 (ScaledMask[0] < 0 || ScaledMask[2] < 0 ||
39936 ScaledMask[0] == (ScaledMask[2] % 2)) &&
39937 (ScaledMask[1] < 0 || ScaledMask[3] < 0 ||
39938 ScaledMask[1] == (ScaledMask[3] % 2));
39939
39940 if (!isAnyZero(ScaledMask) && !PreferPERMQ) {
39941 if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
39942 return SDValue(); // Nothing to do!
39943 MVT ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
39944 if (SDValue V = MatchSHUF128(ShuffleVT, DL, ScaledMask, V1, V2, DAG))
39945 return DAG.getBitcast(RootVT, V);
39946 }
39947 }
39948
39949 // Handle 128-bit lane shuffles of 256-bit vectors.
39950 if (RootVT.is256BitVector() && NumBaseMaskElts == 2) {
39951 // If the upper half is zeroable, then an extract+insert is more optimal
39952 // than using X86ISD::VPERM2X128. The insertion is free, even if it has to
39953 // zero the upper half.
39954 if (isUndefOrZero(Mask[1])) {
39955 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
39956 return SDValue(); // Nothing to do!
39957      assert(isInRange(Mask[0], 0, 2) && "Unexpected lane shuffle");
39958 Res = CanonicalizeShuffleInput(RootVT, V1);
39959 Res = extract128BitVector(Res, Mask[0] * (NumRootElts / 2), DAG, DL);
39960 return widenSubVector(Res, Mask[1] == SM_SentinelZero, Subtarget, DAG, DL,
39961 256);
39962 }
39963
39964 // If we're inserting the low subvector, an insert-subvector 'concat'
39965 // pattern is quicker than VPERM2X128.
39966 // TODO: Add AVX2 support instead of VPERMQ/VPERMPD.
39967 if (BaseMask[0] == 0 && (BaseMask[1] == 0 || BaseMask[1] == 2) &&
39968 !Subtarget.hasAVX2()) {
39969 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
39970 return SDValue(); // Nothing to do!
39971 SDValue Lo = CanonicalizeShuffleInput(RootVT, V1);
39972 SDValue Hi = CanonicalizeShuffleInput(RootVT, BaseMask[1] == 0 ? V1 : V2);
39973 Hi = extractSubVector(Hi, 0, DAG, DL, 128);
39974 return insertSubVector(Lo, Hi, NumRootElts / 2, DAG, DL, 128);
39975 }
39976
39977 if (Depth == 0 && Root.getOpcode() == X86ISD::VPERM2X128)
39978 return SDValue(); // Nothing to do!
39979
39980 // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
39981 // we need to use the zeroing feature.
39982 // Prefer blends for sequential shuffles unless we are optimizing for size.
39983 if (UnaryShuffle &&
39984 !(Subtarget.hasAVX2() && isUndefOrInRange(Mask, 0, 2)) &&
39985 (OptForSize || !isSequentialOrUndefOrZeroInRange(Mask, 0, 2, 0))) {
39986 unsigned PermMask = 0;
39987 PermMask |= ((Mask[0] < 0 ? 0x8 : (Mask[0] & 1)) << 0);
39988 PermMask |= ((Mask[1] < 0 ? 0x8 : (Mask[1] & 1)) << 4);
39989 return DAG.getNode(
39990 X86ISD::VPERM2X128, DL, RootVT, CanonicalizeShuffleInput(RootVT, V1),
39991 DAG.getUNDEF(RootVT), DAG.getTargetConstant(PermMask, DL, MVT::i8));
39992 }
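    // For example, a unary Mask = {SM_SentinelZero, 0} (which the earlier
    // extract+insert path does not catch) yields
    //   PermMask = (0x8 << 0) | (0 << 4) = 0x08,
    // i.e. the low 128-bit half of the result is zeroed via bit 3 and the high
    // half takes V1's low lane.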
39993
39994 if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
39995 return SDValue(); // Nothing to do!
39996
39997 // TODO - handle AVX512VL cases with X86ISD::SHUF128.
39998 if (!UnaryShuffle && !IsMaskedShuffle) {
39999      assert(llvm::all_of(Mask, [](int M) { return 0 <= M && M < 4; }) &&
40000             "Unexpected shuffle sentinel value");
40001 // Prefer blends to X86ISD::VPERM2X128.
40002 if (!((Mask[0] == 0 && Mask[1] == 3) || (Mask[0] == 2 && Mask[1] == 1))) {
40003 unsigned PermMask = 0;
40004 PermMask |= ((Mask[0] & 3) << 0);
40005 PermMask |= ((Mask[1] & 3) << 4);
40006 SDValue LHS = isInRange(Mask[0], 0, 2) ? V1 : V2;
40007 SDValue RHS = isInRange(Mask[1], 0, 2) ? V1 : V2;
40008 return DAG.getNode(X86ISD::VPERM2X128, DL, RootVT,
40009 CanonicalizeShuffleInput(RootVT, LHS),
40010 CanonicalizeShuffleInput(RootVT, RHS),
40011 DAG.getTargetConstant(PermMask, DL, MVT::i8));
40012 }
40013 }
40014 }
40015
40016 // For masks that have been widened to 128-bit elements or more,
40017 // narrow back down to 64-bit elements.
40018 if (BaseMaskEltSizeInBits > 64) {
40019    assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
40020 int MaskScale = BaseMaskEltSizeInBits / 64;
40021 SmallVector<int, 64> ScaledMask;
40022 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
40023 Mask = std::move(ScaledMask);
40024 }
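  // For example, the unary 256-bit lane swap deferred to VPERMQ/VPERMPD above
  // arrives here as Mask = {1, 0} with BaseMaskEltSizeInBits == 128;
  // MaskScale == 2 rescales it to the 64-bit-element mask {2, 3, 0, 1}.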
40025
40026 // For masked shuffles, we're trying to match the root width for better
40027 // writemask folding, attempt to scale the mask.
40028 // TODO - variable shuffles might need this to be widened again.
40029 if (IsMaskedShuffle && NumRootElts > Mask.size()) {
40030    assert((NumRootElts % Mask.size()) == 0 && "Illegal mask size");
40031 int MaskScale = NumRootElts / Mask.size();
40032 SmallVector<int, 64> ScaledMask;
40033 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
40034 Mask = std::move(ScaledMask);
40035 }
40036
40037 unsigned NumMaskElts = Mask.size();
40038 unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
40039
40040 // Determine the effective mask value type.
40041 FloatDomain &= (32 <= MaskEltSizeInBits);
40042 MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
40043 : MVT::getIntegerVT(MaskEltSizeInBits);
40044 MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
40045
40046 // Only allow legal mask types.
40047 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
40048 return SDValue();
40049
40050 // Attempt to match the mask against known shuffle patterns.
40051 MVT ShuffleSrcVT, ShuffleVT;
40052 unsigned Shuffle, PermuteImm;
40053
40054 // Which shuffle domains are permitted?
40055 // Permit domain crossing at higher combine depths.
40056 // TODO: Should we indicate which domain is preferred if both are allowed?
40057 bool AllowFloatDomain = FloatDomain || (Depth >= 3);
40058 bool AllowIntDomain = (!FloatDomain || (Depth >= 3)) && Subtarget.hasSSE2() &&
40059 (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
40060
40061 // Determine zeroable mask elements.
40062 APInt KnownUndef, KnownZero;
40063 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
40064 APInt Zeroable = KnownUndef | KnownZero;
40065
40066 if (UnaryShuffle) {
40067 // Attempt to match against broadcast-from-vector.
40068 // Limit AVX1 to cases where we're loading+broadcasting a scalar element.
40069 if ((Subtarget.hasAVX2() ||
40070 (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits)) &&
40071 (!IsMaskedShuffle || NumRootElts == NumMaskElts)) {
40072 if (isUndefOrEqual(Mask, 0)) {
40073 if (V1.getValueType() == MaskVT &&
40074 V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
40075 X86::mayFoldLoad(V1.getOperand(0), Subtarget)) {
40076 if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
40077 return SDValue(); // Nothing to do!
40078 Res = V1.getOperand(0);
40079 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
40080 return DAG.getBitcast(RootVT, Res);
40081 }
40082 if (Subtarget.hasAVX2()) {
40083 if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
40084 return SDValue(); // Nothing to do!
40085 Res = CanonicalizeShuffleInput(MaskVT, V1);
40086 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
40087 return DAG.getBitcast(RootVT, Res);
40088 }
40089 }
40090 }
40091
40092 if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, V1,
40093 DAG, Subtarget, Shuffle, ShuffleSrcVT, ShuffleVT) &&
40094 (!IsMaskedShuffle ||
40095 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
40096 if (Depth == 0 && Root.getOpcode() == Shuffle)
40097 return SDValue(); // Nothing to do!
40098 Res = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
40099 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
40100 return DAG.getBitcast(RootVT, Res);
40101 }
40102
40103 if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
40104 AllowIntDomain, DAG, Subtarget, Shuffle, ShuffleVT,
40105 PermuteImm) &&
40106 (!IsMaskedShuffle ||
40107 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
40108 if (Depth == 0 && Root.getOpcode() == Shuffle)
40109 return SDValue(); // Nothing to do!
40110 Res = CanonicalizeShuffleInput(ShuffleVT, V1);
40111 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
40112 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
40113 return DAG.getBitcast(RootVT, Res);
40114 }
40115 }
40116
40117 // Attempt to combine to INSERTPS, but only if the inserted element has come
40118 // from a scalar.
40119 // TODO: Handle other insertions here as well?
40120 if (!UnaryShuffle && AllowFloatDomain && RootSizeInBits == 128 &&
40121 Subtarget.hasSSE41() &&
40122 !isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG)) {
40123 if (MaskEltSizeInBits == 32) {
40124 SDValue SrcV1 = V1, SrcV2 = V2;
40125 if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask,
40126 DAG) &&
40127 SrcV2.getOpcode() == ISD::SCALAR_TO_VECTOR) {
40128 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
40129 return SDValue(); // Nothing to do!
40130 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
40131 CanonicalizeShuffleInput(MVT::v4f32, SrcV1),
40132 CanonicalizeShuffleInput(MVT::v4f32, SrcV2),
40133 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
40134 return DAG.getBitcast(RootVT, Res);
40135 }
40136 }
40137 if (MaskEltSizeInBits == 64 &&
40138 isTargetShuffleEquivalent(MaskVT, Mask, {0, 2}, DAG) &&
40139 V2.getOpcode() == ISD::SCALAR_TO_VECTOR &&
40140 V2.getScalarValueSizeInBits() <= 32) {
40141 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
40142 return SDValue(); // Nothing to do!
40143 PermuteImm = (/*DstIdx*/ 2 << 4) | (/*SrcIdx*/ 0 << 0);
40144 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
40145 CanonicalizeShuffleInput(MVT::v4f32, V1),
40146 CanonicalizeShuffleInput(MVT::v4f32, V2),
40147 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
40148 return DAG.getBitcast(RootVT, Res);
40149 }
40150 }
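  // In the 64-bit case above the immediate works out to 0x20: bits [5:4]
  // (COUNT_D) select destination element 2, bits [7:6] (COUNT_S) select
  // element 0 of the second operand, and the zero mask in bits [3:0] is left
  // clear.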
40151
40152 SDValue NewV1 = V1; // Save operands in case early exit happens.
40153 SDValue NewV2 = V2;
40154 if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
40155 NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
40156 ShuffleVT, UnaryShuffle) &&
40157 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
40158 if (Depth == 0 && Root.getOpcode() == Shuffle)
40159 return SDValue(); // Nothing to do!
40160 NewV1 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV1);
40161 NewV2 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV2);
40162 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
40163 return DAG.getBitcast(RootVT, Res);
40164 }
40165
40166 NewV1 = V1; // Save operands in case early exit happens.
40167 NewV2 = V2;
40168 if (matchBinaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
40169 AllowIntDomain, NewV1, NewV2, DL, DAG,
40170 Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
40171 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
40172 if (Depth == 0 && Root.getOpcode() == Shuffle)
40173 return SDValue(); // Nothing to do!
40174 NewV1 = CanonicalizeShuffleInput(ShuffleVT, NewV1);
40175 NewV2 = CanonicalizeShuffleInput(ShuffleVT, NewV2);
40176 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
40177 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
40178 return DAG.getBitcast(RootVT, Res);
40179 }
40180
40181 // Typically from here on, we need an integer version of MaskVT.
40182 MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
40183 IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
40184
40185 // Annoyingly, SSE4A instructions don't map into the above match helpers.
40186 if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
40187 uint64_t BitLen, BitIdx;
40188 if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
40189 Zeroable)) {
40190 if (Depth == 0 && Root.getOpcode() == X86ISD::EXTRQI)
40191 return SDValue(); // Nothing to do!
40192 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
40193 Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
40194 DAG.getTargetConstant(BitLen, DL, MVT::i8),
40195 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
40196 return DAG.getBitcast(RootVT, Res);
40197 }
40198
40199 if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
40200 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTQI)
40201 return SDValue(); // Nothing to do!
40202 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
40203 V2 = CanonicalizeShuffleInput(IntMaskVT, V2);
40204 Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
40205 DAG.getTargetConstant(BitLen, DL, MVT::i8),
40206 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
40207 return DAG.getBitcast(RootVT, Res);
40208 }
40209 }
40210
40211 // Match shuffle against TRUNCATE patterns.
40212 if (AllowIntDomain && MaskEltSizeInBits < 64 && Subtarget.hasAVX512()) {
40213 // Match against a VTRUNC instruction, accounting for src/dst sizes.
40214 if (matchShuffleAsVTRUNC(ShuffleSrcVT, ShuffleVT, IntMaskVT, Mask, Zeroable,
40215 Subtarget)) {
40216 bool IsTRUNCATE = ShuffleVT.getVectorNumElements() ==
40217 ShuffleSrcVT.getVectorNumElements();
40218 unsigned Opc =
40219 IsTRUNCATE ? (unsigned)ISD::TRUNCATE : (unsigned)X86ISD::VTRUNC;
40220 if (Depth == 0 && Root.getOpcode() == Opc)
40221 return SDValue(); // Nothing to do!
40222 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
40223 Res = DAG.getNode(Opc, DL, ShuffleVT, V1);
40224 if (ShuffleVT.getSizeInBits() < RootSizeInBits)
40225 Res = widenSubVector(Res, true, Subtarget, DAG, DL, RootSizeInBits);
40226 return DAG.getBitcast(RootVT, Res);
40227 }
40228
40229 // Do we need a more general binary truncation pattern?
40230 if (RootSizeInBits < 512 &&
40231 ((RootVT.is256BitVector() && Subtarget.useAVX512Regs()) ||
40232 (RootVT.is128BitVector() && Subtarget.hasVLX())) &&
40233 (MaskEltSizeInBits > 8 || Subtarget.hasBWI()) &&
40234 isSequentialOrUndefInRange(Mask, 0, NumMaskElts, 0, 2)) {
40235 // Bail if this was already a truncation or PACK node.
40236 // We sometimes fail to match PACK if we demand known undef elements.
40237 if (Depth == 0 && (Root.getOpcode() == ISD::TRUNCATE ||
40238 Root.getOpcode() == X86ISD::PACKSS ||
40239 Root.getOpcode() == X86ISD::PACKUS))
40240 return SDValue(); // Nothing to do!
40241 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
40242 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts / 2);
40243 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
40244 V2 = CanonicalizeShuffleInput(ShuffleSrcVT, V2);
40245 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
40246 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts);
40247 Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShuffleSrcVT, V1, V2);
40248 Res = DAG.getNode(ISD::TRUNCATE, DL, IntMaskVT, Res);
40249 return DAG.getBitcast(RootVT, Res);
40250 }
40251 }
40252
40253 // Don't try to re-form single instruction chains under any circumstances now
40254 // that we've done encoding canonicalization for them.
40255 if (Depth < 1)
40256 return SDValue();
40257
40258 // Depth threshold above which we can efficiently use variable mask shuffles.
40259 int VariableCrossLaneShuffleDepth =
40260 Subtarget.hasFastVariableCrossLaneShuffle() ? 1 : 2;
40261 int VariablePerLaneShuffleDepth =
40262 Subtarget.hasFastVariablePerLaneShuffle() ? 1 : 2;
40263 AllowVariableCrossLaneMask &=
40264 (Depth >= VariableCrossLaneShuffleDepth) || HasVariableMask;
40265 AllowVariablePerLaneMask &=
40266 (Depth >= VariablePerLaneShuffleDepth) || HasVariableMask;
40267 // VPERMI2W/VPERMI2B are 3 uops on Skylake and Icelake so we require a
40268 // higher depth before combining them.
40269 bool AllowBWIVPERMV3 =
40270 (Depth >= (VariableCrossLaneShuffleDepth + 2) || HasVariableMask);
40271
40272 bool MaskContainsZeros = isAnyZero(Mask);
40273
40274 if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
40275 // If we have a single input lane-crossing shuffle then lower to VPERMV.
40276 if (UnaryShuffle && AllowVariableCrossLaneMask && !MaskContainsZeros) {
40277 if (Subtarget.hasAVX2() &&
40278 (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) {
40279 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
40280 Res = CanonicalizeShuffleInput(MaskVT, V1);
40281 Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
40282 return DAG.getBitcast(RootVT, Res);
40283 }
40284 // AVX512 variants (non-VLX will pad to 512-bit shuffles).
40285 if ((Subtarget.hasAVX512() &&
40286 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
40287 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
40288 (Subtarget.hasBWI() &&
40289 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
40290 (Subtarget.hasVBMI() &&
40291 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8))) {
40292 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40293 V2 = DAG.getUNDEF(MaskVT);
40294 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40295 return DAG.getBitcast(RootVT, Res);
40296 }
40297 }
40298
40299 // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
40300 // vector as the second source (non-VLX will pad to 512-bit shuffles).
40301 if (UnaryShuffle && AllowVariableCrossLaneMask &&
40302 ((Subtarget.hasAVX512() &&
40303 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
40304 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
40305 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32 ||
40306 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
40307 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
40308 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
40309 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
40310 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
40311 // Adjust shuffle mask - replace SM_SentinelZero with second source index.
40312 for (unsigned i = 0; i != NumMaskElts; ++i)
40313 if (Mask[i] == SM_SentinelZero)
40314 Mask[i] = NumMaskElts + i;
40315 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40316 V2 = getZeroVector(MaskVT, Subtarget, DAG, DL);
40317 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40318 return DAG.getBitcast(RootVT, Res);
40319 }
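    // For example, a v8i64 mask {0, SM_SentinelZero, 2, SM_SentinelZero, 4, 5, 6, 7}
    // is rewritten to {0, 9, 2, 11, 4, 5, 6, 7}, with indices 8..15 reading
    // from the all-zeros second source of the resulting VPERMV3 node.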
40320
40321 // If that failed and either input is extracted then try to combine as a
40322 // shuffle with the larger type.
40323 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
40324 Inputs, Root, BaseMask, Depth, HasVariableMask,
40325 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG,
40326 Subtarget))
40327 return WideShuffle;
40328
40329 // If we have a dual input lane-crossing shuffle then lower to VPERMV3,
40330 // (non-VLX will pad to 512-bit shuffles).
40331 if (AllowVariableCrossLaneMask && !MaskContainsZeros &&
40332 ((Subtarget.hasAVX512() &&
40333 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
40334 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
40335 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32 ||
40336 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
40337 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
40338 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
40339 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
40340 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
40341 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40342 V2 = CanonicalizeShuffleInput(MaskVT, V2);
40343 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40344 return DAG.getBitcast(RootVT, Res);
40345 }
40346 return SDValue();
40347 }
40348
40349 // See if we can combine a single input shuffle with zeros to a bit-mask,
40350 // which is much simpler than any shuffle.
40351 if (UnaryShuffle && MaskContainsZeros && AllowVariablePerLaneMask &&
40352 isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
40353 DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
40354 APInt Zero = APInt::getZero(MaskEltSizeInBits);
40355 APInt AllOnes = APInt::getAllOnes(MaskEltSizeInBits);
40356 APInt UndefElts(NumMaskElts, 0);
40357 SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
40358 for (unsigned i = 0; i != NumMaskElts; ++i) {
40359 int M = Mask[i];
40360 if (M == SM_SentinelUndef) {
40361 UndefElts.setBit(i);
40362 continue;
40363 }
40364 if (M == SM_SentinelZero)
40365 continue;
40366 EltBits[i] = AllOnes;
40367 }
40368 SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
40369 Res = CanonicalizeShuffleInput(MaskVT, V1);
40370 unsigned AndOpcode =
40371 MaskVT.isFloatingPoint() ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
40372 Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
40373 return DAG.getBitcast(RootVT, Res);
40374 }
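  // For example, a v4i32 mask {0, 1, SM_SentinelZero, 3} passes the
  // sequential-or-zero check and becomes a single AND with the constant vector
  // {0xFFFFFFFF, 0xFFFFFFFF, 0, 0xFFFFFFFF}.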
40375
40376  // If we have a single input shuffle with different shuffle patterns in the
40377  // 128-bit lanes, use the variable mask to VPERMILPS.
40378 // TODO Combine other mask types at higher depths.
40379 if (UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
40380 ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
40381 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
40382 SmallVector<SDValue, 16> VPermIdx;
40383 for (int M : Mask) {
40384 SDValue Idx =
40385 M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
40386 VPermIdx.push_back(Idx);
40387 }
40388 SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
40389 Res = CanonicalizeShuffleInput(MaskVT, V1);
40390 Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
40391 return DAG.getBitcast(RootVT, Res);
40392 }
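  // For example, a v8f32 mask {1, 0, 3, 2, 6, 7, 4, 5} (no 128-bit lane
  // crossing) produces the per-lane index vector {1, 0, 3, 2, 2, 3, 0, 1} for
  // the VPERMILPV node.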
40393
40394 // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
40395 // to VPERMIL2PD/VPERMIL2PS.
40396 if (AllowVariablePerLaneMask && Subtarget.hasXOP() &&
40397 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
40398 MaskVT == MVT::v8f32)) {
40399 // VPERMIL2 Operation.
40400 // Bits[3] - Match Bit.
40401 // Bits[2:1] - (Per Lane) PD Shuffle Mask.
40402 // Bits[2:0] - (Per Lane) PS Shuffle Mask.
40403 unsigned NumLanes = MaskVT.getSizeInBits() / 128;
40404 unsigned NumEltsPerLane = NumMaskElts / NumLanes;
40405 SmallVector<int, 8> VPerm2Idx;
40406 unsigned M2ZImm = 0;
40407 for (int M : Mask) {
40408 if (M == SM_SentinelUndef) {
40409 VPerm2Idx.push_back(-1);
40410 continue;
40411 }
40412 if (M == SM_SentinelZero) {
40413 M2ZImm = 2;
40414 VPerm2Idx.push_back(8);
40415 continue;
40416 }
40417 int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
40418 Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
40419 VPerm2Idx.push_back(Index);
40420 }
40421 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40422 V2 = CanonicalizeShuffleInput(MaskVT, V2);
40423 SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
40424 Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
40425 DAG.getTargetConstant(M2ZImm, DL, MVT::i8));
40426 return DAG.getBitcast(RootVT, Res);
40427 }
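  // For example, with MaskVT == v4f32 a mask element M == 5 (element 1 of V2)
  // maps to the selector value 5: bits [1:0] pick the element within the lane
  // and bit 2 picks the second source. A zero element instead pushes the
  // match-bit value 8 and sets M2ZImm = 2 so matched lanes are zeroed.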
40428
40429 // If we have 3 or more shuffle instructions or a chain involving a variable
40430 // mask, we can replace them with a single PSHUFB instruction profitably.
40431  // Intel's manuals suggest only using PSHUFB if doing so replaces 5
40432 // instructions, but in practice PSHUFB tends to be *very* fast so we're
40433 // more aggressive.
40434 if (UnaryShuffle && AllowVariablePerLaneMask &&
40435 ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
40436 (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
40437 (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
40438 SmallVector<SDValue, 16> PSHUFBMask;
40439 int NumBytes = RootVT.getSizeInBits() / 8;
40440 int Ratio = NumBytes / NumMaskElts;
40441 for (int i = 0; i < NumBytes; ++i) {
40442 int M = Mask[i / Ratio];
40443 if (M == SM_SentinelUndef) {
40444 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
40445 continue;
40446 }
40447 if (M == SM_SentinelZero) {
40448 PSHUFBMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
40449 continue;
40450 }
40451 M = Ratio * M + i % Ratio;
40452      assert((M / 16) == (i / 16) && "Lane crossing detected");
40453 PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
40454 }
40455 MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
40456 Res = CanonicalizeShuffleInput(ByteVT, V1);
40457 SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
40458 Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
40459 return DAG.getBitcast(RootVT, Res);
40460 }
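  // For example, a 128-bit root with a v4i32 mask {1, 0, 3, 2} has Ratio == 4
  // and expands to the byte mask {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
  // zeroable elements instead emit 0x80, which PSHUFB treats as "write zero".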
40461
40462 // With XOP, if we have a 128-bit binary input shuffle we can always combine
40463 // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
40464 // slower than PSHUFB on targets that support both.
40465 if (AllowVariablePerLaneMask && RootVT.is128BitVector() &&
40466 Subtarget.hasXOP()) {
40467 // VPPERM Mask Operation
40468 // Bits[4:0] - Byte Index (0 - 31)
40469 // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
40470 SmallVector<SDValue, 16> VPPERMMask;
40471 int NumBytes = 16;
40472 int Ratio = NumBytes / NumMaskElts;
40473 for (int i = 0; i < NumBytes; ++i) {
40474 int M = Mask[i / Ratio];
40475 if (M == SM_SentinelUndef) {
40476 VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
40477 continue;
40478 }
40479 if (M == SM_SentinelZero) {
40480 VPPERMMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
40481 continue;
40482 }
40483 M = Ratio * M + i % Ratio;
40484 VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
40485 }
40486 MVT ByteVT = MVT::v16i8;
40487 V1 = CanonicalizeShuffleInput(ByteVT, V1);
40488 V2 = CanonicalizeShuffleInput(ByteVT, V2);
40489 SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
40490 Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
40491 return DAG.getBitcast(RootVT, Res);
40492 }
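  // Unlike PSHUFB, VPPERM byte indices 16..31 select from the second source,
  // so e.g. a two-input v16i8 interleave mask {0, 16, 1, 17, ...} is emitted
  // directly (Ratio == 1) without needing a separate blend.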
40493
40494 // If that failed and either input is extracted then try to combine as a
40495 // shuffle with the larger type.
40496 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
40497 Inputs, Root, BaseMask, Depth, HasVariableMask,
40498 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG, Subtarget))
40499 return WideShuffle;
40500
40501 // If we have a dual input shuffle then lower to VPERMV3,
40502 // (non-VLX will pad to 512-bit shuffles)
40503 if (!UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
40504 ((Subtarget.hasAVX512() &&
40505 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v8f64 ||
40506 MaskVT == MVT::v2i64 || MaskVT == MVT::v4i64 || MaskVT == MVT::v8i64 ||
40507 MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 || MaskVT == MVT::v8f32 ||
40508 MaskVT == MVT::v8i32 || MaskVT == MVT::v16f32 ||
40509 MaskVT == MVT::v16i32)) ||
40510 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
40511 (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16 ||
40512 MaskVT == MVT::v32i16)) ||
40513 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
40514 (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8 ||
40515 MaskVT == MVT::v64i8)))) {
40516 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40517 V2 = CanonicalizeShuffleInput(MaskVT, V2);
40518 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40519 return DAG.getBitcast(RootVT, Res);
40520 }
40521
40522 // Failed to find any combines.
40523 return SDValue();
40524}
40525
40526// Combine an arbitrary chain of shuffles + extract_subvectors into a single
40527// instruction if possible.
40528//
40529// Wrapper for combineX86ShuffleChain that extends the shuffle mask to a larger
40530// type size to attempt to combine:
40531// shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1)
40532// -->
40533// extract_subvector(shuffle(x,y,m2),0)
40534static SDValue combineX86ShuffleChainWithExtract(
40535 ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
40536 bool HasVariableMask, bool AllowVariableCrossLaneMask,
40537 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
40538 const X86Subtarget &Subtarget) {
40539 unsigned NumMaskElts = BaseMask.size();
40540 unsigned NumInputs = Inputs.size();
40541 if (NumInputs == 0)
40542 return SDValue();
40543
40544 EVT RootVT = Root.getValueType();
40545 unsigned RootSizeInBits = RootVT.getSizeInBits();
40546 unsigned RootEltSizeInBits = RootSizeInBits / NumMaskElts;
40547  assert((RootSizeInBits % NumMaskElts) == 0 && "Unexpected root shuffle mask");
40548
40549 // Peek through extract_subvector to find widest legal vector.
40550 // TODO: Handle ISD::TRUNCATE
40551 unsigned WideSizeInBits = RootSizeInBits;
40552 for (unsigned I = 0; I != NumInputs; ++I) {
40553 SDValue Input = peekThroughBitcasts(Inputs[I]);
40554 while (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR)
40555 Input = peekThroughBitcasts(Input.getOperand(0));
40556 if (DAG.getTargetLoweringInfo().isTypeLegal(Input.getValueType()) &&
40557 WideSizeInBits < Input.getValueSizeInBits())
40558 WideSizeInBits = Input.getValueSizeInBits();
40559 }
40560
40561 // Bail if we fail to find a source larger than the existing root.
40562 unsigned Scale = WideSizeInBits / RootSizeInBits;
40563 if (WideSizeInBits <= RootSizeInBits ||
40564 (WideSizeInBits % RootSizeInBits) != 0)
40565 return SDValue();
40566
40567 // Create new mask for larger type.
40568 SmallVector<int, 64> WideMask(BaseMask);
40569 for (int &M : WideMask) {
40570 if (M < 0)
40571 continue;
40572 M = (M % NumMaskElts) + ((M / NumMaskElts) * Scale * NumMaskElts);
40573 }
40574 WideMask.append((Scale - 1) * NumMaskElts, SM_SentinelUndef);
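  // For example, widening a 128-bit root (NumMaskElts == 4) to 256 bits
  // (Scale == 2) maps BaseMask = {0, 5, 1, 4} to {0, 9, 1, 8} and then pads
  // the mask with four SM_SentinelUndef entries.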
40575
40576 // Attempt to peek through inputs and adjust mask when we extract from an
40577 // upper subvector.
40578 int AdjustedMasks = 0;
40579 SmallVector<SDValue, 4> WideInputs(Inputs.begin(), Inputs.end());
40580 for (unsigned I = 0; I != NumInputs; ++I) {
40581 SDValue &Input = WideInputs[I];
40582 Input = peekThroughBitcasts(Input);
40583 while (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
40584 Input.getOperand(0).getValueSizeInBits() <= WideSizeInBits) {
40585 uint64_t Idx = Input.getConstantOperandVal(1);
40586 if (Idx != 0) {
40587 ++AdjustedMasks;
40588 unsigned InputEltSizeInBits = Input.getScalarValueSizeInBits();
40589 Idx = (Idx * InputEltSizeInBits) / RootEltSizeInBits;
40590
40591 int lo = I * WideMask.size();
40592 int hi = (I + 1) * WideMask.size();
40593 for (int &M : WideMask)
40594 if (lo <= M && M < hi)
40595 M += Idx;
40596 }
40597 Input = peekThroughBitcasts(Input.getOperand(0));
40598 }
40599 }
40600
40601 // Remove unused/repeated shuffle source ops.
40602 resolveTargetShuffleInputsAndMask(WideInputs, WideMask);
40603  assert(!WideInputs.empty() && "Shuffle with no inputs detected");
40604
40605 // Bail if we're always extracting from the lowest subvectors,
40606 // combineX86ShuffleChain should match this for the current width, or the
40607 // shuffle still references too many inputs.
40608 if (AdjustedMasks == 0 || WideInputs.size() > 2)
40609 return SDValue();
40610
40611 // Minor canonicalization of the accumulated shuffle mask to make it easier
40612 // to match below. All this does is detect masks with sequential pairs of
40613 // elements, and shrink them to the half-width mask. It does this in a loop
40614 // so it will reduce the size of the mask to the minimal width mask which
40615 // performs an equivalent shuffle.
40616 while (WideMask.size() > 1) {
40617 SmallVector<int, 64> WidenedMask;
40618 if (!canWidenShuffleElements(WideMask, WidenedMask))
40619 break;
40620 WideMask = std::move(WidenedMask);
40621 }
40622
40623 // Canonicalization of binary shuffle masks to improve pattern matching by
40624 // commuting the inputs.
40625 if (WideInputs.size() == 2 && canonicalizeShuffleMaskWithCommute(WideMask)) {
40626 ShuffleVectorSDNode::commuteMask(WideMask);
40627 std::swap(WideInputs[0], WideInputs[1]);
40628 }
40629
40630 // Increase depth for every upper subvector we've peeked through.
40631 Depth += AdjustedMasks;
40632
40633 // Attempt to combine wider chain.
40634 // TODO: Can we use a better Root?
40635 SDValue WideRoot = WideInputs.front().getValueSizeInBits() >
40636 WideInputs.back().getValueSizeInBits()
40637 ? WideInputs.front()
40638 : WideInputs.back();
40639  assert(WideRoot.getValueSizeInBits() == WideSizeInBits &&
40640         "WideRootSize mismatch");
40641
40642 if (SDValue WideShuffle =
40643 combineX86ShuffleChain(WideInputs, WideRoot, WideMask, Depth,
40644 HasVariableMask, AllowVariableCrossLaneMask,
40645 AllowVariablePerLaneMask, DAG, Subtarget)) {
40646 WideShuffle =
40647 extractSubVector(WideShuffle, 0, DAG, SDLoc(Root), RootSizeInBits);
40648 return DAG.getBitcast(RootVT, WideShuffle);
40649 }
40650
40651 return SDValue();
40652}
40653
40654// Canonicalize the combined shuffle mask chain with horizontal ops.
40655// NOTE: This may update the Ops and Mask.
40656static SDValue canonicalizeShuffleMaskWithHorizOp(
40657 MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
40658 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
40659 const X86Subtarget &Subtarget) {
40660 if (Mask.empty() || Ops.empty())
40661 return SDValue();
40662
40663 SmallVector<SDValue> BC;
40664 for (SDValue Op : Ops)
40665 BC.push_back(peekThroughBitcasts(Op));
40666
40667 // All ops must be the same horizop + type.
40668 SDValue BC0 = BC[0];
40669 EVT VT0 = BC0.getValueType();
40670 unsigned Opcode0 = BC0.getOpcode();
40671 if (VT0.getSizeInBits() != RootSizeInBits || llvm::any_of(BC, [&](SDValue V) {
40672 return V.getOpcode() != Opcode0 || V.getValueType() != VT0;
40673 }))
40674 return SDValue();
40675
40676 bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
40677 Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB);
40678 bool isPack = (Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS);
40679 if (!isHoriz && !isPack)
40680 return SDValue();
40681
40682 // Do all ops have a single use?
40683 bool OneUseOps = llvm::all_of(Ops, [](SDValue Op) {
40684 return Op.hasOneUse() &&
40685 peekThroughBitcasts(Op) == peekThroughOneUseBitcasts(Op);
40686 });
40687
40688 int NumElts = VT0.getVectorNumElements();
40689 int NumLanes = VT0.getSizeInBits() / 128;
40690 int NumEltsPerLane = NumElts / NumLanes;
40691 int NumHalfEltsPerLane = NumEltsPerLane / 2;
40692 MVT SrcVT = BC0.getOperand(0).getSimpleValueType();
40693 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
40694
40695 if (NumEltsPerLane >= 4 &&
40696 (isPack || shouldUseHorizontalOp(Ops.size() == 1, DAG, Subtarget))) {
40697 SmallVector<int> LaneMask, ScaledMask;
40698 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, LaneMask) &&
40699 scaleShuffleElements(LaneMask, 4, ScaledMask)) {
40700 // See if we can remove the shuffle by resorting the HOP chain so that
40701 // the HOP args are pre-shuffled.
40702 // TODO: Generalize to any sized/depth chain.
40703 // TODO: Add support for PACKSS/PACKUS.
40704 if (isHoriz) {
40705 // Attempt to find a HOP(HOP(X,Y),HOP(Z,W)) source operand.
40706 auto GetHOpSrc = [&](int M) {
40707 if (M == SM_SentinelUndef)
40708 return DAG.getUNDEF(VT0);
40709 if (M == SM_SentinelZero)
40710 return getZeroVector(VT0.getSimpleVT(), Subtarget, DAG, DL);
40711 SDValue Src0 = BC[M / 4];
40712 SDValue Src1 = Src0.getOperand((M % 4) >= 2);
40713 if (Src1.getOpcode() == Opcode0 && Src0->isOnlyUserOf(Src1.getNode()))
40714 return Src1.getOperand(M % 2);
40715 return SDValue();
40716 };
40717 SDValue M0 = GetHOpSrc(ScaledMask[0]);
40718 SDValue M1 = GetHOpSrc(ScaledMask[1]);
40719 SDValue M2 = GetHOpSrc(ScaledMask[2]);
40720 SDValue M3 = GetHOpSrc(ScaledMask[3]);
40721 if (M0 && M1 && M2 && M3) {
40722 SDValue LHS = DAG.getNode(Opcode0, DL, SrcVT, M0, M1);
40723 SDValue RHS = DAG.getNode(Opcode0, DL, SrcVT, M2, M3);
40724 return DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
40725 }
40726 }
40727 // shuffle(hop(x,y),hop(z,w)) -> permute(hop(x,z)) etc.
40728 if (Ops.size() >= 2) {
40729 SDValue LHS, RHS;
40730 auto GetHOpSrc = [&](int M, int &OutM) {
40731 // TODO: Support SM_SentinelZero
40732 if (M < 0)
40733 return M == SM_SentinelUndef;
40734 SDValue Src = BC[M / 4].getOperand((M % 4) >= 2);
40735 if (!LHS || LHS == Src) {
40736 LHS = Src;
40737 OutM = (M % 2);
40738 return true;
40739 }
40740 if (!RHS || RHS == Src) {
40741 RHS = Src;
40742 OutM = (M % 2) + 2;
40743 return true;
40744 }
40745 return false;
40746 };
40747 int PostMask[4] = {-1, -1, -1, -1};
40748 if (GetHOpSrc(ScaledMask[0], PostMask[0]) &&
40749 GetHOpSrc(ScaledMask[1], PostMask[1]) &&
40750 GetHOpSrc(ScaledMask[2], PostMask[2]) &&
40751 GetHOpSrc(ScaledMask[3], PostMask[3])) {
40752 LHS = DAG.getBitcast(SrcVT, LHS);
40753 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
40754 SDValue Res = DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
40755 // Use SHUFPS for the permute so this will work on SSE3 targets,
40756 // shuffle combining and domain handling will simplify this later on.
40757 MVT ShuffleVT = MVT::getVectorVT(MVT::f32, RootSizeInBits / 32);
40758 Res = DAG.getBitcast(ShuffleVT, Res);
40759 return DAG.getNode(X86ISD::SHUFP, DL, ShuffleVT, Res, Res,
40760 getV4X86ShuffleImm8ForMask(PostMask, DL, DAG));
40761 }
40762 }
40763 }
40764 }
40765
40766 if (2 < Ops.size())
40767 return SDValue();
40768
40769 SDValue BC1 = BC[BC.size() - 1];
40770 if (Mask.size() == VT0.getVectorNumElements()) {
40771 // Canonicalize binary shuffles of horizontal ops that use the
40772    // same sources to a unary shuffle.
40773 // TODO: Try to perform this fold even if the shuffle remains.
40774 if (Ops.size() == 2) {
40775 auto ContainsOps = [](SDValue HOp, SDValue Op) {
40776 return Op == HOp.getOperand(0) || Op == HOp.getOperand(1);
40777 };
40778 // Commute if all BC0's ops are contained in BC1.
40779 if (ContainsOps(BC1, BC0.getOperand(0)) &&
40780 ContainsOps(BC1, BC0.getOperand(1))) {
40781 ShuffleVectorSDNode::commuteMask(Mask);
40782 std::swap(Ops[0], Ops[1]);
40783 std::swap(BC0, BC1);
40784 }
40785
40786 // If BC1 can be represented by BC0, then convert to unary shuffle.
40787 if (ContainsOps(BC0, BC1.getOperand(0)) &&
40788 ContainsOps(BC0, BC1.getOperand(1))) {
40789 for (int &M : Mask) {
40790 if (M < NumElts) // BC0 element or UNDEF/Zero sentinel.
40791 continue;
40792 int SubLane = ((M % NumEltsPerLane) >= NumHalfEltsPerLane) ? 1 : 0;
40793 M -= NumElts + (SubLane * NumHalfEltsPerLane);
40794 if (BC1.getOperand(SubLane) != BC0.getOperand(0))
40795 M += NumHalfEltsPerLane;
40796 }
40797 }
40798 }
40799
40800 // Canonicalize unary horizontal ops to only refer to lower halves.
40801 for (int i = 0; i != NumElts; ++i) {
40802 int &M = Mask[i];
40803 if (isUndefOrZero(M))
40804 continue;
40805 if (M < NumElts && BC0.getOperand(0) == BC0.getOperand(1) &&
40806 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
40807 M -= NumHalfEltsPerLane;
40808 if (NumElts <= M && BC1.getOperand(0) == BC1.getOperand(1) &&
40809 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
40810 M -= NumHalfEltsPerLane;
40811 }
40812 }
40813
40814 // Combine binary shuffle of 2 similar 'Horizontal' instructions into a
40815 // single instruction. Attempt to match a v2X64 repeating shuffle pattern that
40816 // represents the LHS/RHS inputs for the lower/upper halves.
40817 SmallVector<int, 16> TargetMask128, WideMask128;
40818 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, TargetMask128) &&
40819 scaleShuffleElements(TargetMask128, 2, WideMask128)) {
40820    assert(isUndefOrZeroOrInRange(WideMask128, 0, 4) && "Illegal shuffle");
40821 bool SingleOp = (Ops.size() == 1);
40822 if (isPack || OneUseOps ||
40823 shouldUseHorizontalOp(SingleOp, DAG, Subtarget)) {
40824 SDValue Lo = isInRange(WideMask128[0], 0, 2) ? BC0 : BC1;
40825 SDValue Hi = isInRange(WideMask128[1], 0, 2) ? BC0 : BC1;
40826 Lo = Lo.getOperand(WideMask128[0] & 1);
40827 Hi = Hi.getOperand(WideMask128[1] & 1);
40828 if (SingleOp) {
40829 SDValue Undef = DAG.getUNDEF(SrcVT);
40830 SDValue Zero = getZeroVector(SrcVT, Subtarget, DAG, DL);
40831 Lo = (WideMask128[0] == SM_SentinelZero ? Zero : Lo);
40832 Hi = (WideMask128[1] == SM_SentinelZero ? Zero : Hi);
40833 Lo = (WideMask128[0] == SM_SentinelUndef ? Undef : Lo);
40834 Hi = (WideMask128[1] == SM_SentinelUndef ? Undef : Hi);
40835 }
40836 return DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
40837 }
40838 }
40839
40840 return SDValue();
40841}
40842
40843// Attempt to constant fold all of the constant source ops.
40844// Returns true if the entire shuffle is folded to a constant.
40845// TODO: Extend this to merge multiple constant Ops and update the mask.
40846static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops,
40847 ArrayRef<int> Mask, SDValue Root,
40848 bool HasVariableMask,
40849 SelectionDAG &DAG,
40850 const X86Subtarget &Subtarget) {
40851 MVT VT = Root.getSimpleValueType();
40852
40853 unsigned SizeInBits = VT.getSizeInBits();
40854 unsigned NumMaskElts = Mask.size();
40855 unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
40856 unsigned NumOps = Ops.size();
40857
40858 // Extract constant bits from each source op.
40859 SmallVector<APInt, 16> UndefEltsOps(NumOps);
40860 SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
40861 for (unsigned I = 0; I != NumOps; ++I)
40862 if (!getTargetConstantBitsFromNode(Ops[I], MaskSizeInBits, UndefEltsOps[I],
40863 RawBitsOps[I]))
40864 return SDValue();
40865
40866 // If we're optimizing for size, only fold if at least one of the constants is
40867 // only used once or the combined shuffle has included a variable mask
40868 // shuffle, this is to avoid constant pool bloat.
40869 bool IsOptimizingSize = DAG.shouldOptForSize();
40870 if (IsOptimizingSize && !HasVariableMask &&
40871 llvm::none_of(Ops, [](SDValue SrcOp) { return SrcOp->hasOneUse(); }))
40872 return SDValue();
40873
40874 // Shuffle the constant bits according to the mask.
40875 SDLoc DL(Root);
40876 APInt UndefElts(NumMaskElts, 0);
40877 APInt ZeroElts(NumMaskElts, 0);
40878 APInt ConstantElts(NumMaskElts, 0);
40879 SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
40880 APInt::getZero(MaskSizeInBits));
40881 for (unsigned i = 0; i != NumMaskElts; ++i) {
40882 int M = Mask[i];
40883 if (M == SM_SentinelUndef) {
40884 UndefElts.setBit(i);
40885 continue;
40886 } else if (M == SM_SentinelZero) {
40887 ZeroElts.setBit(i);
40888 continue;
40889 }
40890    assert(0 <= M && M < (int)(NumMaskElts * NumOps));
40891
40892 unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
40893 unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
40894
40895 auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
40896 if (SrcUndefElts[SrcMaskIdx]) {
40897 UndefElts.setBit(i);
40898 continue;
40899 }
40900
40901 auto &SrcEltBits = RawBitsOps[SrcOpIdx];
40902 APInt &Bits = SrcEltBits[SrcMaskIdx];
40903 if (!Bits) {
40904 ZeroElts.setBit(i);
40905 continue;
40906 }
40907
40908 ConstantElts.setBit(i);
40909 ConstantBitData[i] = Bits;
40910 }
40911  assert((UndefElts | ZeroElts | ConstantElts).isAllOnes());
40912
40913 // Attempt to create a zero vector.
40914 if ((UndefElts | ZeroElts).isAllOnes())
40915 return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG, DL);
40916
40917 // Create the constant data.
40918 MVT MaskSVT;
40919 if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
40920 MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
40921 else
40922 MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
40923
40924 MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
40925 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
40926 return SDValue();
40927
40928 SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
40929 return DAG.getBitcast(VT, CstOp);
40930}
40931
40932namespace llvm {
40933 namespace X86 {
40934 enum {
40935 MaxShuffleCombineDepth = 8
40936 };
40937 }
40938} // namespace llvm
40939
40940/// Fully generic combining of x86 shuffle instructions.
40941///
40942/// This should be the last combine run over the x86 shuffle instructions. Once
40943/// they have been fully optimized, this will recursively consider all chains
40944/// of single-use shuffle instructions, build a generic model of the cumulative
40945/// shuffle operation, and check for simpler instructions which implement this
40946/// operation. We use this primarily for two purposes:
40947///
40948/// 1) Collapse generic shuffles to specialized single instructions when
40949/// equivalent. In most cases, this is just an encoding size win, but
40950/// sometimes we will collapse multiple generic shuffles into a single
40951/// special-purpose shuffle.
40952/// 2) Look for sequences of shuffle instructions with 3 or more total
40953/// instructions, and replace them with the slightly more expensive SSSE3
40954/// PSHUFB instruction if available. We do this as the last combining step
40955/// to ensure we avoid using PSHUFB if we can implement the shuffle with
40956/// a suitable short sequence of other instructions. The PSHUFB will either
40957/// use a register or have to read from memory and so is slightly (but only
40958/// slightly) more expensive than the other shuffle instructions.
40959///
40960/// Because this is inherently a quadratic operation (for each shuffle in
40961/// a chain, we recurse up the chain), the depth is limited to 8 instructions.
40962/// This should never be an issue in practice as the shuffle lowering doesn't
40963/// produce sequences of more than 8 instructions.
40964///
40965/// FIXME: We will currently miss some cases where the redundant shuffling
40966/// would simplify under the threshold for PSHUFB formation because of
40967/// combine-ordering. To fix this, we should do the redundant instruction
40968/// combining in this recursive walk.
40969static SDValue combineX86ShufflesRecursively(
40970 ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,
40971 ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
40972 unsigned MaxDepth, bool HasVariableMask, bool AllowVariableCrossLaneMask,
40973 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
40974 const X86Subtarget &Subtarget) {
40975  assert(!RootMask.empty() &&
40976         (RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) &&
40977         "Illegal shuffle root mask");
40978 MVT RootVT = Root.getSimpleValueType();
40979  assert(RootVT.isVector() && "Shuffles operate on vector types!");
40980 unsigned RootSizeInBits = RootVT.getSizeInBits();
40981
40982 // Bound the depth of our recursive combine because this is ultimately
40983 // quadratic in nature.
40984 if (Depth >= MaxDepth)
40985 return SDValue();
40986
40987 // Directly rip through bitcasts to find the underlying operand.
40988 SDValue Op = SrcOps[SrcOpIndex];
40989 Op = peekThroughOneUseBitcasts(Op);
40990
40991 EVT VT = Op.getValueType();
40992 if (!VT.isVector() || !VT.isSimple())
40993 return SDValue(); // Bail if we hit a non-simple non-vector.
40994
40995 // FIXME: Just bail on f16 for now.
40996 if (VT.getVectorElementType() == MVT::f16)
40997 return SDValue();
40998
40999  assert((RootSizeInBits % VT.getSizeInBits()) == 0 &&
41000         "Can only combine shuffles upto size of the root op.");
41001
41002 // Create a demanded elts mask from the referenced elements of Op.
41003 APInt OpDemandedElts = APInt::getZero(RootMask.size());
41004 for (int M : RootMask) {
41005 int BaseIdx = RootMask.size() * SrcOpIndex;
41006 if (isInRange(M, BaseIdx, BaseIdx + RootMask.size()))
41007 OpDemandedElts.setBit(M - BaseIdx);
41008 }
41009 if (RootSizeInBits != VT.getSizeInBits()) {
41010 // Op is smaller than Root - extract the demanded elts for the subvector.
41011 unsigned Scale = RootSizeInBits / VT.getSizeInBits();
41012 unsigned NumOpMaskElts = RootMask.size() / Scale;
41013 assert((RootMask.size() % Scale) == 0 && "Root mask size mismatch");
41014 assert(OpDemandedElts
41015 .extractBits(RootMask.size() - NumOpMaskElts, NumOpMaskElts)
41016 .isZero() &&
41017 "Out of range elements referenced in root mask");
41018 OpDemandedElts = OpDemandedElts.extractBits(NumOpMaskElts, 0);
41019 }
41020 OpDemandedElts =
41021 APIntOps::ScaleBitMask(OpDemandedElts, VT.getVectorNumElements());
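For illustration only, here is a minimal standalone sketch of the demanded-elements rescaling idea (not part of X86ISelLowering.cpp; the helper name widenDemandedMask is hypothetical, and plain integers stand in for APInt):

#include <cassert>
#include <cstdint>

// Widen a demanded-elements bitmask from OldElts to NewElts (assumed to be a
// whole multiple): each demanded element expands to Scale adjacent elements.
static uint64_t widenDemandedMask(uint64_t Mask, unsigned OldElts,
                                  unsigned NewElts) {
  assert(NewElts % OldElts == 0 && "expected a whole scale factor");
  unsigned Scale = NewElts / OldElts;
  uint64_t Wide = 0;
  for (unsigned i = 0; i != OldElts; ++i)
    if (Mask & (uint64_t(1) << i))
      for (unsigned j = 0; j != Scale; ++j)
        Wide |= uint64_t(1) << (i * Scale + j);
  return Wide;
}

int main() {
  // Demanding elements {0, 2} of a 4-element vector corresponds to elements
  // {0, 1, 4, 5} once the mask is rescaled to 8 elements.
  assert(widenDemandedMask(0x5, 4, 8) == 0x33);
}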
41022
41023 // Extract target shuffle mask and resolve sentinels and inputs.
41024 SmallVector<int, 64> OpMask;
41025 SmallVector<SDValue, 2> OpInputs;
41026 APInt OpUndef, OpZero;
41027 bool IsOpVariableMask = isTargetShuffleVariableMask(Op.getOpcode());
41028 if (getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef,
41029 OpZero, DAG, Depth, false)) {
41030 // Shuffle inputs must not be larger than the shuffle result.
41031 // TODO: Relax this for single input faux shuffles (e.g. trunc).
41032 if (llvm::any_of(OpInputs, [VT](SDValue OpInput) {
41033 return OpInput.getValueSizeInBits() > VT.getSizeInBits();
41034 }))
41035 return SDValue();
41036 } else if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
41037 (RootSizeInBits % Op.getOperand(0).getValueSizeInBits()) == 0 &&
41038 !isNullConstant(Op.getOperand(1))) {
41039 SDValue SrcVec = Op.getOperand(0);
41040 int ExtractIdx = Op.getConstantOperandVal(1);
41041 unsigned NumElts = VT.getVectorNumElements();
41042 OpInputs.assign({SrcVec});
41043 OpMask.assign(NumElts, SM_SentinelUndef);
41044 std::iota(OpMask.begin(), OpMask.end(), ExtractIdx);
41045 OpZero = OpUndef = APInt::getZero(NumElts);
41046 } else {
41047 return SDValue();
41048 }
41049
41050 // If the shuffle result was smaller than the root, we need to adjust the
41051 // mask indices and pad the mask with undefs.
41052 if (RootSizeInBits > VT.getSizeInBits()) {
41053 unsigned NumSubVecs = RootSizeInBits / VT.getSizeInBits();
41054 unsigned OpMaskSize = OpMask.size();
41055 if (OpInputs.size() > 1) {
41056 unsigned PaddedMaskSize = NumSubVecs * OpMaskSize;
41057 for (int &M : OpMask) {
41058 if (M < 0)
41059 continue;
41060 int EltIdx = M % OpMaskSize;
41061 int OpIdx = M / OpMaskSize;
41062 M = (PaddedMaskSize * OpIdx) + EltIdx;
41063 }
41064 }
41065 OpZero = OpZero.zext(NumSubVecs * OpMaskSize);
41066 OpUndef = OpUndef.zext(NumSubVecs * OpMaskSize);
41067 OpMask.append((NumSubVecs - 1) * OpMaskSize, SM_SentinelUndef);
41068 }
41069
41070 SmallVector<int, 64> Mask;
41071 SmallVector<SDValue, 16> Ops;
41072
41073 // We don't need to merge masks if the root is empty.
41074 bool EmptyRoot = (Depth == 0) && (RootMask.size() == 1);
41075 if (EmptyRoot) {
41076 // Only resolve zeros if it will remove an input, otherwise we might end
41077 // up in an infinite loop.
41078 bool ResolveKnownZeros = true;
41079 if (!OpZero.isZero()) {
41080 APInt UsedInputs = APInt::getZero(OpInputs.size());
41081 for (int i = 0, e = OpMask.size(); i != e; ++i) {
41082 int M = OpMask[i];
41083 if (OpUndef[i] || OpZero[i] || isUndefOrZero(M))
41084 continue;
41085 UsedInputs.setBit(M / OpMask.size());
41086 if (UsedInputs.isAllOnes()) {
41087 ResolveKnownZeros = false;
41088 break;
41089 }
41090 }
41091 }
41092 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero,
41093 ResolveKnownZeros);
41094
41095 Mask = OpMask;
41096 Ops.append(OpInputs.begin(), OpInputs.end());
41097 } else {
41098 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);
41099
41100 // Add the inputs to the Ops list, avoiding duplicates.
41101 Ops.append(SrcOps.begin(), SrcOps.end());
41102
41103 auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
41104 // Attempt to find an existing match.
41105 SDValue InputBC = peekThroughBitcasts(Input);
41106 for (int i = 0, e = Ops.size(); i < e; ++i)
41107 if (InputBC == peekThroughBitcasts(Ops[i]))
41108 return i;
41109 // Match failed - should we replace an existing Op?
41110 if (InsertionPoint >= 0) {
41111 Ops[InsertionPoint] = Input;
41112 return InsertionPoint;
41113 }
41114 // Add to the end of the Ops list.
41115 Ops.push_back(Input);
41116 return Ops.size() - 1;
41117 };
41118
41119 SmallVector<int, 2> OpInputIdx;
41120 for (SDValue OpInput : OpInputs)
41121 OpInputIdx.push_back(
41122 AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));
41123
41124 assert(((RootMask.size() > OpMask.size() &&
41125 RootMask.size() % OpMask.size() == 0) ||
41126 (OpMask.size() > RootMask.size() &&
41127 OpMask.size() % RootMask.size() == 0) ||
41128 OpMask.size() == RootMask.size()) &&
41129 "The smaller number of elements must divide the larger.");
41130
41131 // This function can be performance-critical, so we rely on the power-of-2
41132 // knowledge that we have about the mask sizes to replace div/rem ops with
41133 // bit-masks and shifts.
41134 assert(llvm::has_single_bit<uint32_t>(RootMask.size()) &&
41135 "Non-power-of-2 shuffle mask sizes");
41136 assert(llvm::has_single_bit<uint32_t>(OpMask.size()) &&
41137 "Non-power-of-2 shuffle mask sizes");
41138 unsigned RootMaskSizeLog2 = llvm::countr_zero(RootMask.size());
41139 unsigned OpMaskSizeLog2 = llvm::countr_zero(OpMask.size());
41140
41141 unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
41142 unsigned RootRatio =
41143 std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
41144 unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
41145 assert((RootRatio == 1 || OpRatio == 1) &&
41146 "Must not have a ratio for both incoming and op masks!");
41147
41148 assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
41149 assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
41150 assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
41151 unsigned RootRatioLog2 = llvm::countr_zero(RootRatio);
41152 unsigned OpRatioLog2 = llvm::countr_zero(OpRatio);
41153
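The power-of-2 comment above (lines 41131-41133) relies on the standard shift/mask identities; a minimal standalone sketch, not part of the listing:

#include <cassert>

int main() {
  // For a power-of-two N with Log2N = countr_zero(N):
  //   X / N == X >> Log2N    and    X % N == X & (N - 1)
  const unsigned N = 8, Log2N = 3;
  for (unsigned X = 0; X != 64; ++X) {
    assert(X / N == (X >> Log2N));
    assert(X % N == (X & (N - 1)));
  }
}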
41154 Mask.resize(MaskWidth, SM_SentinelUndef);
41155
41156 // Merge this shuffle operation's mask into our accumulated mask. Note that
41157 // this shuffle's mask will be the first applied to the input, followed by
41158 // the root mask to get us all the way to the root value arrangement. The
41159 // reason for this order is that we are recursing up the operation chain.
41160 for (unsigned i = 0; i < MaskWidth; ++i) {
41161 unsigned RootIdx = i >> RootRatioLog2;
41162 if (RootMask[RootIdx] < 0) {
41163 // This is a zero or undef lane, we're done.
41164 Mask[i] = RootMask[RootIdx];
41165 continue;
41166 }
41167
41168 unsigned RootMaskedIdx =
41169 RootRatio == 1
41170 ? RootMask[RootIdx]
41171 : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
41172
41173 // Just insert the scaled root mask value if it references an input other
41174 // than the SrcOp we're currently inserting.
41175 if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
41176 (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
41177 Mask[i] = RootMaskedIdx;
41178 continue;
41179 }
41180
41181 RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
41182 unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
41183 if (OpMask[OpIdx] < 0) {
41184 // The incoming lanes are zero or undef, it doesn't matter which ones we
41185 // are using.
41186 Mask[i] = OpMask[OpIdx];
41187 continue;
41188 }
41189
41190 // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
41191 unsigned OpMaskedIdx = OpRatio == 1 ? OpMask[OpIdx]
41192 : (OpMask[OpIdx] << OpRatioLog2) +
41193 (RootMaskedIdx & (OpRatio - 1));
41194
41195 OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
41196 int InputIdx = OpMask[OpIdx] / (int)OpMask.size();
41197 assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input");
41198 OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth;
41199
41200 Mask[i] = OpMaskedIdx;
41201 }
41202 }
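To make the merge order concrete, here is a minimal standalone sketch of the simplest case (single input, RootRatio == OpRatio == 1, no sentinels), where the loop above reduces to composing the two masks as Composed[i] = OpMask[RootMask[i]]:

#include <array>
#include <cassert>

int main() {
  // The op's shuffle is applied to the input first; the root mask then
  // rearranges the op's result, so Composed[i] = OpMask[RootMask[i]].
  std::array<int, 4> RootMask = {2, 3, 0, 1};
  std::array<int, 4> OpMask = {1, 0, 3, 2};
  std::array<int, 4> Composed;
  for (int i = 0; i != 4; ++i)
    Composed[i] = OpMask[RootMask[i]];
  assert((Composed == std::array<int, 4>{3, 2, 1, 0}));
}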
41203
41204 // Peek through vector widenings and set out of bounds mask indices to undef.
41205 // TODO: Can resolveTargetShuffleInputsAndMask do some of this?
41206 for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
41207 SDValue &Op = Ops[I];
41208 if (Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op.getOperand(0).isUndef() &&
41209 isNullConstant(Op.getOperand(2))) {
41210 Op = Op.getOperand(1);
41211 unsigned Scale = RootSizeInBits / Op.getValueSizeInBits();
41212 int Lo = I * Mask.size();
41213 int Hi = (I + 1) * Mask.size();
41214 int NewHi = Lo + (Mask.size() / Scale);
41215 for (int &M : Mask) {
41216 if (Lo <= M && NewHi <= M && M < Hi)
41217 M = SM_SentinelUndef;
41218 }
41219 }
41220 }
41221
41222 // Peek through any free extract_subvector nodes back to root size.
41223 for (SDValue &Op : Ops)
41224 while (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
41225 (RootSizeInBits % Op.getOperand(0).getValueSizeInBits()) == 0 &&
41226 isNullConstant(Op.getOperand(1)))
41227 Op = Op.getOperand(0);
41228
41229 // Remove unused/repeated shuffle source ops.
41230 resolveTargetShuffleInputsAndMask(Ops, Mask);
41231
41232 // Handle the all undef/zero/ones cases early.
41233 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
41234 return DAG.getUNDEF(RootVT);
41235 if (all_of(Mask, [](int Idx) { return Idx < 0; }))
41236 return getZeroVector(RootVT, Subtarget, DAG, SDLoc(Root));
41237 if (Ops.size() == 1 && ISD::isBuildVectorAllOnes(Ops[0].getNode()) &&
41238 !llvm::is_contained(Mask, SM_SentinelZero))
41239 return getOnesVector(RootVT, DAG, SDLoc(Root));
41240
41241 assert(!Ops.empty() && "Shuffle with no inputs detected");
41242 HasVariableMask |= IsOpVariableMask;
41243
41244 // Update the list of shuffle nodes that have been combined so far.
41245 SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
41246 SrcNodes.end());
41247 CombinedNodes.push_back(Op.getNode());
41248
41249 // See if we can recurse into each shuffle source op (if it's a target
41250 // shuffle). The source op should only be generally combined if it either has
41251 // a single use (i.e. current Op) or all its users have already been combined;
41252 // if not, then we can still combine but should prevent generation of variable
41253 // shuffles to avoid constant pool bloat.
41254 // Don't recurse if we already have more source ops than we can combine in
41255 // the remaining recursion depth.
41256 if (Ops.size() < (MaxDepth - Depth)) {
41257 for (int i = 0, e = Ops.size(); i < e; ++i) {
41258 // For empty roots, we need to resolve zeroable elements before combining
41259 // them with other shuffles.
41260 SmallVector<int, 64> ResolvedMask = Mask;
41261 if (EmptyRoot)
41262 resolveTargetShuffleFromZeroables(ResolvedMask, OpUndef, OpZero);
41263 bool AllowCrossLaneVar = false;
41264 bool AllowPerLaneVar = false;
41265 if (Ops[i].getNode()->hasOneUse() ||
41266 SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode())) {
41267 AllowCrossLaneVar = AllowVariableCrossLaneMask;
41268 AllowPerLaneVar = AllowVariablePerLaneMask;
41269 }
41270 if (SDValue Res = combineX86ShufflesRecursively(
41271 Ops, i, Root, ResolvedMask, CombinedNodes, Depth + 1, MaxDepth,
41272 HasVariableMask, AllowCrossLaneVar, AllowPerLaneVar, DAG,
41273 Subtarget))
41274 return Res;
41275 }
41276 }
41277
41278 // Attempt to constant fold all of the constant source ops.
41279 if (SDValue Cst = combineX86ShufflesConstants(
41280 Ops, Mask, Root, HasVariableMask, DAG, Subtarget))
41281 return Cst;
41282
41283 // If constant fold failed and we only have constants - then we have
41284 // multiple uses by a single non-variable shuffle - just bail.
41285 if (Depth == 0 && llvm::all_of(Ops, [&](SDValue Op) {
41286 APInt UndefElts;
41287 SmallVector<APInt> RawBits;
41288 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
41289 return getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
41290 RawBits);
41291 })) {
41292 return SDValue();
41293 }
41294
41295 // Canonicalize the combined shuffle mask chain with horizontal ops.
41296 // NOTE: This will update the Ops and Mask.
41297 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
41298 Ops, Mask, RootSizeInBits, SDLoc(Root), DAG, Subtarget))
41299 return DAG.getBitcast(RootVT, HOp);
41300
41301 // Try to refine our inputs given our knowledge of target shuffle mask.
41302 for (auto I : enumerate(Ops)) {
41303 int OpIdx = I.index();
41304 SDValue &Op = I.value();
41305
41306 // What range of shuffle mask element values results in picking from Op?
41307 int Lo = OpIdx * Mask.size();
41308 int Hi = Lo + Mask.size();
41309
41310 // Which elements of Op do we demand, given the mask's granularity?
41311 APInt OpDemandedElts(Mask.size(), 0);
41312 for (int MaskElt : Mask) {
41313 if (isInRange(MaskElt, Lo, Hi)) { // Picks from Op?
41314 int OpEltIdx = MaskElt - Lo;
41315 OpDemandedElts.setBit(OpEltIdx);
41316 }
41317 }
41318
41319 // Is the shuffle result smaller than the root?
41320 if (Op.getValueSizeInBits() < RootSizeInBits) {
41321 // We padded the mask with undefs. But we now need to undo that.
41322 unsigned NumExpectedVectorElts = Mask.size();
41323 unsigned EltSizeInBits = RootSizeInBits / NumExpectedVectorElts;
41324 unsigned NumOpVectorElts = Op.getValueSizeInBits() / EltSizeInBits;
41325 assert(!OpDemandedElts.extractBits(
41326 NumExpectedVectorElts - NumOpVectorElts, NumOpVectorElts) &&
41327 "Demanding the virtual undef widening padding?");
41328 OpDemandedElts = OpDemandedElts.trunc(NumOpVectorElts); // NUW
41329 }
41330
41331 // The Op itself may be of different VT, so we need to scale the mask.
41332 unsigned NumOpElts = Op.getValueType().getVectorNumElements();
41333 APInt OpScaledDemandedElts = APIntOps::ScaleBitMask(OpDemandedElts, NumOpElts);
41334
41335 // Can this operand be simplified any further, given its demanded elements?
41336 if (SDValue NewOp =
41337 DAG.getTargetLoweringInfo().SimplifyMultipleUseDemandedVectorElts(
41338 Op, OpScaledDemandedElts, DAG))
41339 Op = NewOp;
41340 }
41341 // FIXME: should we rerun resolveTargetShuffleInputsAndMask() now?
41342
41343 // Widen any subvector shuffle inputs we've collected.
41344 // TODO: Remove this to avoid generating temporary nodes, we should only
41345 // widen once combineX86ShuffleChain has found a match.
41346 if (any_of(Ops, [RootSizeInBits](SDValue Op) {
41347 return Op.getValueSizeInBits() < RootSizeInBits;
41348 })) {
41349 for (SDValue &Op : Ops)
41350 if (Op.getValueSizeInBits() < RootSizeInBits)
41351 Op = widenSubVector(Op, false, Subtarget, DAG, SDLoc(Op),
41352 RootSizeInBits);
41353 // Reresolve - we might have repeated subvector sources.
41354 resolveTargetShuffleInputsAndMask(Ops, Mask);
41355 }
41356
41357 // We can only combine unary and binary shuffle mask cases.
41358 if (Ops.size() <= 2) {
41359 // Minor canonicalization of the accumulated shuffle mask to make it easier
41360 // to match below. All this does is detect masks with sequential pairs of
41361 // elements, and shrink them to the half-width mask. It does this in a loop
41362 // so it will reduce the size of the mask to the minimal width mask which
41363 // performs an equivalent shuffle.
41364 while (Mask.size() > 1) {
41365 SmallVector<int, 64> WidenedMask;
41366 if (!canWidenShuffleElements(Mask, WidenedMask))
41367 break;
41368 Mask = std::move(WidenedMask);
41369 }
41370
41371 // Canonicalization of binary shuffle masks to improve pattern matching by
41372 // commuting the inputs.
41373 if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
41374 ShuffleVectorSDNode::commuteMask(Mask);
41375 std::swap(Ops[0], Ops[1]);
41376 }
41377
41378 // Try to combine into a single shuffle instruction.
41379 if (SDValue Shuffle = combineX86ShuffleChain(
41380 Ops, Root, Mask, Depth, HasVariableMask, AllowVariableCrossLaneMask,
41381 AllowVariablePerLaneMask, DAG, Subtarget))
41382 return Shuffle;
41383
41384 // If all the operands come from the same larger vector, fallthrough and try
41385 // to use combineX86ShuffleChainWithExtract.
41386 SDValue LHS = peekThroughBitcasts(Ops.front());
41387 SDValue RHS = peekThroughBitcasts(Ops.back());
41388 if (Ops.size() != 2 || !Subtarget.hasAVX2() || RootSizeInBits != 128 ||
41389 (RootSizeInBits / Mask.size()) != 64 ||
41390 LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
41391 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
41392 LHS.getOperand(0) != RHS.getOperand(0))
41393 return SDValue();
41394 }
41395
41396 // If that failed and any input is extracted then try to combine as a
41397 // shuffle with the larger type.
41398 return combineX86ShuffleChainWithExtract(
41399 Ops, Root, Mask, Depth, HasVariableMask, AllowVariableCrossLaneMask,
41400 AllowVariablePerLaneMask, DAG, Subtarget);
41401}
41402
41403/// Helper entry wrapper to combineX86ShufflesRecursively.
41404static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG,
41405 const X86Subtarget &Subtarget) {
41406 return combineX86ShufflesRecursively(
41407 {Op}, 0, Op, {0}, {}, /*Depth*/ 0, X86::MaxShuffleCombineDepth,
41408 /*HasVarMask*/ false,
41409 /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, DAG,
41410 Subtarget);
41411}
41412
41413/// Get the PSHUF-style mask from PSHUF node.
41414///
41415 /// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
41416/// PSHUF-style masks that can be reused with such instructions.
41417static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
41418 MVT VT = N.getSimpleValueType();
41419 SmallVector<int, 4> Mask;
41420 SmallVector<SDValue, 2> Ops;
41421 bool HaveMask =
41422 getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask);
41423 (void)HaveMask;
41424 assert(HaveMask);
41425
41426 // If we have more than 128-bits, only the low 128-bits of shuffle mask
41427 // matter. Check that the upper masks are repeats and remove them.
41428 if (VT.getSizeInBits() > 128) {
41429 int LaneElts = 128 / VT.getScalarSizeInBits();
41430#ifndef NDEBUG
41431 for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
41432 for (int j = 0; j < LaneElts; ++j)
41433 assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
41434 "Mask doesn't repeat in high 128-bit lanes!");
41435#endif
41436 Mask.resize(LaneElts);
41437 }
41438
41439 switch (N.getOpcode()) {
41440 case X86ISD::PSHUFD:
41441 return Mask;
41442 case X86ISD::PSHUFLW:
41443 Mask.resize(4);
41444 return Mask;
41445 case X86ISD::PSHUFHW:
41446 Mask.erase(Mask.begin(), Mask.begin() + 4);
41447 for (int &M : Mask)
41448 M -= 4;
41449 return Mask;
41450 default:
41451 llvm_unreachable("No valid shuffle instruction found!")::llvm::llvm_unreachable_internal("No valid shuffle instruction found!"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 41451)
;
41452 }
41453}
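For reference, a PSHUFD/PSHUFLW/PSHUFHW immediate packs four 2-bit lane selectors. A minimal standalone sketch of decoding one such immediate (illustrative only; the function above instead goes through getTargetShuffleMask, and the helper name decodePSHUFImm is hypothetical):

#include <array>
#include <cassert>

// Decode an 8-bit PSHUF-style immediate into four 2-bit lane selectors.
static std::array<int, 4> decodePSHUFImm(unsigned Imm) {
  std::array<int, 4> Mask;
  for (int i = 0; i != 4; ++i)
    Mask[i] = (Imm >> (2 * i)) & 0x3;
  return Mask;
}

int main() {
  // 0x1B selects elements {3, 2, 1, 0}: a full reverse of the four lanes.
  assert((decodePSHUFImm(0x1B) == std::array<int, 4>{3, 2, 1, 0}));
}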
41454
41455/// Search for a combinable shuffle across a chain ending in pshufd.
41456///
41457/// We walk up the chain and look for a combinable shuffle, skipping over
41458/// shuffles that we could hoist this shuffle's transformation past without
41459/// altering anything.
41460static SDValue
41461combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
41462 SelectionDAG &DAG) {
41463 assert(N.getOpcode() == X86ISD::PSHUFD &&
41464 "Called with something other than an x86 128-bit half shuffle!");
41465 SDLoc DL(N);
41466
41467 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
41468 // of the shuffles in the chain so that we can form a fresh chain to replace
41469 // this one.
41470 SmallVector<SDValue, 8> Chain;
41471 SDValue V = N.getOperand(0);
41472 for (; V.hasOneUse(); V = V.getOperand(0)) {
41473 switch (V.getOpcode()) {
41474 default:
41475 return SDValue(); // Nothing combined!
41476
41477 case ISD::BITCAST:
41478 // Skip bitcasts as we always know the type for the target specific
41479 // instructions.
41480 continue;
41481
41482 case X86ISD::PSHUFD:
41483 // Found another dword shuffle.
41484 break;
41485
41486 case X86ISD::PSHUFLW:
41487 // Check that the low words (being shuffled) are the identity in the
41488 // dword shuffle, and the high words are self-contained.
41489 if (Mask[0] != 0 || Mask[1] != 1 ||
41490 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
41491 return SDValue();
41492
41493 Chain.push_back(V);
41494 continue;
41495
41496 case X86ISD::PSHUFHW:
41497 // Check that the high words (being shuffled) are the identity in the
41498 // dword shuffle, and the low words are self-contained.
41499 if (Mask[2] != 2 || Mask[3] != 3 ||
41500 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
41501 return SDValue();
41502
41503 Chain.push_back(V);
41504 continue;
41505
41506 case X86ISD::UNPCKL:
41507 case X86ISD::UNPCKH:
41508 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
41509 // shuffle into a preceding word shuffle.
41510 if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
41511 V.getSimpleValueType().getVectorElementType() != MVT::i16)
41512 return SDValue();
41513
41514 // Search for a half-shuffle which we can combine with.
41515 unsigned CombineOp =
41516 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
41517 if (V.getOperand(0) != V.getOperand(1) ||
41518 !V->isOnlyUserOf(V.getOperand(0).getNode()))
41519 return SDValue();
41520 Chain.push_back(V);
41521 V = V.getOperand(0);
41522 do {
41523 switch (V.getOpcode()) {
41524 default:
41525 return SDValue(); // Nothing to combine.
41526
41527 case X86ISD::PSHUFLW:
41528 case X86ISD::PSHUFHW:
41529 if (V.getOpcode() == CombineOp)
41530 break;
41531
41532 Chain.push_back(V);
41533
41534 [[fallthrough]];
41535 case ISD::BITCAST:
41536 V = V.getOperand(0);
41537 continue;
41538 }
41539 break;
41540 } while (V.hasOneUse());
41541 break;
41542 }
41543 // Break out of the loop if we break out of the switch.
41544 break;
41545 }
41546
41547 if (!V.hasOneUse())
41548 // We fell out of the loop without finding a viable combining instruction.
41549 return SDValue();
41550
41551 // Merge this node's mask and our incoming mask.
41552 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
41553 for (int &M : Mask)
41554 M = VMask[M];
41555 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
41556 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
41557
41558 // Rebuild the chain around this new shuffle.
41559 while (!Chain.empty()) {
41560 SDValue W = Chain.pop_back_val();
41561
41562 if (V.getValueType() != W.getOperand(0).getValueType())
41563 V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
41564
41565 switch (W.getOpcode()) {
41566 default:
41567 llvm_unreachable("Only PSHUF and UNPCK instructions get here!")::llvm::llvm_unreachable_internal("Only PSHUF and UNPCK instructions get here!"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 41567)
;
41568
41569 case X86ISD::UNPCKL:
41570 case X86ISD::UNPCKH:
41571 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
41572 break;
41573
41574 case X86ISD::PSHUFD:
41575 case X86ISD::PSHUFLW:
41576 case X86ISD::PSHUFHW:
41577 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
41578 break;
41579 }
41580 }
41581 if (V.getValueType() != N.getValueType())
41582 V = DAG.getBitcast(N.getValueType(), V);
41583
41584 // Return the new chain to replace N.
41585 return V;
41586}
41587
41588// Attempt to commute shufps LHS loads:
41589// permilps(shufps(load(),x)) --> permilps(shufps(x,load()))
41590static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
41591 SelectionDAG &DAG) {
41592 // TODO: Add vXf64 support.
41593 if (VT != MVT::v4f32 && VT != MVT::v8f32 && VT != MVT::v16f32)
41594 return SDValue();
41595
41596 // SHUFP(LHS, RHS) -> SHUFP(RHS, LHS) iff LHS is foldable + RHS is not.
41597 auto commuteSHUFP = [&VT, &DL, &DAG](SDValue Parent, SDValue V) {
41598 if (V.getOpcode() != X86ISD::SHUFP || !Parent->isOnlyUserOf(V.getNode()))
41599 return SDValue();
41600 SDValue N0 = V.getOperand(0);
41601 SDValue N1 = V.getOperand(1);
41602 unsigned Imm = V.getConstantOperandVal(2);
41603 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
41604 if (!X86::mayFoldLoad(peekThroughOneUseBitcasts(N0), Subtarget) ||
41605 X86::mayFoldLoad(peekThroughOneUseBitcasts(N1), Subtarget))
41606 return SDValue();
41607 Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
41608 return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
41609 DAG.getTargetConstant(Imm, DL, MVT::i8));
41610 };
41611
41612 switch (N.getOpcode()) {
41613 case X86ISD::VPERMILPI:
41614 if (SDValue NewSHUFP = commuteSHUFP(N, N.getOperand(0))) {
41615 unsigned Imm = N.getConstantOperandVal(1);
41616 return DAG.getNode(X86ISD::VPERMILPI, DL, VT, NewSHUFP,
41617 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
41618 }
41619 break;
41620 case X86ISD::SHUFP: {
41621 SDValue N0 = N.getOperand(0);
41622 SDValue N1 = N.getOperand(1);
41623 unsigned Imm = N.getConstantOperandVal(2);
41624 if (N0 == N1) {
41625 if (SDValue NewSHUFP = commuteSHUFP(N, N0))
41626 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, NewSHUFP,
41627 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
41628 } else if (SDValue NewSHUFP = commuteSHUFP(N, N0)) {
41629 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, N1,
41630 DAG.getTargetConstant(Imm ^ 0x0A, DL, MVT::i8));
41631 } else if (SDValue NewSHUFP = commuteSHUFP(N, N1)) {
41632 return DAG.getNode(X86ISD::SHUFP, DL, VT, N0, NewSHUFP,
41633 DAG.getTargetConstant(Imm ^ 0xA0, DL, MVT::i8));
41634 }
41635 break;
41636 }
41637 }
41638
41639 return SDValue();
41640}
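The commute at line 41607 swaps the two nibbles of the SHUFP immediate because the low nibble indexes the first source and the high nibble the second; the callers then adjust their own immediates (the ^0xAA / ^0x0A / ^0xA0 above) to account for the result halves changing places. A minimal standalone sketch of the nibble swap itself (the helper name is hypothetical):

#include <cassert>

// Swap the low and high nibbles of a SHUFP immediate, mirroring
// Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4) at line 41607.
static unsigned swapSHUFPImmNibbles(unsigned Imm) {
  return ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
}

int main() {
  // 0xB1 (low nibble 0x1 for src1, high nibble 0xB for src2) becomes 0x1B.
  assert(swapSHUFPImmNibbles(0xB1) == 0x1B);
  // Swapping twice restores the original immediate.
  assert(swapSHUFPImmNibbles(swapSHUFPImmNibbles(0x4E)) == 0x4E);
}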
41641
41642// Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
41643static SDValue canonicalizeShuffleWithBinOps(SDValue N, SelectionDAG &DAG,
41644 const SDLoc &DL) {
41645 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41646 EVT ShuffleVT = N.getValueType();
41647
41648 auto IsMergeableWithShuffle = [&DAG](SDValue Op, bool FoldLoad = false) {
41649 // AllZeros/AllOnes constants are freely shuffled and will peek through
41650 // bitcasts. Other constant build vectors do not peek through bitcasts. Only
41651 // merge with target shuffles if it has one use so shuffle combining is
41652 // likely to kick in. Shuffles of splats are expected to be removed.
41653 return ISD::isBuildVectorAllOnes(Op.getNode()) ||
41654 ISD::isBuildVectorAllZeros(Op.getNode()) ||
41655 ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) ||
41656 ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode()) ||
41657 (Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op->hasOneUse()) ||
41658 (isTargetShuffle(Op.getOpcode()) && Op->hasOneUse()) ||
41659 (FoldLoad && isShuffleFoldableLoad(Op)) ||
41660 DAG.isSplatValue(Op, /*AllowUndefs*/ false);
41661 };
41662 auto IsSafeToMoveShuffle = [ShuffleVT](SDValue Op, unsigned BinOp) {
41663 // Ensure we only shuffle whole vector src elements, unless it's a logical
41664 // binop where we can more aggressively move shuffles from dst to src.
41665 return BinOp == ISD::AND || BinOp == ISD::OR || BinOp == ISD::XOR ||
41666 BinOp == X86ISD::ANDNP ||
41667 (Op.getScalarValueSizeInBits() <= ShuffleVT.getScalarSizeInBits());
41668 };
41669
41670 unsigned Opc = N.getOpcode();
41671 switch (Opc) {
41672 // Unary and Unary+Permute Shuffles.
41673 case X86ISD::PSHUFB: {
41674 // Don't merge PSHUFB if it contains zero'd elements.
41675 SmallVector<int> Mask;
41676 SmallVector<SDValue> Ops;
41677 if (!getTargetShuffleMask(N.getNode(), ShuffleVT.getSimpleVT(), false, Ops,
41678 Mask))
41679 break;
41680 [[fallthrough]];
41681 }
41682 case X86ISD::VBROADCAST:
41683 case X86ISD::MOVDDUP:
41684 case X86ISD::PSHUFD:
41685 case X86ISD::PSHUFHW:
41686 case X86ISD::PSHUFLW:
41687 case X86ISD::VPERMI:
41688 case X86ISD::VPERMILPI: {
41689 if (N.getOperand(0).getValueType() == ShuffleVT &&
41690 N->isOnlyUserOf(N.getOperand(0).getNode())) {
41691 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
41692 unsigned SrcOpcode = N0.getOpcode();
41693 if (TLI.isBinOp(SrcOpcode) && IsSafeToMoveShuffle(N0, SrcOpcode)) {
41694 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
41695 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
41696 if (IsMergeableWithShuffle(Op00, Opc != X86ISD::PSHUFB) ||
41697 IsMergeableWithShuffle(Op01, Opc != X86ISD::PSHUFB)) {
41698 SDValue LHS, RHS;
41699 Op00 = DAG.getBitcast(ShuffleVT, Op00);
41700 Op01 = DAG.getBitcast(ShuffleVT, Op01);
41701 if (N.getNumOperands() == 2) {
41702 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, N.getOperand(1));
41703 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, N.getOperand(1));
41704 } else {
41705 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00);
41706 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01);
41707 }
41708 EVT OpVT = N0.getValueType();
41709 return DAG.getBitcast(ShuffleVT,
41710 DAG.getNode(SrcOpcode, DL, OpVT,
41711 DAG.getBitcast(OpVT, LHS),
41712 DAG.getBitcast(OpVT, RHS)));
41713 }
41714 }
41715 }
41716 break;
41717 }
41718 // Binary and Binary+Permute Shuffles.
41719 case X86ISD::INSERTPS: {
41720 // Don't merge INSERTPS if it contains zero'd elements.
41721 unsigned InsertPSMask = N.getConstantOperandVal(2);
41722 unsigned ZeroMask = InsertPSMask & 0xF;
41723 if (ZeroMask != 0)
41724 break;
41725 [[fallthrough]];
41726 }
41727 case X86ISD::MOVSD:
41728 case X86ISD::MOVSS:
41729 case X86ISD::BLENDI:
41730 case X86ISD::SHUFP:
41731 case X86ISD::UNPCKH:
41732 case X86ISD::UNPCKL: {
41733 if (N->isOnlyUserOf(N.getOperand(0).getNode()) &&
41734 N->isOnlyUserOf(N.getOperand(1).getNode())) {
41735 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
41736 SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1));
41737 unsigned SrcOpcode = N0.getOpcode();
41738 if (TLI.isBinOp(SrcOpcode) && N1.getOpcode() == SrcOpcode &&
41739 IsSafeToMoveShuffle(N0, SrcOpcode) &&
41740 IsSafeToMoveShuffle(N1, SrcOpcode)) {
41741 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
41742 SDValue Op10 = peekThroughOneUseBitcasts(N1.getOperand(0));
41743 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
41744 SDValue Op11 = peekThroughOneUseBitcasts(N1.getOperand(1));
41745 // Ensure the total number of shuffles doesn't increase by folding this
41746 // shuffle through to the source ops.
41747 if (((IsMergeableWithShuffle(Op00) && IsMergeableWithShuffle(Op10)) ||
41748 (IsMergeableWithShuffle(Op01) && IsMergeableWithShuffle(Op11))) ||
41749 ((IsMergeableWithShuffle(Op00) || IsMergeableWithShuffle(Op10)) &&
41750 (IsMergeableWithShuffle(Op01) || IsMergeableWithShuffle(Op11)))) {
41751 SDValue LHS, RHS;
41752 Op00 = DAG.getBitcast(ShuffleVT, Op00);
41753 Op10 = DAG.getBitcast(ShuffleVT, Op10);
41754 Op01 = DAG.getBitcast(ShuffleVT, Op01);
41755 Op11 = DAG.getBitcast(ShuffleVT, Op11);
41756 if (N.getNumOperands() == 3) {
41757 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2));
41758 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11, N.getOperand(2));
41759 } else {
41760 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10);
41761 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11);
41762 }
41763 EVT OpVT = N0.getValueType();
41764 return DAG.getBitcast(ShuffleVT,
41765 DAG.getNode(SrcOpcode, DL, OpVT,
41766 DAG.getBitcast(OpVT, LHS),
41767 DAG.getBitcast(OpVT, RHS)));
41768 }
41769 }
41770 }
41771 break;
41772 }
41773 }
41774 return SDValue();
41775}
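The SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)) canonicalization is legal because an elementwise binop commutes with a permutation applied uniformly to both operands; a minimal standalone sketch with plain arrays:

#include <array>
#include <cassert>

int main() {
  std::array<unsigned, 4> X = {1, 2, 3, 4}, Y = {10, 20, 30, 40};
  std::array<int, 4> Mask = {3, 1, 0, 2}; // same permutation on both sides

  // SHUFFLE(BINOP(X, Y)): add first, then permute the result.
  std::array<unsigned, 4> Z, ShuffledZ;
  for (int i = 0; i != 4; ++i)
    Z[i] = X[i] + Y[i];
  for (int i = 0; i != 4; ++i)
    ShuffledZ[i] = Z[Mask[i]];

  // BINOP(SHUFFLE(X), SHUFFLE(Y)): permute both sources, then add.
  std::array<unsigned, 4> Alt;
  for (int i = 0; i != 4; ++i)
    Alt[i] = X[Mask[i]] + Y[Mask[i]];

  assert(ShuffledZ == Alt);
}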
41776
41777/// Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
41778static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V,
41779 SelectionDAG &DAG,
41780 const SDLoc &DL) {
41781 assert(V.getOpcode() == X86ISD::VPERM2X128 && "Unknown lane shuffle");
41782
41783 MVT VT = V.getSimpleValueType();
41784 SDValue Src0 = peekThroughBitcasts(V.getOperand(0));
41785 SDValue Src1 = peekThroughBitcasts(V.getOperand(1));
41786 unsigned SrcOpc0 = Src0.getOpcode();
41787 unsigned SrcOpc1 = Src1.getOpcode();
41788 EVT SrcVT0 = Src0.getValueType();
41789 EVT SrcVT1 = Src1.getValueType();
41790
41791 if (!Src1.isUndef() && (SrcVT0 != SrcVT1 || SrcOpc0 != SrcOpc1))
41792 return SDValue();
41793
41794 switch (SrcOpc0) {
41795 case X86ISD::MOVDDUP: {
41796 SDValue LHS = Src0.getOperand(0);
41797 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
41798 SDValue Res =
41799 DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS, V.getOperand(2));
41800 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res);
41801 return DAG.getBitcast(VT, Res);
41802 }
41803 case X86ISD::VPERMILPI:
41804 // TODO: Handle v4f64 permutes with different low/high lane masks.
41805 if (SrcVT0 == MVT::v4f64) {
41806 uint64_t Mask = Src0.getConstantOperandVal(1);
41807 if ((Mask & 0x3) != ((Mask >> 2) & 0x3))
41808 break;
41809 }
41810 [[fallthrough]];
41811 case X86ISD::VSHLI:
41812 case X86ISD::VSRLI:
41813 case X86ISD::VSRAI:
41814 case X86ISD::PSHUFD:
41815 if (Src1.isUndef() || Src0.getOperand(1) == Src1.getOperand(1)) {
41816 SDValue LHS = Src0.getOperand(0);
41817 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
41818 SDValue Res = DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS,
41819 V.getOperand(2));
41820 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res, Src0.getOperand(1));
41821 return DAG.getBitcast(VT, Res);
41822 }
41823 break;
41824 }
41825
41826 return SDValue();
41827}
41828
41829/// Try to combine x86 target specific shuffles.
41830static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
41831 TargetLowering::DAGCombinerInfo &DCI,
41832 const X86Subtarget &Subtarget) {
41833 SDLoc DL(N);
41834 MVT VT = N.getSimpleValueType();
41835 SmallVector<int, 4> Mask;
41836 unsigned Opcode = N.getOpcode();
41837
41838 if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG))
41839 return R;
41840
41841 // Handle specific target shuffles.
41842 switch (Opcode) {
41843 case X86ISD::MOVDDUP: {
41844 SDValue Src = N.getOperand(0);
41845 // Turn a 128-bit MOVDDUP of a full vector load into movddup+vzload.
41846 if (VT == MVT::v2f64 && Src.hasOneUse() &&
41847 ISD::isNormalLoad(Src.getNode())) {
41848 LoadSDNode *LN = cast<LoadSDNode>(Src);
41849 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::f64, MVT::v2f64, DAG)) {
41850 SDValue Movddup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, VZLoad);
41851 DCI.CombineTo(N.getNode(), Movddup);
41852 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
41853 DCI.recursivelyDeleteUnusedNodes(LN);
41854 return N; // Return N so it doesn't get rechecked!
41855 }
41856 }
41857
41858 return SDValue();
41859 }
41860 case X86ISD::VBROADCAST: {
41861 SDValue Src = N.getOperand(0);
41862 SDValue BC = peekThroughBitcasts(Src);
41863 EVT SrcVT = Src.getValueType();
41864 EVT BCVT = BC.getValueType();
41865
41866 // If broadcasting from another shuffle, attempt to simplify it.
41867 // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
41868 if (isTargetShuffle(BC.getOpcode()) &&
41869 VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
41870 unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
41871 SmallVector<int, 16> DemandedMask(BCVT.getVectorNumElements(),
41872 SM_SentinelUndef);
41873 for (unsigned i = 0; i != Scale; ++i)
41874 DemandedMask[i] = i;
41875 if (SDValue Res = combineX86ShufflesRecursively(
41876 {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 0,
41877 X86::MaxShuffleCombineDepth,
41878 /*HasVarMask*/ false, /*AllowCrossLaneVarMask*/ true,
41879 /*AllowPerLaneVarMask*/ true, DAG, Subtarget))
41880 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
41881 DAG.getBitcast(SrcVT, Res));
41882 }
41883
41884 // broadcast(bitcast(src)) -> bitcast(broadcast(src))
41885 // 32-bit targets have to bitcast i64 to f64, so better to bitcast upward.
41886 if (Src.getOpcode() == ISD::BITCAST &&
41887 SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits() &&
41888 DAG.getTargetLoweringInfo().isTypeLegal(BCVT) &&
41889 FixedVectorType::isValidElementType(
41890 BCVT.getScalarType().getTypeForEVT(*DAG.getContext()))) {
41891 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(),
41892 VT.getVectorNumElements());
41893 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
41894 }
41895
41896 // vbroadcast(bitcast(vbroadcast(src))) -> bitcast(vbroadcast(src))
41897 // If we're re-broadcasting a smaller type then broadcast with that type and
41898 // bitcast.
41899 // TODO: Do this for any splat?
41900 if (Src.getOpcode() == ISD::BITCAST &&
41901 (BC.getOpcode() == X86ISD::VBROADCAST ||
41902 BC.getOpcode() == X86ISD::VBROADCAST_LOAD) &&
41903 (VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits()) == 0 &&
41904 (VT.getSizeInBits() % BCVT.getSizeInBits()) == 0) {
41905 MVT NewVT =
41906 MVT::getVectorVT(BCVT.getSimpleVT().getScalarType(),
41907 VT.getSizeInBits() / BCVT.getScalarSizeInBits());
41908 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
41909 }
41910
41911 // Reduce broadcast source vector to lowest 128-bits.
41912 if (SrcVT.getSizeInBits() > 128)
41913 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
41914 extract128BitVector(Src, 0, DAG, DL));
41915
41916 // broadcast(scalar_to_vector(x)) -> broadcast(x).
41917 if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR)
41918 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
41919
41920 // broadcast(extract_vector_elt(x, 0)) -> broadcast(x).
41921 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
41922 isNullConstant(Src.getOperand(1)) &&
41923 DAG.getTargetLoweringInfo().isTypeLegal(
41924 Src.getOperand(0).getValueType()))
41925 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
41926
41927 // Share broadcast with the longest vector and extract low subvector (free).
41928 // Ensure the same SDValue from the SDNode use is being used.
41929 for (SDNode *User : Src->uses())
41930 if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST &&
41931 Src == User->getOperand(0) &&
41932 User->getValueSizeInBits(0).getFixedValue() >
41933 VT.getFixedSizeInBits()) {
41934 return extractSubVector(SDValue(User, 0), 0, DAG, DL,
41935 VT.getSizeInBits());
41936 }
41937
41938 // vbroadcast(scalarload X) -> vbroadcast_load X
41939 // For float loads, extract other uses of the scalar from the broadcast.
41940 if (!SrcVT.isVector() && (Src.hasOneUse() || VT.isFloatingPoint()) &&
41941 ISD::isNormalLoad(Src.getNode())) {
41942 LoadSDNode *LN = cast<LoadSDNode>(Src);
41943 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
41944 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
41945 SDValue BcastLd =
41946 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
41947 LN->getMemoryVT(), LN->getMemOperand());
41948 // If the load value is used only by N, replace it via CombineTo N.
41949 bool NoReplaceExtract = Src.hasOneUse();
41950 DCI.CombineTo(N.getNode(), BcastLd);
41951 if (NoReplaceExtract) {
41952 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
41953 DCI.recursivelyDeleteUnusedNodes(LN);
41954 } else {
41955 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcVT, BcastLd,
41956 DAG.getIntPtrConstant(0, DL));
41957 DCI.CombineTo(LN, Scl, BcastLd.getValue(1));
41958 }
41959 return N; // Return N so it doesn't get rechecked!
41960 }
41961
41962 // Due to isTypeDesirableForOp, we won't always shrink a load truncated to
41963 // i16. So shrink it ourselves if we can make a broadcast_load.
41964 if (SrcVT == MVT::i16 && Src.getOpcode() == ISD::TRUNCATE &&
41965 Src.hasOneUse() && Src.getOperand(0).hasOneUse()) {
41966 assert(Subtarget.hasAVX2() && "Expected AVX2");
41967 SDValue TruncIn = Src.getOperand(0);
41968
41969 // If this is a truncate of a non-extending load, we can just narrow it to
41970 // use a broadcast_load.
41971 if (ISD::isNormalLoad(TruncIn.getNode())) {
41972 LoadSDNode *LN = cast<LoadSDNode>(TruncIn);
41974 // Unless it's volatile or atomic.
41974 if (LN->isSimple()) {
41975 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
41976 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
41977 SDValue BcastLd = DAG.getMemIntrinsicNode(
41978 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
41979 LN->getPointerInfo(), LN->getOriginalAlign(),
41980 LN->getMemOperand()->getFlags());
41981 DCI.CombineTo(N.getNode(), BcastLd);
41982 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
41983 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
41984 return N; // Return N so it doesn't get rechecked!
41985 }
41986 }
41987
41988 // If this is a truncate of an i16 extload, we can directly replace it.
41989 if (ISD::isUNINDEXEDLoad(Src.getOperand(0).getNode()) &&
41990 ISD::isEXTLoad(Src.getOperand(0).getNode())) {
41991 LoadSDNode *LN = cast<LoadSDNode>(Src.getOperand(0));
41992 if (LN->getMemoryVT().getSizeInBits() == 16) {
41993 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
41994 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
41995 SDValue BcastLd =
41996 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
41997 LN->getMemoryVT(), LN->getMemOperand());
41998 DCI.CombineTo(N.getNode(), BcastLd);
41999 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42000 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
42001 return N; // Return N so it doesn't get rechecked!
42002 }
42003 }
42004
42005 // If this is a truncate of a load that has been shifted right, we can
42006 // offset the pointer and use a narrower load.
42007 if (TruncIn.getOpcode() == ISD::SRL &&
42008 TruncIn.getOperand(0).hasOneUse() &&
42009 isa<ConstantSDNode>(TruncIn.getOperand(1)) &&
42010 ISD::isNormalLoad(TruncIn.getOperand(0).getNode())) {
42011 LoadSDNode *LN = cast<LoadSDNode>(TruncIn.getOperand(0));
42012 unsigned ShiftAmt = TruncIn.getConstantOperandVal(1);
42013 // Make sure the shift amount and the load size are divisible by 16.
42014 // Don't do this if the load is volatile or atomic.
42015 if (ShiftAmt % 16 == 0 && TruncIn.getValueSizeInBits() % 16 == 0 &&
42016 LN->isSimple()) {
42017 unsigned Offset = ShiftAmt / 8;
42018 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42019 SDValue Ptr = DAG.getMemBasePlusOffset(LN->getBasePtr(),
42020 TypeSize::Fixed(Offset), DL);
42021 SDValue Ops[] = { LN->getChain(), Ptr };
42022 SDValue BcastLd = DAG.getMemIntrinsicNode(
42023 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
42024 LN->getPointerInfo().getWithOffset(Offset),
42025 LN->getOriginalAlign(),
42026 LN->getMemOperand()->getFlags());
42027 DCI.CombineTo(N.getNode(), BcastLd);
42028 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42029 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
42030 return N; // Return N so it doesn't get rechecked!
42031 }
42032 }
42033 }
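The Offset = ShiftAmt / 8 computation above relies on the little-endian equivalence between a right shift of a wide load and a narrower load at a byte offset; a minimal standalone sketch (assumes a little-endian host, as on x86):

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  // On a little-endian target, (uint16_t)(Load32(p) >> 16) equals
  // Load16(p + 16/8), which is why the base pointer can simply be offset.
  uint32_t Wide = 0xAABBCCDD;
  unsigned char Bytes[4];
  std::memcpy(Bytes, &Wide, 4);

  uint16_t FromShift = static_cast<uint16_t>(Wide >> 16);
  uint16_t FromOffsetLoad;
  std::memcpy(&FromOffsetLoad, Bytes + 16 / 8, 2);
  assert(FromShift == FromOffsetLoad);
}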
42034
42035 // vbroadcast(vzload X) -> vbroadcast_load X
42036 if (Src.getOpcode() == X86ISD::VZEXT_LOAD && Src.hasOneUse()) {
42037 MemSDNode *LN = cast<MemIntrinsicSDNode>(Src);
42038 if (LN->getMemoryVT().getSizeInBits() == VT.getScalarSizeInBits()) {
42039 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42040 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
42041 SDValue BcastLd =
42042 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
42043 LN->getMemoryVT(), LN->getMemOperand());
42044 DCI.CombineTo(N.getNode(), BcastLd);
42045 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42046 DCI.recursivelyDeleteUnusedNodes(LN);
42047 return N; // Return N so it doesn't get rechecked!
42048 }
42049 }
42050
42051 // vbroadcast(vector load X) -> vbroadcast_load
42052 if ((SrcVT == MVT::v2f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v2i64 ||
42053 SrcVT == MVT::v4i32) &&
42054 Src.hasOneUse() && ISD::isNormalLoad(Src.getNode())) {
42055 LoadSDNode *LN = cast<LoadSDNode>(Src);
42056 // Unless the load is volatile or atomic.
42057 if (LN->isSimple()) {
42058 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42059 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
42060 SDValue BcastLd = DAG.getMemIntrinsicNode(
42061 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SrcVT.getScalarType(),
42062 LN->getPointerInfo(), LN->getOriginalAlign(),
42063 LN->getMemOperand()->getFlags());
42064 DCI.CombineTo(N.getNode(), BcastLd);
42065 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42066 DCI.recursivelyDeleteUnusedNodes(LN);
42067 return N; // Return N so it doesn't get rechecked!
42068 }
42069 }
42070
42071 return SDValue();
42072 }
42073 case X86ISD::VZEXT_MOVL: {
42074 SDValue N0 = N.getOperand(0);
42075
42076 // If this is a vzmovl of a full vector load, replace it with a vzload, unless
42077 // the load is volatile.
42078 if (N0.hasOneUse() && ISD::isNormalLoad(N0.getNode())) {
42079 auto *LN = cast<LoadSDNode>(N0);
42080 if (SDValue VZLoad =
42081 narrowLoadToVZLoad(LN, VT.getVectorElementType(), VT, DAG)) {
42082 DCI.CombineTo(N.getNode(), VZLoad);
42083 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
42084 DCI.recursivelyDeleteUnusedNodes(LN);
42085 return N;
42086 }
42087 }
42088
42089 // If this is a VZEXT_MOVL of a VBROADCAST_LOAD, we don't need the broadcast
42090 // and can just use a VZEXT_LOAD.
42091 // FIXME: Is there some way to do this with SimplifyDemandedVectorElts?
42092 if (N0.hasOneUse() && N0.getOpcode() == X86ISD::VBROADCAST_LOAD) {
42093 auto *LN = cast<MemSDNode>(N0);
42094 if (VT.getScalarSizeInBits() == LN->getMemoryVT().getSizeInBits()) {
42095 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42096 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
42097 SDValue VZLoad =
42098 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops,
42099 LN->getMemoryVT(), LN->getMemOperand());
42100 DCI.CombineTo(N.getNode(), VZLoad);
42101 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
42102 DCI.recursivelyDeleteUnusedNodes(LN);
42103 return N;
42104 }
42105 }
42106
42107 // Turn (v2i64 (vzext_movl (scalar_to_vector (i64 X)))) into
42108 // (v2i64 (bitcast (v4i32 (vzext_movl (scalar_to_vector (i32 (trunc X)))))))
42109 // if the upper bits of the i64 are zero.
42110 if (N0.hasOneUse() && N0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
42111 N0.getOperand(0).hasOneUse() &&
42112 N0.getOperand(0).getValueType() == MVT::i64) {
42113 SDValue In = N0.getOperand(0);
42114 APInt Mask = APInt::getHighBitsSet(64, 32);
42115 if (DAG.MaskedValueIsZero(In, Mask)) {
42116 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, In);
42117 MVT VecVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
42118 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Trunc);
42119 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, SclVec);
42120 return DAG.getBitcast(VT, Movl);
42121 }
42122 }
42123
42124 // Load a scalar integer constant directly to XMM instead of transferring an
42125 // immediate value from GPR.
42126 // vzext_movl (scalar_to_vector C) --> load [C,0...]
42127 if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR) {
42128 if (auto *C = dyn_cast<ConstantSDNode>(N0.getOperand(0))) {
42129 // Create a vector constant - scalar constant followed by zeros.
42130 EVT ScalarVT = N0.getOperand(0).getValueType();
42131 Type *ScalarTy = ScalarVT.getTypeForEVT(*DAG.getContext());
42132 unsigned NumElts = VT.getVectorNumElements();
42133 Constant *Zero = ConstantInt::getNullValue(ScalarTy);
42134 SmallVector<Constant *, 32> ConstantVec(NumElts, Zero);
42135 ConstantVec[0] = const_cast<ConstantInt *>(C->getConstantIntValue());
42136
42137 // Load the vector constant from constant pool.
42138 MVT PVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
42139 SDValue CP = DAG.getConstantPool(ConstantVector::get(ConstantVec), PVT);
42140 MachinePointerInfo MPI =
42141 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
42142 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
42143 return DAG.getLoad(VT, DL, DAG.getEntryNode(), CP, MPI, Alignment,
42144 MachineMemOperand::MOLoad);
42145 }
42146 }
42147
42148 // Pull subvector inserts into undef through VZEXT_MOVL by making it an
42149 // insert into a zero vector. This helps get VZEXT_MOVL closer to
42150 // scalar_to_vectors where 256/512 are canonicalized to an insert and a
42151 // 128-bit scalar_to_vector. This reduces the number of isel patterns.
42152 if (!DCI.isBeforeLegalizeOps() && N0.hasOneUse()) {
42153 SDValue V = peekThroughOneUseBitcasts(N0);
42154
42155 if (V.getOpcode() == ISD::INSERT_SUBVECTOR && V.getOperand(0).isUndef() &&
42156 isNullConstant(V.getOperand(2))) {
42157 SDValue In = V.getOperand(1);
42158 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
42159 In.getValueSizeInBits() /
42160 VT.getScalarSizeInBits());
42161 In = DAG.getBitcast(SubVT, In);
42162 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, SubVT, In);
42163 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
42164 getZeroVector(VT, Subtarget, DAG, DL), Movl,
42165 V.getOperand(2));
42166 }
42167 }
42168
42169 return SDValue();
42170 }
42171 case X86ISD::BLENDI: {
42172 SDValue N0 = N.getOperand(0);
42173 SDValue N1 = N.getOperand(1);
42174
42175 // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
42176 // TODO: Handle MVT::v16i16 repeated blend mask.
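// E.g. (illustrative) a v4f64 blend with mask 0b0101 over bitcasts of v8f32
// sources becomes a v8f32 blend with the mask scaled by 2 to 0b00110011
// (0x33), followed by a bitcast back to v4f64.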
42177 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
42178 N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
42179 MVT SrcVT = N0.getOperand(0).getSimpleValueType();
42180 if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0 &&
42181 SrcVT.getScalarSizeInBits() >= 32) {
42182 unsigned BlendMask = N.getConstantOperandVal(2);
42183 unsigned Size = VT.getVectorNumElements();
42184 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
42185 BlendMask = scaleVectorShuffleBlendMask(BlendMask, Size, Scale);
42186 return DAG.getBitcast(
42187 VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
42188 N1.getOperand(0),
42189 DAG.getTargetConstant(BlendMask, DL, MVT::i8)));
42190 }
42191 }
42192 return SDValue();
42193 }
42194 case X86ISD::SHUFP: {
42195 // Fold shufps(shuffle(x),shuffle(y)) -> shufps(x,y).
42196 // This is a more relaxed shuffle combiner that can ignore oneuse limits.
42197 // TODO: Support types other than v4f32.
42198 if (VT == MVT::v4f32) {
42199 bool Updated = false;
42200 SmallVector<int> Mask;
42201 SmallVector<SDValue> Ops;
42202 if (getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask) &&
42203 Ops.size() == 2) {
42204 for (int i = 0; i != 2; ++i) {
42205 SmallVector<SDValue> SubOps;
42206 SmallVector<int> SubMask, SubScaledMask;
42207 SDValue Sub = peekThroughBitcasts(Ops[i]);
42208 // TODO: Scaling might be easier if we specify the demanded elts.
42209 if (getTargetShuffleInputs(Sub, SubOps, SubMask, DAG, 0, false) &&
42210 scaleShuffleElements(SubMask, 4, SubScaledMask) &&
42211 SubOps.size() == 1 && isUndefOrInRange(SubScaledMask, 0, 4)) {
42212 int Ofs = i * 2;
42213 Mask[Ofs + 0] = SubScaledMask[Mask[Ofs + 0] % 4] + (i * 4);
42214 Mask[Ofs + 1] = SubScaledMask[Mask[Ofs + 1] % 4] + (i * 4);
42215 Ops[i] = DAG.getBitcast(VT, SubOps[0]);
42216 Updated = true;
42217 }
42218 }
42219 }
42220 if (Updated) {
42221 for (int &M : Mask)
42222 M %= 4;
42223 Ops.push_back(getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
42224 return DAG.getNode(X86ISD::SHUFP, DL, VT, Ops);
42225 }
42226 }
42227 return SDValue();
42228 }
42229 case X86ISD::VPERMI: {
42230 // vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
42231 // TODO: Remove when we have preferred domains in combineX86ShuffleChain.
42232 SDValue N0 = N.getOperand(0);
42233 SDValue N1 = N.getOperand(1);
42234 unsigned EltSizeInBits = VT.getScalarSizeInBits();
42235 if (N0.getOpcode() == ISD::BITCAST &&
42236 N0.getOperand(0).getScalarValueSizeInBits() == EltSizeInBits) {
42237 SDValue Src = N0.getOperand(0);
42238 EVT SrcVT = Src.getValueType();
42239 SDValue Res = DAG.getNode(X86ISD::VPERMI, DL, SrcVT, Src, N1);
42240 return DAG.getBitcast(VT, Res);
42241 }
42242 return SDValue();
42243 }
42244 case X86ISD::VPERM2X128: {
42245 // Fold vperm2x128(bitcast(x),bitcast(y),c) -> bitcast(vperm2x128(x,y,c)).
42246 SDValue LHS = N->getOperand(0);
42247 SDValue RHS = N->getOperand(1);
42248 if (LHS.getOpcode() == ISD::BITCAST &&
42249 (RHS.getOpcode() == ISD::BITCAST || RHS.isUndef())) {
42250 EVT SrcVT = LHS.getOperand(0).getValueType();
42251 if (RHS.isUndef() || SrcVT == RHS.getOperand(0).getValueType()) {
42252 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT,
42253 DAG.getBitcast(SrcVT, LHS),
42254 DAG.getBitcast(SrcVT, RHS),
42255 N->getOperand(2)));
42256 }
42257 }
42258
42259 // Fold vperm2x128(op(),op()) -> op(vperm2x128(),vperm2x128()).
42260 if (SDValue Res = canonicalizeLaneShuffleWithRepeatedOps(N, DAG, DL))
42261 return Res;
42262
42263 // Fold vperm2x128 subvector shuffle with an inner concat pattern.
42264 // vperm2x128(concat(X,Y),concat(Z,W)) --> concat X,Y etc.
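// E.g. (illustrative) with Imm == 0x31 the low half selects lane 1 of the
// first concat (Y) and the high half selects lane 1 of the second concat (W),
// giving concat(Y, W) with no cross-lane shuffle needed.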
42265 auto FindSubVector128 = [&](unsigned Idx) {
42266 if (Idx > 3)
42267 return SDValue();
42268 SDValue Src = peekThroughBitcasts(N.getOperand(Idx < 2 ? 0 : 1));
42269 SmallVector<SDValue> SubOps;
42270 if (collectConcatOps(Src.getNode(), SubOps, DAG) && SubOps.size() == 2)
42271 return SubOps[Idx & 1];
42272 unsigned NumElts = Src.getValueType().getVectorNumElements();
42273 if ((Idx & 1) == 1 && Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
42274 Src.getOperand(1).getValueSizeInBits() == 128 &&
42275 Src.getConstantOperandAPInt(2) == (NumElts / 2)) {
42276 return Src.getOperand(1);
42277 }
42278 return SDValue();
42279 };
42280 unsigned Imm = N.getConstantOperandVal(2);
42281 if (SDValue SubLo = FindSubVector128(Imm & 0x0F)) {
42282 if (SDValue SubHi = FindSubVector128((Imm & 0xF0) >> 4)) {
42283 MVT SubVT = VT.getHalfNumVectorElementsVT();
42284 SubLo = DAG.getBitcast(SubVT, SubLo);
42285 SubHi = DAG.getBitcast(SubVT, SubHi);
42286 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, SubLo, SubHi);
42287 }
42288 }
42289 return SDValue();
42290 }
42291 case X86ISD::PSHUFD:
42292 case X86ISD::PSHUFLW:
42293 case X86ISD::PSHUFHW: {
42294 SDValue N0 = N.getOperand(0);
42295 SDValue N1 = N.getOperand(1);
42296 if (N0->hasOneUse()) {
42297 SDValue V = peekThroughOneUseBitcasts(N0);
42298 switch (V.getOpcode()) {
42299 case X86ISD::VSHL:
42300 case X86ISD::VSRL:
42301 case X86ISD::VSRA:
42302 case X86ISD::VSHLI:
42303 case X86ISD::VSRLI:
42304 case X86ISD::VSRAI:
42305 case X86ISD::VROTLI:
42306 case X86ISD::VROTRI: {
42307 MVT InnerVT = V.getSimpleValueType();
42308 if (InnerVT.getScalarSizeInBits() <= VT.getScalarSizeInBits()) {
42309 SDValue Res = DAG.getNode(Opcode, DL, VT,
42310 DAG.getBitcast(VT, V.getOperand(0)), N1);
42311 Res = DAG.getBitcast(InnerVT, Res);
42312 Res = DAG.getNode(V.getOpcode(), DL, InnerVT, Res, V.getOperand(1));
42313 return DAG.getBitcast(VT, Res);
42314 }
42315 break;
42316 }
42317 }
42318 }
42319
42320 Mask = getPSHUFShuffleMask(N);
42321 assert(Mask.size() == 4);
42322 break;
42323 }
42324 case X86ISD::MOVSD:
42325 case X86ISD::MOVSH:
42326 case X86ISD::MOVSS: {
42327 SDValue N0 = N.getOperand(0);
42328 SDValue N1 = N.getOperand(1);
42329
42330 // Canonicalize scalar FPOps:
42331 // MOVS*(N0, OP(N0, N1)) --> MOVS*(N0, SCALAR_TO_VECTOR(OP(N0[0], N1[0])))
42332 // If commutable, allow OP(N1[0], N0[0]).
42333 unsigned Opcode1 = N1.getOpcode();
42334 if (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL || Opcode1 == ISD::FSUB ||
42335 Opcode1 == ISD::FDIV) {
42336 SDValue N10 = N1.getOperand(0);
42337 SDValue N11 = N1.getOperand(1);
42338 if (N10 == N0 ||
42339 (N11 == N0 && (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL))) {
42340 if (N10 != N0)
42341 std::swap(N10, N11);
42342 MVT SVT = VT.getVectorElementType();
42343 SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
42344 N10 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N10, ZeroIdx);
42345 N11 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N11, ZeroIdx);
42346 SDValue Scl = DAG.getNode(Opcode1, DL, SVT, N10, N11);
42347 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
42348 return DAG.getNode(Opcode, DL, VT, N0, SclVec);
42349 }
42350 }
42351
42352 return SDValue();
42353 }
42354 case X86ISD::INSERTPS: {
42355 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
42356 SDValue Op0 = N.getOperand(0);
42357 SDValue Op1 = N.getOperand(1);
42358 unsigned InsertPSMask = N.getConstantOperandVal(2);
42359 unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
42360 unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
42361 unsigned ZeroMask = InsertPSMask & 0xF;
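// E.g. (illustrative) InsertPSMask == 0x9C decodes to SrcIdx = 2, DstIdx = 1
// and ZeroMask = 0b1100: take element 2 of Op1, insert it into lane 1 of Op0,
// and zero lanes 2 and 3 of the result.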
42362
42363 // If we zero out all elements from Op0 then we don't need to reference it.
42364 if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
42365 return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
42366 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42367
42368 // If we zero out the element from Op1 then we don't need to reference it.
42369 if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
42370 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
42371 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42372
42373 // Attempt to merge insertps Op1 with an inner target shuffle node.
42374 SmallVector<int, 8> TargetMask1;
42375 SmallVector<SDValue, 2> Ops1;
42376 APInt KnownUndef1, KnownZero1;
42377 if (getTargetShuffleAndZeroables(Op1, TargetMask1, Ops1, KnownUndef1,
42378 KnownZero1)) {
42379 if (KnownUndef1[SrcIdx] || KnownZero1[SrcIdx]) {
42380 // Zero/UNDEF insertion - zero out element and remove dependency.
42381 InsertPSMask |= (1u << DstIdx);
42382 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
42383 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42384 }
42385 // Update insertps mask srcidx and reference the source input directly.
42386 int M = TargetMask1[SrcIdx];
42387 assert(0 <= M && M < 8 && "Shuffle index out of range");
42388 InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
42389 Op1 = Ops1[M < 4 ? 0 : 1];
42390 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
42391 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42392 }
42393
42394 // Attempt to merge insertps Op0 with an inner target shuffle node.
42395 SmallVector<int, 8> TargetMask0;
42396 SmallVector<SDValue, 2> Ops0;
42397 APInt KnownUndef0, KnownZero0;
42398 if (getTargetShuffleAndZeroables(Op0, TargetMask0, Ops0, KnownUndef0,
42399 KnownZero0)) {
42400 bool Updated = false;
42401 bool UseInput00 = false;
42402 bool UseInput01 = false;
42403 for (int i = 0; i != 4; ++i) {
42404 if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
42405 // No change if element is already zero or the inserted element.
42406 continue;
42407 }
42408
42409 if (KnownUndef0[i] || KnownZero0[i]) {
42410 // If the target mask is undef/zero then we must zero the element.
42411 InsertPSMask |= (1u << i);
42412 Updated = true;
42413 continue;
42414 }
42415
42416 // The input vector element must be inline.
42417 int M = TargetMask0[i];
42418 if (M != i && M != (i + 4))
42419 return SDValue();
42420
42421 // Determine which inputs of the target shuffle we're using.
42422 UseInput00 |= (0 <= M && M < 4);
42423 UseInput01 |= (4 <= M);
42424 }
42425
42426 // If we're not using both inputs of the target shuffle then use the
42427 // referenced input directly.
42428 if (UseInput00 && !UseInput01) {
42429 Updated = true;
42430 Op0 = Ops0[0];
42431 } else if (!UseInput00 && UseInput01) {
42432 Updated = true;
42433 Op0 = Ops0[1];
42434 }
42435
42436 if (Updated)
42437 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
42438 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42439 }
42440
42441 // If we're inserting an element from a vbroadcast load, fold the
42442 // load into the X86insertps instruction. We need to convert the scalar
42443 // load to a vector and clear the source lane of the INSERTPS control.
42444 if (Op1.getOpcode() == X86ISD::VBROADCAST_LOAD && Op1.hasOneUse()) {
42445 auto *MemIntr = cast<MemIntrinsicSDNode>(Op1);
42446 if (MemIntr->getMemoryVT().getScalarSizeInBits() == 32) {
42447 SDValue Load = DAG.getLoad(MVT::f32, DL, MemIntr->getChain(),
42448 MemIntr->getBasePtr(),
42449 MemIntr->getMemOperand());
42450 SDValue Insert = DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0,
42451 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT,
42452 Load),
42453 DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8));
42454 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
42455 return Insert;
42456 }
42457 }
42458
42459 return SDValue();
42460 }
42461 default:
42462 return SDValue();
42463 }
42464
42465 // Nuke no-op shuffles that show up after combining.
42466 if (isNoopShuffleMask(Mask))
42467 return N.getOperand(0);
42468
42469 // Look for simplifications involving one or two shuffle instructions.
42470 SDValue V = N.getOperand(0);
42471 switch (N.getOpcode()) {
42472 default:
42473 break;
42474 case X86ISD::PSHUFLW:
42475 case X86ISD::PSHUFHW:
42476 assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
42477
42478 // See if this reduces to a PSHUFD which is no more expensive and can
42479 // combine with more operations. Note that it has to at least flip the
42480 // dwords as otherwise it would have been removed as a no-op.
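// E.g. (illustrative) a PSHUFLW with mask <2,3,0,1> swaps the two low dwords,
// which is the same as a PSHUFD with dword mask <1,0,2,3>.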
42481 if (ArrayRef(Mask).equals({2, 3, 0, 1})) {
42482 int DMask[] = {0, 1, 2, 3};
42483 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
42484 DMask[DOffset + 0] = DOffset + 1;
42485 DMask[DOffset + 1] = DOffset + 0;
42486 MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
42487 V = DAG.getBitcast(DVT, V);
42488 V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
42489 getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
42490 return DAG.getBitcast(VT, V);
42491 }
42492
42493 // Look for shuffle patterns which can be implemented as a single unpack.
42494 // FIXME: This doesn't handle the location of the PSHUFD generically, and
42495 // only works when we have a PSHUFD followed by two half-shuffles.
42496 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
42497 (V.getOpcode() == X86ISD::PSHUFLW ||
42498 V.getOpcode() == X86ISD::PSHUFHW) &&
42499 V.getOpcode() != N.getOpcode() &&
42500 V.hasOneUse() && V.getOperand(0).hasOneUse()) {
42501 SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
42502 if (D.getOpcode() == X86ISD::PSHUFD) {
42503 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
42504 SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
42505 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
42506 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
42507 int WordMask[8];
42508 for (int i = 0; i < 4; ++i) {
42509 WordMask[i + NOffset] = Mask[i] + NOffset;
42510 WordMask[i + VOffset] = VMask[i] + VOffset;
42511 }
42512 // Map the word mask through the DWord mask.
42513 int MappedMask[8];
42514 for (int i = 0; i < 8; ++i)
42515 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
42516 if (ArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
42517 ArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
42518 // We can replace all three shuffles with an unpack.
42519 V = DAG.getBitcast(VT, D.getOperand(0));
42520 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
42521 : X86ISD::UNPCKH,
42522 DL, VT, V, V);
42523 }
42524 }
42525 }
42526
42527 break;
42528
42529 case X86ISD::PSHUFD:
42530 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))
42531 return NewN;
42532
42533 break;
42534 }
42535
42536 return SDValue();
42537}
42538
42539/// Checks if the shuffle mask takes subsequent elements
42540/// alternately from two vectors.
42541/// For example <0, 5, 2, 7> or <8, 1, 10, 3, 12, 5, 14, 7> are both correct.
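/// In <0, 5, 2, 7> the even elements come from the first source and the odd
/// elements from the second, so \p Op0Even is set; in
/// <8, 1, 10, 3, 12, 5, 14, 7> the roles are swapped and \p Op0Even is clear.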
42542static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) {
42543
42544 int ParitySrc[2] = {-1, -1};
42545 unsigned Size = Mask.size();
42546 for (unsigned i = 0; i != Size; ++i) {
42547 int M = Mask[i];
42548 if (M < 0)
42549 continue;
42550
42551 // Make sure we are using the matching element from the input.
42552 if ((M % Size) != i)
42553 return false;
42554
42555 // Make sure we use the same input for all elements of the same parity.
42556 int Src = M / Size;
42557 if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
42558 return false;
42559 ParitySrc[i % 2] = Src;
42560 }
42561
42562 // Make sure each input is used.
42563 if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
42564 return false;
42565
42566 Op0Even = ParitySrc[0] == 0;
42567 return true;
42568}
42569
42570 /// Returns true iff the shuffle node \p N can be replaced with an
42571 /// ADDSUB(SUBADD) operation. If true is returned then the operands of the
42572 /// ADDSUB(SUBADD) operation are written to the parameters \p Opnd0 and \p Opnd1.
42573 ///
42574 /// We combine shuffles to ADDSUB(SUBADD) directly on the abstract vector
42575 /// shuffle nodes so they are easier to match generically. We also insert dummy
42576 /// vector shuffle nodes for the operands which explicitly discard the lanes
42577 /// that are unused by this operation, so that the rest of the combiner can see
42578 /// that they are unused.
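/// For example (illustrative, v4f32): shuffle(fsub(A,B), fadd(A,B), <0,5,2,7>)
/// takes the even lanes from the FSUB and the odd lanes from the FADD, which
/// matches ADDSUB(A,B) (subtract in the even lanes, add in the odd lanes).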
42579static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
42580 SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1,
42581 bool &IsSubAdd) {
42582
42583 EVT VT = N->getValueType(0);
42584 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42585 if (!Subtarget.hasSSE3() || !TLI.isTypeLegal(VT) ||
42586 !VT.getSimpleVT().isFloatingPoint())
42587 return false;
42588
42589 // We only handle target-independent shuffles.
42590 // FIXME: It would be easy and harmless to use the target shuffle mask
42591 // extraction tool to support more.
42592 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
42593 return false;
42594
42595 SDValue V1 = N->getOperand(0);
42596 SDValue V2 = N->getOperand(1);
42597
42598 // Make sure we have an FADD and an FSUB.
42599 if ((V1.getOpcode() != ISD::FADD && V1.getOpcode() != ISD::FSUB) ||
42600 (V2.getOpcode() != ISD::FADD && V2.getOpcode() != ISD::FSUB) ||
42601 V1.getOpcode() == V2.getOpcode())
42602 return false;
42603
42604 // If there are other uses of these operations we can't fold them.
42605 if (!V1->hasOneUse() || !V2->hasOneUse())
42606 return false;
42607
42608 // Ensure that both operations have the same operands. Note that we can
42609 // commute the FADD operands.
42610 SDValue LHS, RHS;
42611 if (V1.getOpcode() == ISD::FSUB) {
42612 LHS = V1->getOperand(0); RHS = V1->getOperand(1);
42613 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
42614 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
42615 return false;
42616 } else {
42617 assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode");
42618 LHS = V2->getOperand(0); RHS = V2->getOperand(1);
42619 if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) &&
42620 (V1->getOperand(0) != RHS || V1->getOperand(1) != LHS))
42621 return false;
42622 }
42623
42624 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
42625 bool Op0Even;
42626 if (!isAddSubOrSubAddMask(Mask, Op0Even))
42627 return false;
42628
42629 // It's a subadd if the vector in the even parity is an FADD.
42630 IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD
42631 : V2->getOpcode() == ISD::FADD;
42632
42633 Opnd0 = LHS;
42634 Opnd1 = RHS;
42635 return true;
42636}
42637
42638/// Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
42639static SDValue combineShuffleToFMAddSub(SDNode *N,
42640 const X86Subtarget &Subtarget,
42641 SelectionDAG &DAG) {
42642 // We only handle target-independent shuffles.
42643 // FIXME: It would be easy and harmless to use the target shuffle mask
42644 // extraction tool to support more.
42645 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
42646 return SDValue();
42647
42648 MVT VT = N->getSimpleValueType(0);
42649 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42650 if (!Subtarget.hasAnyFMA() || !TLI.isTypeLegal(VT))
42651 return SDValue();
42652
42653 // We're trying to match (shuffle fma(a, b, c), X86Fmsub(a, b, c)).
42654 SDValue Op0 = N->getOperand(0);
42655 SDValue Op1 = N->getOperand(1);
42656 SDValue FMAdd = Op0, FMSub = Op1;
42657 if (FMSub.getOpcode() != X86ISD::FMSUB)
42658 std::swap(FMAdd, FMSub);
42659
42660 if (FMAdd.getOpcode() != ISD::FMA || FMSub.getOpcode() != X86ISD::FMSUB ||
42661 FMAdd.getOperand(0) != FMSub.getOperand(0) || !FMAdd.hasOneUse() ||
42662 FMAdd.getOperand(1) != FMSub.getOperand(1) || !FMSub.hasOneUse() ||
42663 FMAdd.getOperand(2) != FMSub.getOperand(2))
42664 return SDValue();
42665
42666 // Check for correct shuffle mask.
42667 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
42668 bool Op0Even;
42669 if (!isAddSubOrSubAddMask(Mask, Op0Even))
42670 return SDValue();
42671
42672 // FMAddSub takes zeroth operand from FMSub node.
42673 SDLoc DL(N);
42674 bool IsSubAdd = Op0Even ? Op0 == FMAdd : Op1 == FMAdd;
42675 unsigned Opcode = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
42676 return DAG.getNode(Opcode, DL, VT, FMAdd.getOperand(0), FMAdd.getOperand(1),
42677 FMAdd.getOperand(2));
42678}
42679
42680/// Try to combine a shuffle into a target-specific add-sub or
42681/// mul-add-sub node.
42682static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
42683 const X86Subtarget &Subtarget,
42684 SelectionDAG &DAG) {
42685 if (SDValue V = combineShuffleToFMAddSub(N, Subtarget, DAG))
42686 return V;
42687
42688 SDValue Opnd0, Opnd1;
42689 bool IsSubAdd;
42690 if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd))
42691 return SDValue();
42692
42693 MVT VT = N->getSimpleValueType(0);
42694 SDLoc DL(N);
42695
42696 // Try to generate X86ISD::FMADDSUB node here.
42697 SDValue Opnd2;
42698 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2)) {
42699 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
42700 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
42701 }
42702
42703 if (IsSubAdd)
42704 return SDValue();
42705
42706 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
42707 // the ADDSUB idiom has been successfully recognized. There are no known
42708 // X86 targets with 512-bit ADDSUB instructions!
42709 if (VT.is512BitVector())
42710 return SDValue();
42711
42712 // Do not generate X86ISD::ADDSUB node for FP16's vector types even though
42713 // the ADDSUB idiom has been successfully recognized. There are no known
42714 // X86 targets with FP16 ADDSUB instructions!
42715 if (VT.getVectorElementType() == MVT::f16)
42716 return SDValue();
42717
42718 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
42719}
42720
42721// We are looking for a shuffle where both sources are concatenated with undef
42722// and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
42723// if we can express this as a single-source shuffle, that's preferable.
42724static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
42725 const X86Subtarget &Subtarget) {
42726 if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
42727 return SDValue();
42728
42729 EVT VT = N->getValueType(0);
42730
42731 // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
42732 if (!VT.is128BitVector() && !VT.is256BitVector())
42733 return SDValue();
42734
42735 if (VT.getVectorElementType() != MVT::i32 &&
42736 VT.getVectorElementType() != MVT::i64 &&
42737 VT.getVectorElementType() != MVT::f32 &&
42738 VT.getVectorElementType() != MVT::f64)
42739 return SDValue();
42740
42741 SDValue N0 = N->getOperand(0);
42742 SDValue N1 = N->getOperand(1);
42743
42744 // Check that both sources are concats with undef.
42745 if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
42746 N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
42747 N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
42748 !N1.getOperand(1).isUndef())
42749 return SDValue();
42750
42751 // Construct the new shuffle mask. Elements from the first source retain their
42752 // index, but elements from the second source no longer need to skip an undef.
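// E.g. (illustrative) for a v8i32 shuffle, index 9 (element 1 of the second
// concat) becomes index 5, which addresses t2's element 1 in the new
// concat(t1, t2).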
42753 SmallVector<int, 8> Mask;
42754 int NumElts = VT.getVectorNumElements();
42755
42756 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
42757 for (int Elt : SVOp->getMask())
42758 Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
42759
42760 SDLoc DL(N);
42761 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
42762 N1.getOperand(0));
42763 return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
42764}
42765
42766/// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the
42767/// low half of each source vector and does not set any high half elements in
42768/// the destination vector, narrow the shuffle to half its original size.
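/// For example (illustrative), a v8f32 shuffle with mask
/// <0, 8, 1, 9, -1, -1, -1, -1> only reads the low 128 bits of each source and
/// leaves the upper half undef, so it can be done as a v4f32 shuffle with mask
/// <0, 4, 1, 5> on the low halves.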
42769static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG) {
42770 if (!Shuf->getValueType(0).isSimple())
42771 return SDValue();
42772 MVT VT = Shuf->getSimpleValueType(0);
42773 if (!VT.is256BitVector() && !VT.is512BitVector())
42774 return SDValue();
42775
42776 // See if we can ignore all of the high elements of the shuffle.
42777 ArrayRef<int> Mask = Shuf->getMask();
42778 if (!isUndefUpperHalf(Mask))
42779 return SDValue();
42780
42781 // Check if the shuffle mask accesses only the low half of each input vector
42782 // (half-index output is 0 or 2).
42783 int HalfIdx1, HalfIdx2;
42784 SmallVector<int, 8> HalfMask(Mask.size() / 2);
42785 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2) ||
42786 (HalfIdx1 % 2 == 1) || (HalfIdx2 % 2 == 1))
42787 return SDValue();
42788
42789 // Create a half-width shuffle to replace the unnecessarily wide shuffle.
42790 // The trick is knowing that all of the insert/extract are actually free
42791 // subregister (zmm<->ymm or ymm<->xmm) ops. That leaves us with a shuffle
42792 // of narrow inputs into a narrow output, and that is always cheaper than
42793 // the wide shuffle that we started with.
42794 return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0),
42795 Shuf->getOperand(1), HalfMask, HalfIdx1,
42796 HalfIdx2, false, DAG, /*UseConcat*/true);
42797}
42798
42799static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
42800 TargetLowering::DAGCombinerInfo &DCI,
42801 const X86Subtarget &Subtarget) {
42802 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N))
42803 if (SDValue V = narrowShuffle(Shuf, DAG))
42804 return V;
42805
42806 // If we have legalized the vector types, look for blends of FADD and FSUB
42807 // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
42808 SDLoc dl(N);
42809 EVT VT = N->getValueType(0);
42810 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42811 if (TLI.isTypeLegal(VT))
42812 if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
42813 return AddSub;
42814
42815 // Attempt to combine into a vector load/broadcast.
42816 if (SDValue LD = combineToConsecutiveLoads(
42817 VT, SDValue(N, 0), dl, DAG, Subtarget, /*IsAfterLegalize*/ true))
42818 return LD;
42819
42820 // For AVX2, we sometimes want to combine
42821 // (vector_shuffle <mask> (concat_vectors t1, undef)
42822 // (concat_vectors t2, undef))
42823 // Into:
42824 // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
42825 // Since the latter can be efficiently lowered with VPERMD/VPERMQ
42826 if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
42827 return ShufConcat;
42828
42829 if (isTargetShuffle(N->getOpcode())) {
42830 SDValue Op(N, 0);
42831 if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
42832 return Shuffle;
42833
42834 // Try recursively combining arbitrary sequences of x86 shuffle
42835 // instructions into higher-order shuffles. We do this after combining
42836 // specific PSHUF instruction sequences into their minimal form so that we
42837 // can evaluate how many specialized shuffle instructions are involved in
42838 // a particular chain.
42839 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
42840 return Res;
42841
42842 // Simplify source operands based on shuffle mask.
42843 // TODO - merge this into combineX86ShufflesRecursively.
42844 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
42845 if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, DCI))
42846 return SDValue(N, 0);
42847
42848 // Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
42849 // Perform this after other shuffle combines to allow inner shuffles to be
42850 // combined away first.
42851 if (SDValue BinOp = canonicalizeShuffleWithBinOps(Op, DAG, dl))
42852 return BinOp;
42853 }
42854
42855 return SDValue();
42856}
42857
42858// Simplify variable target shuffle masks based on the demanded elements.
42859// TODO: Handle DemandedBits in mask indices as well?
42860bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetShuffle(
42861 SDValue Op, const APInt &DemandedElts, unsigned MaskIndex,
42862 TargetLowering::TargetLoweringOpt &TLO, unsigned Depth) const {
42863 // If we're demanding all elements don't bother trying to simplify the mask.
42864 unsigned NumElts = DemandedElts.getBitWidth();
42865 if (DemandedElts.isAllOnes())
42866 return false;
42867
42868 SDValue Mask = Op.getOperand(MaskIndex);
42869 if (!Mask.hasOneUse())
42870 return false;
42871
42872 // Attempt to generically simplify the variable shuffle mask.
42873 APInt MaskUndef, MaskZero;
42874 if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
42875 Depth + 1))
42876 return true;
42877
42878 // Attempt to extract+simplify a (constant pool load) shuffle mask.
42879 // TODO: Support other types from getTargetShuffleMaskIndices?
42880 SDValue BC = peekThroughOneUseBitcasts(Mask);
42881 EVT BCVT = BC.getValueType();
42882 auto *Load = dyn_cast<LoadSDNode>(BC);
42883 if (!Load)
42884 return false;
42885
42886 const Constant *C = getTargetConstantFromNode(Load);
42887 if (!C)
42888 return false;
42889
42890 Type *CTy = C->getType();
42891 if (!CTy->isVectorTy() ||
42892 CTy->getPrimitiveSizeInBits() != Mask.getValueSizeInBits())
42893 return false;
42894
42895 // Handle scaling for i64 elements on 32-bit targets.
42896 unsigned NumCstElts = cast<FixedVectorType>(CTy)->getNumElements();
42897 if (NumCstElts != NumElts && NumCstElts != (NumElts * 2))
42898 return false;
42899 unsigned Scale = NumCstElts / NumElts;
42900
42901 // Simplify mask if we have an undemanded element that is not undef.
42902 bool Simplified = false;
42903 SmallVector<Constant *, 32> ConstVecOps;
42904 for (unsigned i = 0; i != NumCstElts; ++i) {
42905 Constant *Elt = C->getAggregateElement(i);
42906 if (!DemandedElts[i / Scale] && !isa<UndefValue>(Elt)) {
42907 ConstVecOps.push_back(UndefValue::get(Elt->getType()));
42908 Simplified = true;
42909 continue;
42910 }
42911 ConstVecOps.push_back(Elt);
42912 }
42913 if (!Simplified)
42914 return false;
42915
42916 // Generate new constant pool entry + legalize immediately for the load.
42917 SDLoc DL(Op);
42918 SDValue CV = TLO.DAG.getConstantPool(ConstantVector::get(ConstVecOps), BCVT);
42919 SDValue LegalCV = LowerConstantPool(CV, TLO.DAG);
42920 SDValue NewMask = TLO.DAG.getLoad(
42921 BCVT, DL, TLO.DAG.getEntryNode(), LegalCV,
42922 MachinePointerInfo::getConstantPool(TLO.DAG.getMachineFunction()),
42923 Load->getAlign());
42924 return TLO.CombineTo(Mask, TLO.DAG.getBitcast(Mask.getValueType(), NewMask));
42925}
42926
42927bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
42928 SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
42929 TargetLoweringOpt &TLO, unsigned Depth) const {
42930 int NumElts = DemandedElts.getBitWidth();
42931 unsigned Opc = Op.getOpcode();
42932 EVT VT = Op.getValueType();
42933
42934 // Handle special case opcodes.
42935 switch (Opc) {
42936 case X86ISD::PMULDQ:
42937 case X86ISD::PMULUDQ: {
42938 APInt LHSUndef, LHSZero;
42939 APInt RHSUndef, RHSZero;
42940 SDValue LHS = Op.getOperand(0);
42941 SDValue RHS = Op.getOperand(1);
42942 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
42943 Depth + 1))
42944 return true;
42945 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
42946 Depth + 1))
42947 return true;
42948 // Multiply by zero.
42949 KnownZero = LHSZero | RHSZero;
42950 break;
42951 }
42952 case X86ISD::VPMADDWD: {
42953 APInt LHSUndef, LHSZero;
42954 APInt RHSUndef, RHSZero;
42955 SDValue LHS = Op.getOperand(0);
42956 SDValue RHS = Op.getOperand(1);
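// Each result element uses two adjacent source elements, so scale the demanded
// mask up by 2: e.g. (illustrative) demanding only result element 1 of a v4i32
// VPMADDWD demands elements 2 and 3 of the v8i16 sources.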
42957 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, 2 * NumElts);
42958
42959 if (SimplifyDemandedVectorElts(LHS, DemandedSrcElts, LHSUndef, LHSZero, TLO,
42960 Depth + 1))
42961 return true;
42962 if (SimplifyDemandedVectorElts(RHS, DemandedSrcElts, RHSUndef, RHSZero, TLO,
42963 Depth + 1))
42964 return true;
42965
42966 // TODO: Multiply by zero.
42967
42968 // If RHS/LHS elements are known zero then we don't need the LHS/RHS equivalent.
42969 APInt DemandedLHSElts = DemandedSrcElts & ~RHSZero;
42970 if (SimplifyDemandedVectorElts(LHS, DemandedLHSElts, LHSUndef, LHSZero, TLO,
42971 Depth + 1))
42972 return true;
42973 APInt DemandedRHSElts = DemandedSrcElts & ~LHSZero;
42974 if (SimplifyDemandedVectorElts(RHS, DemandedRHSElts, RHSUndef, RHSZero, TLO,
42975 Depth + 1))
42976 return true;
42977 break;
42978 }
42979 case X86ISD::PSADBW: {
42980 SDValue LHS = Op.getOperand(0);
42981 SDValue RHS = Op.getOperand(1);
42982 assert(VT.getScalarType() == MVT::i64 &&
42983 LHS.getValueType() == RHS.getValueType() &&
42984 LHS.getValueType().getScalarType() == MVT::i8 &&
42985 "Unexpected PSADBW types");
42986
42987 // Aggressively peek through ops to get at the demanded elts.
42988 if (!DemandedElts.isAllOnes()) {
42989 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
42990 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
42991 SDValue NewLHS = SimplifyMultipleUseDemandedVectorElts(
42992 LHS, DemandedSrcElts, TLO.DAG, Depth + 1);
42993 SDValue NewRHS = SimplifyMultipleUseDemandedVectorElts(
42994 RHS, DemandedSrcElts, TLO.DAG, Depth + 1);
42995 if (NewLHS || NewRHS) {
42996 NewLHS = NewLHS ? NewLHS : LHS;
42997 NewRHS = NewRHS ? NewRHS : RHS;
42998 return TLO.CombineTo(
42999 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
43000 }
43001 }
43002 break;
43003 }
43004 case X86ISD::VSHL:
43005 case X86ISD::VSRL:
43006 case X86ISD::VSRA: {
43007 // We only need the bottom 64-bits of the (128-bit) shift amount.
43008 SDValue Amt = Op.getOperand(1);
43009 MVT AmtVT = Amt.getSimpleValueType();
43010 assert(AmtVT.is128BitVector() && "Unexpected value type");
43011
43012 // If the shift amount is only ever reused as an SSE shift amount then we
43013 // know that only the bottom 64-bits are ever used.
43014 bool AssumeSingleUse = llvm::all_of(Amt->uses(), [&Amt](SDNode *Use) {
43015 unsigned UseOpc = Use->getOpcode();
43016 return (UseOpc == X86ISD::VSHL || UseOpc == X86ISD::VSRL ||
43017 UseOpc == X86ISD::VSRA) &&
43018 Use->getOperand(0) != Amt;
43019 });
43020
43021 APInt AmtUndef, AmtZero;
43022 unsigned NumAmtElts = AmtVT.getVectorNumElements();
43023 APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2);
43024 if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO,
43025 Depth + 1, AssumeSingleUse))
43026 return true;
43027 [[fallthrough]];
43028 }
43029 case X86ISD::VSHLI:
43030 case X86ISD::VSRLI:
43031 case X86ISD::VSRAI: {
43032 SDValue Src = Op.getOperand(0);
43033 APInt SrcUndef;
43034 if (SimplifyDemandedVectorElts(Src, DemandedElts, SrcUndef, KnownZero, TLO,
43035 Depth + 1))
43036 return true;
43037
43038 // Fold shift(0,x) -> 0
43039 if (DemandedElts.isSubsetOf(KnownZero))
43040 return TLO.CombineTo(
43041 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
43042
43043 // Aggressively peek through ops to get at the demanded elts.
43044 if (!DemandedElts.isAllOnes())
43045 if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
43046 Src, DemandedElts, TLO.DAG, Depth + 1))
43047 return TLO.CombineTo(
43048 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc, Op.getOperand(1)));
43049 break;
43050 }
43051 case X86ISD::VPSHA:
43052 case X86ISD::VPSHL:
43053 case X86ISD::VSHLV:
43054 case X86ISD::VSRLV:
43055 case X86ISD::VSRAV: {
43056 APInt LHSUndef, LHSZero;
43057 APInt RHSUndef, RHSZero;
43058 SDValue LHS = Op.getOperand(0);
43059 SDValue RHS = Op.getOperand(1);
43060 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
43061 Depth + 1))
43062 return true;
43063
43064 // Fold shift(0,x) -> 0
43065 if (DemandedElts.isSubsetOf(LHSZero))
43066 return TLO.CombineTo(
43067 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
43068
43069 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
43070 Depth + 1))
43071 return true;
43072
43073 KnownZero = LHSZero;
43074 break;
43075 }
43076 case X86ISD::KSHIFTL: {
43077 SDValue Src = Op.getOperand(0);
43078 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
43079 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
43080 unsigned ShiftAmt = Amt->getZExtValue();
43081
43082 if (ShiftAmt == 0)
43083 return TLO.CombineTo(Op, Src);
43084
43085 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
43086 // single shift. We can do this if the bottom bits (which are shifted
43087 // out) are never demanded.
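// E.g. (illustrative) kshiftl(kshiftr(X, 2), 3) with the low 3 elements not
// demanded can be folded to kshiftl(X, 1), since Diff = 3 - 2 = 1.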
43088 if (Src.getOpcode() == X86ISD::KSHIFTR) {
43089 if (!DemandedElts.intersects(APInt::getLowBitsSet(NumElts, ShiftAmt))) {
43090 unsigned C1 = Src.getConstantOperandVal(1);
43091 unsigned NewOpc = X86ISD::KSHIFTL;
43092 int Diff = ShiftAmt - C1;
43093 if (Diff < 0) {
43094 Diff = -Diff;
43095 NewOpc = X86ISD::KSHIFTR;
43096 }
43097
43098 SDLoc dl(Op);
43099 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
43100 return TLO.CombineTo(
43101 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
43102 }
43103 }
43104
43105 APInt DemandedSrc = DemandedElts.lshr(ShiftAmt);
43106 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
43107 Depth + 1))
43108 return true;
43109
43110 KnownUndef <<= ShiftAmt;
43111 KnownZero <<= ShiftAmt;
43112 KnownZero.setLowBits(ShiftAmt);
43113 break;
43114 }
43115 case X86ISD::KSHIFTR: {
43116 SDValue Src = Op.getOperand(0);
43117 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
43118 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
43119 unsigned ShiftAmt = Amt->getZExtValue();
43120
43121 if (ShiftAmt == 0)
43122 return TLO.CombineTo(Op, Src);
43123
43124 // If this is ((X << C1) >>u ShAmt), see if we can simplify this into a
43125 // single shift. We can do this if the top bits (which are shifted
43126 // out) are never demanded.
43127 if (Src.getOpcode() == X86ISD::KSHIFTL) {
43128 if (!DemandedElts.intersects(APInt::getHighBitsSet(NumElts, ShiftAmt))) {
43129 unsigned C1 = Src.getConstantOperandVal(1);
43130 unsigned NewOpc = X86ISD::KSHIFTR;
43131 int Diff = ShiftAmt - C1;
43132 if (Diff < 0) {
43133 Diff = -Diff;
43134 NewOpc = X86ISD::KSHIFTL;
43135 }
43136
43137 SDLoc dl(Op);
43138 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
43139 return TLO.CombineTo(
43140 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
43141 }
43142 }
43143
43144 APInt DemandedSrc = DemandedElts.shl(ShiftAmt);
43145 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
43146 Depth + 1))
43147 return true;
43148
43149 KnownUndef.lshrInPlace(ShiftAmt);
43150 KnownZero.lshrInPlace(ShiftAmt);
43151 KnownZero.setHighBits(ShiftAmt);
43152 break;
43153 }
43154 case X86ISD::ANDNP: {
43155 // ANDNP = (~LHS & RHS);
43156 SDValue LHS = Op.getOperand(0);
43157 SDValue RHS = Op.getOperand(1);
43158
43159 auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
43160 APInt UndefElts;
43161 SmallVector<APInt> EltBits;
43162 int NumElts = VT.getVectorNumElements();
43163 int EltSizeInBits = VT.getScalarSizeInBits();
43164 APInt OpBits = APInt::getAllOnes(EltSizeInBits);
43165 APInt OpElts = DemandedElts;
43166 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
43167 EltBits)) {
43168 OpBits.clearAllBits();
43169 OpElts.clearAllBits();
43170 for (int I = 0; I != NumElts; ++I) {
43171 if (!DemandedElts[I])
43172 continue;
43173 if (UndefElts[I]) {
43174 // We can't assume an undef src element gives an undef dst - the
43175 // other src might be zero.
43176 OpBits.setAllBits();
43177 OpElts.setBit(I);
43178 } else if ((Invert && !EltBits[I].isAllOnes()) ||
43179 (!Invert && !EltBits[I].isZero())) {
43180 OpBits |= Invert ? ~EltBits[I] : EltBits[I];
43181 OpElts.setBit(I);
43182 }
43183 }
43184 }
43185 return std::make_pair(OpBits, OpElts);
43186 };
43187 APInt BitsLHS, EltsLHS;
43188 APInt BitsRHS, EltsRHS;
43189 std::tie(BitsLHS, EltsLHS) = GetDemandedMasks(RHS);
43190 std::tie(BitsRHS, EltsRHS) = GetDemandedMasks(LHS, true);
43191
43192 APInt LHSUndef, LHSZero;
43193 APInt RHSUndef, RHSZero;
43194 if (SimplifyDemandedVectorElts(LHS, EltsLHS, LHSUndef, LHSZero, TLO,
43195 Depth + 1))
43196 return true;
43197 if (SimplifyDemandedVectorElts(RHS, EltsRHS, RHSUndef, RHSZero, TLO,
43198 Depth + 1))
43199 return true;
43200
43201 if (!DemandedElts.isAllOnes()) {
43202 SDValue NewLHS = SimplifyMultipleUseDemandedBits(LHS, BitsLHS, EltsLHS,
43203 TLO.DAG, Depth + 1);
43204 SDValue NewRHS = SimplifyMultipleUseDemandedBits(RHS, BitsRHS, EltsRHS,
43205 TLO.DAG, Depth + 1);
43206 if (NewLHS || NewRHS) {
43207 NewLHS = NewLHS ? NewLHS : LHS;
43208 NewRHS = NewRHS ? NewRHS : RHS;
43209 return TLO.CombineTo(
43210 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
43211 }
43212 }
43213 break;
43214 }
43215 case X86ISD::CVTSI2P:
43216 case X86ISD::CVTUI2P: {
43217 SDValue Src = Op.getOperand(0);
43218 MVT SrcVT = Src.getSimpleValueType();
43219 APInt SrcUndef, SrcZero;
43220 APInt SrcElts = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
43221 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
43222 Depth + 1))
43223 return true;
43224 break;
43225 }
43226 case X86ISD::PACKSS:
43227 case X86ISD::PACKUS: {
43228 SDValue N0 = Op.getOperand(0);
43229 SDValue N1 = Op.getOperand(1);
43230
43231 APInt DemandedLHS, DemandedRHS;
43232 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
43233
43234 APInt LHSUndef, LHSZero;
43235 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
43236 Depth + 1))
43237 return true;
43238 APInt RHSUndef, RHSZero;
43239 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
43240 Depth + 1))
43241 return true;
43242
43243 // TODO - pass on known zero/undef.
43244
43245 // Aggressively peek through ops to get at the demanded elts.
43246 // TODO - we should do this for all target/faux shuffles ops.
43247 if (!DemandedElts.isAllOnes()) {
43248 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
43249 TLO.DAG, Depth + 1);
43250 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
43251 TLO.DAG, Depth + 1);
43252 if (NewN0 || NewN1) {
43253 NewN0 = NewN0 ? NewN0 : N0;
43254 NewN1 = NewN1 ? NewN1 : N1;
43255 return TLO.CombineTo(Op,
43256 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
43257 }
43258 }
43259 break;
43260 }
43261 case X86ISD::HADD:
43262 case X86ISD::HSUB:
43263 case X86ISD::FHADD:
43264 case X86ISD::FHSUB: {
43265 SDValue N0 = Op.getOperand(0);
43266 SDValue N1 = Op.getOperand(1);
43267
43268 APInt DemandedLHS, DemandedRHS;
43269 getHorizDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
43270
43271 APInt LHSUndef, LHSZero;
43272 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
43273 Depth + 1))
43274 return true;
43275 APInt RHSUndef, RHSZero;
43276 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
43277 Depth + 1))
43278 return true;
43279
43280 // TODO - pass on known zero/undef.
43281
43282 // Aggressively peek through ops to get at the demanded elts.
43283 // TODO: Handle repeated operands.
43284 if (N0 != N1 && !DemandedElts.isAllOnes()) {
43285 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
43286 TLO.DAG, Depth + 1);
43287 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
43288 TLO.DAG, Depth + 1);
43289 if (NewN0 || NewN1) {
43290 NewN0 = NewN0 ? NewN0 : N0;
43291 NewN1 = NewN1 ? NewN1 : N1;
43292 return TLO.CombineTo(Op,
43293 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
43294 }
43295 }
43296 break;
43297 }
43298 case X86ISD::VTRUNC:
43299 case X86ISD::VTRUNCS:
43300 case X86ISD::VTRUNCUS: {
43301 SDValue Src = Op.getOperand(0);
43302 MVT SrcVT = Src.getSimpleValueType();
43303 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
43304 APInt SrcUndef, SrcZero;
43305 if (SimplifyDemandedVectorElts(Src, DemandedSrc, SrcUndef, SrcZero, TLO,
43306 Depth + 1))
43307 return true;
43308 KnownZero = SrcZero.zextOrTrunc(NumElts);
43309 KnownUndef = SrcUndef.zextOrTrunc(NumElts);
43310 break;
43311 }
43312 case X86ISD::BLENDV: {
43313 APInt SelUndef, SelZero;
43314 if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef,
43315 SelZero, TLO, Depth + 1))
43316 return true;
43317
43318 // TODO: Use SelZero to adjust LHS/RHS DemandedElts.
43319 APInt LHSUndef, LHSZero;
43320 if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, LHSUndef,
43321 LHSZero, TLO, Depth + 1))
43322 return true;
43323
43324 APInt RHSUndef, RHSZero;
43325 if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedElts, RHSUndef,
43326 RHSZero, TLO, Depth + 1))
43327 return true;
43328
43329 KnownZero = LHSZero & RHSZero;
43330 KnownUndef = LHSUndef & RHSUndef;
43331 break;
43332 }
43333 case X86ISD::VZEXT_MOVL: {
43334 // If upper demanded elements are already zero then we have nothing to do.
43335 SDValue Src = Op.getOperand(0);
43336 APInt DemandedUpperElts = DemandedElts;
43337 DemandedUpperElts.clearLowBits(1);
43338 if (TLO.DAG.MaskedVectorIsZero(Src, DemandedUpperElts, Depth + 1))
43339 return TLO.CombineTo(Op, Src);
43340 break;
43341 }
43342 case X86ISD::VBROADCAST: {
43343 SDValue Src = Op.getOperand(0);
43344 MVT SrcVT = Src.getSimpleValueType();
43345 if (!SrcVT.isVector())
43346 break;
43347 // Don't bother broadcasting if we just need the 0'th element.
43348 if (DemandedElts == 1) {
43349 if (Src.getValueType() != VT)
43350 Src = widenSubVector(VT.getSimpleVT(), Src, false, Subtarget, TLO.DAG,
43351 SDLoc(Op));
43352 return TLO.CombineTo(Op, Src);
43353 }
43354 APInt SrcUndef, SrcZero;
43355 APInt SrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0);
43356 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
43357 Depth + 1))
43358 return true;
43359 // Aggressively peek through src to get at the demanded elt.
43360 // TODO - we should do this for all target/faux shuffles ops.
43361 if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
43362 Src, SrcElts, TLO.DAG, Depth + 1))
43363 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
43364 break;
43365 }
43366 case X86ISD::VPERMV:
43367 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 0, TLO,
43368 Depth))
43369 return true;
43370 break;
43371 case X86ISD::PSHUFB:
43372 case X86ISD::VPERMV3:
43373 case X86ISD::VPERMILPV:
43374 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 1, TLO,
43375 Depth))
43376 return true;
43377 break;
43378 case X86ISD::VPPERM:
43379 case X86ISD::VPERMIL2:
43380 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 2, TLO,
43381 Depth))
43382 return true;
43383 break;
43384 }
43385
43386 // For 256/512-bit ops that are 128/256-bit ops glued together, if we do not
43387 // demand any of the high elements, then narrow the op to 128/256-bits: e.g.
43388 // (op ymm0, ymm1) --> insert undef, (op xmm0, xmm1), 0
43389 if ((VT.is256BitVector() || VT.is512BitVector()) &&
43390 DemandedElts.lshr(NumElts / 2) == 0) {
43391 unsigned SizeInBits = VT.getSizeInBits();
43392 unsigned ExtSizeInBits = SizeInBits / 2;
43393
43394 // See if 512-bit ops only use the bottom 128-bits.
43395 if (VT.is512BitVector() && DemandedElts.lshr(NumElts / 4) == 0)
43396 ExtSizeInBits = SizeInBits / 4;
43397
43398 switch (Opc) {
43399 // Scalar broadcast.
43400 case X86ISD::VBROADCAST: {
43401 SDLoc DL(Op);
43402 SDValue Src = Op.getOperand(0);
43403 if (Src.getValueSizeInBits() > ExtSizeInBits)
43404 Src = extractSubVector(Src, 0, TLO.DAG, DL, ExtSizeInBits);
43405 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
43406 ExtSizeInBits / VT.getScalarSizeInBits());
43407 SDValue Bcst = TLO.DAG.getNode(X86ISD::VBROADCAST, DL, BcstVT, Src);
43408 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
43409 TLO.DAG, DL, ExtSizeInBits));
43410 }
43411 case X86ISD::VBROADCAST_LOAD: {
43412 SDLoc DL(Op);
43413 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
43414 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
43415 ExtSizeInBits / VT.getScalarSizeInBits());
43416 SDVTList Tys = TLO.DAG.getVTList(BcstVT, MVT::Other);
43417 SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)};
43418 SDValue Bcst = TLO.DAG.getMemIntrinsicNode(
43419 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MemIntr->getMemoryVT(),
43420 MemIntr->getMemOperand());
43421 TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
43422 Bcst.getValue(1));
43423 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
43424 TLO.DAG, DL, ExtSizeInBits));
43425 }
43426 // Subvector broadcast.
43427 case X86ISD::SUBV_BROADCAST_LOAD: {
43428 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
43429 EVT MemVT = MemIntr->getMemoryVT();
43430 if (ExtSizeInBits == MemVT.getStoreSizeInBits()) {
43431 SDLoc DL(Op);
43432 SDValue Ld =
43433 TLO.DAG.getLoad(MemVT, DL, MemIntr->getChain(),
43434 MemIntr->getBasePtr(), MemIntr->getMemOperand());
43435 TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
43436 Ld.getValue(1));
43437 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Ld, 0,
43438 TLO.DAG, DL, ExtSizeInBits));
43439 } else if ((ExtSizeInBits % MemVT.getStoreSizeInBits()) == 0) {
43440 SDLoc DL(Op);
43441 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
43442 ExtSizeInBits / VT.getScalarSizeInBits());
43443 if (SDValue BcstLd =
43444 getBROADCAST_LOAD(Opc, DL, BcstVT, MemVT, MemIntr, 0, TLO.DAG))
43445 return TLO.CombineTo(Op,
43446 insertSubVector(TLO.DAG.getUNDEF(VT), BcstLd, 0,
43447 TLO.DAG, DL, ExtSizeInBits));
43448 }
43449 break;
43450 }
43451 // Byte shifts by immediate.
43452 case X86ISD::VSHLDQ:
43453 case X86ISD::VSRLDQ:
43454 // Shift by uniform.
43455 case X86ISD::VSHL:
43456 case X86ISD::VSRL:
43457 case X86ISD::VSRA:
43458 // Shift by immediate.
43459 case X86ISD::VSHLI:
43460 case X86ISD::VSRLI:
43461 case X86ISD::VSRAI: {
43462 SDLoc DL(Op);
43463 SDValue Ext0 =
43464 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
43465 SDValue ExtOp =
43466 TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0, Op.getOperand(1));
43467 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
43468 SDValue Insert =
43469 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
43470 return TLO.CombineTo(Op, Insert);
43471 }
43472 case X86ISD::VPERMI: {
43473 // Simplify PERMPD/PERMQ to extract_subvector.
43474 // TODO: This should be done in shuffle combining.
43475 if (VT == MVT::v4f64 || VT == MVT::v4i64) {
43476 SmallVector<int, 4> Mask;
43477 DecodeVPERMMask(NumElts, Op.getConstantOperandVal(1), Mask);
43478 if (isUndefOrEqual(Mask[0], 2) && isUndefOrEqual(Mask[1], 3)) {
43479 SDLoc DL(Op);
43480 SDValue Ext = extractSubVector(Op.getOperand(0), 2, TLO.DAG, DL, 128);
43481 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
43482 SDValue Insert = insertSubVector(UndefVec, Ext, 0, TLO.DAG, DL, 128);
43483 return TLO.CombineTo(Op, Insert);
43484 }
43485 }
43486 break;
43487 }
43488 case X86ISD::VPERM2X128: {
43489 // Simplify VPERM2F128/VPERM2I128 to extract_subvector.
43490 SDLoc DL(Op);
43491 unsigned LoMask = Op.getConstantOperandVal(2) & 0xF;
43492 if (LoMask & 0x8)
43493 return TLO.CombineTo(
43494 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, DL));
43495 unsigned EltIdx = (LoMask & 0x1) * (NumElts / 2);
43496 unsigned SrcIdx = (LoMask & 0x2) >> 1;
43497 SDValue ExtOp =
43498 extractSubVector(Op.getOperand(SrcIdx), EltIdx, TLO.DAG, DL, 128);
43499 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
43500 SDValue Insert =
43501 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
43502 return TLO.CombineTo(Op, Insert);
43503 }
43504 // Zero upper elements.
43505 case X86ISD::VZEXT_MOVL:
43506 // Target unary shuffles by immediate:
43507 case X86ISD::PSHUFD:
43508 case X86ISD::PSHUFLW:
43509 case X86ISD::PSHUFHW:
43510 case X86ISD::VPERMILPI:
43511 // (Non-Lane Crossing) Target Shuffles.
43512 case X86ISD::VPERMILPV:
43513 case X86ISD::VPERMIL2:
43514 case X86ISD::PSHUFB:
43515 case X86ISD::UNPCKL:
43516 case X86ISD::UNPCKH:
43517 case X86ISD::BLENDI:
43518 // Integer ops.
43519 case X86ISD::PACKSS:
43520 case X86ISD::PACKUS:
43521 // Horizontal Ops.
43522 case X86ISD::HADD:
43523 case X86ISD::HSUB:
43524 case X86ISD::FHADD:
43525 case X86ISD::FHSUB: {
43526 SDLoc DL(Op);
43527 SmallVector<SDValue, 4> Ops;
43528 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
43529 SDValue SrcOp = Op.getOperand(i);
43530 EVT SrcVT = SrcOp.getValueType();
43531 assert((!SrcVT.isVector() || SrcVT.getSizeInBits() == SizeInBits) &&
43532 "Unsupported vector size");
43533 Ops.push_back(SrcVT.isVector() ? extractSubVector(SrcOp, 0, TLO.DAG, DL,
43534 ExtSizeInBits)
43535 : SrcOp);
43536 }
43537 MVT ExtVT = VT.getSimpleVT();
43538 ExtVT = MVT::getVectorVT(ExtVT.getScalarType(),
43539 ExtSizeInBits / ExtVT.getScalarSizeInBits());
43540 SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ops);
43541 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
43542 SDValue Insert =
43543 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
43544 return TLO.CombineTo(Op, Insert);
43545 }
43546 }
43547 }
43548
43549 // For splats, unless we *only* demand the 0'th element, stop attempts at
43550 // simplification here; we aren't going to improve things, and this is
43551 // better than any potential shuffle.
43552 if (!DemandedElts.isOne() && TLO.DAG.isSplatValue(Op, /*AllowUndefs*/false))
43553 return false;
43554
43555 // Get target/faux shuffle mask.
43556 APInt OpUndef, OpZero;
43557 SmallVector<int, 64> OpMask;
43558 SmallVector<SDValue, 2> OpInputs;
43559 if (!getTargetShuffleInputs(Op, DemandedElts, OpInputs, OpMask, OpUndef,
43560 OpZero, TLO.DAG, Depth, false))
43561 return false;
43562
43563 // Shuffle inputs must be the same size as the result.
43564 if (OpMask.size() != (unsigned)NumElts ||
43565 llvm::any_of(OpInputs, [VT](SDValue V) {
43566 return VT.getSizeInBits() != V.getValueSizeInBits() ||
43567 !V.getValueType().isVector();
43568 }))
43569 return false;
43570
43571 KnownZero = OpZero;
43572 KnownUndef = OpUndef;
43573
43574 // Check if shuffle mask can be simplified to undef/zero/identity.
43575 int NumSrcs = OpInputs.size();
43576 for (int i = 0; i != NumElts; ++i)
43577 if (!DemandedElts[i])
43578 OpMask[i] = SM_SentinelUndef;
43579
43580 if (isUndefInRange(OpMask, 0, NumElts)) {
43581 KnownUndef.setAllBits();
43582 return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
43583 }
43584 if (isUndefOrZeroInRange(OpMask, 0, NumElts)) {
43585 KnownZero.setAllBits();
43586 return TLO.CombineTo(
43587 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
43588 }
43589 for (int Src = 0; Src != NumSrcs; ++Src)
43590 if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts))
43591 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, OpInputs[Src]));
43592
43593 // Attempt to simplify inputs.
43594 for (int Src = 0; Src != NumSrcs; ++Src) {
43595 // TODO: Support inputs of different types.
43596 if (OpInputs[Src].getValueType() != VT)
43597 continue;
43598
43599 int Lo = Src * NumElts;
43600 APInt SrcElts = APInt::getZero(NumElts);
43601 for (int i = 0; i != NumElts; ++i)
43602 if (DemandedElts[i]) {
43603 int M = OpMask[i] - Lo;
43604 if (0 <= M && M < NumElts)
43605 SrcElts.setBit(M);
43606 }
43607
43608 // TODO - Propagate input undef/zero elts.
43609 APInt SrcUndef, SrcZero;
43610 if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero,
43611 TLO, Depth + 1))
43612 return true;
43613 }
43614
43615 // If we don't demand all elements, then attempt to combine to a simpler
43616 // shuffle.
43617 // We need to convert the depth to something combineX86ShufflesRecursively
43618 // can handle - so pretend it's Depth == 0 again, and reduce the max depth
43619 // to match. This prevents combineX86ShuffleChain from returning a
43620 // combined shuffle that's the same as the original root, causing an
43621 // infinite loop.
43622 if (!DemandedElts.isAllOnes()) {
43623 assert(Depth < X86::MaxShuffleCombineDepth && "Depth out of range");
43624
43625 SmallVector<int, 64> DemandedMask(NumElts, SM_SentinelUndef);
43626 for (int i = 0; i != NumElts; ++i)
43627 if (DemandedElts[i])
43628 DemandedMask[i] = i;
43629
43630 SDValue NewShuffle = combineX86ShufflesRecursively(
43631 {Op}, 0, Op, DemandedMask, {}, 0, X86::MaxShuffleCombineDepth - Depth,
43632 /*HasVarMask*/ false,
43633 /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, TLO.DAG,
43634 Subtarget);
43635 if (NewShuffle)
43636 return TLO.CombineTo(Op, NewShuffle);
43637 }
43638
43639 return false;
43640}
43641
43642bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
43643 SDValue Op, const APInt &OriginalDemandedBits,
43644 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
43645 unsigned Depth) const {
43646 EVT VT = Op.getValueType();
43647 unsigned BitWidth = OriginalDemandedBits.getBitWidth();
43648 unsigned Opc = Op.getOpcode();
43649 switch(Opc) {
43650 case X86ISD::VTRUNC: {
43651 KnownBits KnownOp;
43652 SDValue Src = Op.getOperand(0);
43653 MVT SrcVT = Src.getSimpleValueType();
43654
43655 // Simplify the input, using demanded bit information.
43656 APInt TruncMask = OriginalDemandedBits.zext(SrcVT.getScalarSizeInBits());
43657 APInt DemandedElts = OriginalDemandedElts.trunc(SrcVT.getVectorNumElements());
43658 if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, KnownOp, TLO, Depth + 1))
43659 return true;
43660 break;
43661 }
43662 case X86ISD::PMULDQ:
43663 case X86ISD::PMULUDQ: {
43664 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
43665 KnownBits KnownLHS, KnownRHS;
43666 SDValue LHS = Op.getOperand(0);
43667 SDValue RHS = Op.getOperand(1);
43668
43669 // Don't mask bits on 32-bit AVX512 targets which might lose a broadcast.
43670 // FIXME: Can we bound this better?
43671 APInt DemandedMask = APInt::getLowBitsSet(64, 32);
43672 APInt DemandedMaskLHS = APInt::getAllOnes(64);
43673 APInt DemandedMaskRHS = APInt::getAllOnes(64);
43674
43675 bool Is32BitAVX512 = !Subtarget.is64Bit() && Subtarget.hasAVX512();
43676 if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(LHS))
43677 DemandedMaskLHS = DemandedMask;
43678 if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(RHS))
43679 DemandedMaskRHS = DemandedMask;
43680
43681 if (SimplifyDemandedBits(LHS, DemandedMaskLHS, OriginalDemandedElts,
43682 KnownLHS, TLO, Depth + 1))
43683 return true;
43684 if (SimplifyDemandedBits(RHS, DemandedMaskRHS, OriginalDemandedElts,
43685 KnownRHS, TLO, Depth + 1))
43686 return true;
43687
43688 // PMULUDQ(X,1) -> AND(X,(1<<32)-1) 'getZeroExtendInReg'.
43689 KnownRHS = KnownRHS.trunc(32);
43690 if (Opc == X86ISD::PMULUDQ && KnownRHS.isConstant() &&
43691 KnownRHS.getConstant().isOne()) {
43692 SDLoc DL(Op);
43693 SDValue Mask = TLO.DAG.getConstant(DemandedMask, DL, VT);
43694 return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::AND, DL, VT, LHS, Mask));
43695 }
43696
43697 // Aggressively peek through ops to get at the demanded low bits.
43698 SDValue DemandedLHS = SimplifyMultipleUseDemandedBits(
43699 LHS, DemandedMaskLHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
43700 SDValue DemandedRHS = SimplifyMultipleUseDemandedBits(
43701 RHS, DemandedMaskRHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
43702 if (DemandedLHS || DemandedRHS) {
43703 DemandedLHS = DemandedLHS ? DemandedLHS : LHS;
43704 DemandedRHS = DemandedRHS ? DemandedRHS : RHS;
43705 return TLO.CombineTo(
43706 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, DemandedLHS, DemandedRHS));
43707 }
43708 break;
43709 }
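// Illustrative sketch, not part of the original source: a scalar model of one
// PMULUDQ lane (64-bit lanes, only the low 32 bits of each input are read):
//   uint64_t PMulUDQLane(uint64_t A, uint64_t B) {
//     return (A & 0xFFFFFFFFull) * (B & 0xFFFFFFFFull);
//   }
// With B == 1 the product is just the masked A, which is why the fold above
// rewrites PMULUDQ(X,1) as AND(X,(1<<32)-1).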
43710 case X86ISD::VSHLI: {
43711 SDValue Op0 = Op.getOperand(0);
43712
43713 unsigned ShAmt = Op.getConstantOperandVal(1);
43714 if (ShAmt >= BitWidth)
43715 break;
43716
43717 APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt);
43718
43719 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
43720 // single shift. We can do this if the bottom bits (which are shifted
43721 // out) are never demanded.
43722 if (Op0.getOpcode() == X86ISD::VSRLI &&
43723 OriginalDemandedBits.countr_zero() >= ShAmt) {
43724 unsigned Shift2Amt = Op0.getConstantOperandVal(1);
43725 if (Shift2Amt < BitWidth) {
43726 int Diff = ShAmt - Shift2Amt;
43727 if (Diff == 0)
43728 return TLO.CombineTo(Op, Op0.getOperand(0));
43729
43730 unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI;
43731 SDValue NewShift = TLO.DAG.getNode(
43732 NewOpc, SDLoc(Op), VT, Op0.getOperand(0),
43733 TLO.DAG.getTargetConstant(std::abs(Diff), SDLoc(Op), MVT::i8));
43734 return TLO.CombineTo(Op, NewShift);
43735 }
43736 }
43737
43738 // If we are only demanding sign bits then we can use the shift source directly.
43739 unsigned NumSignBits =
43740 TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1);
43741 unsigned UpperDemandedBits = BitWidth - OriginalDemandedBits.countr_zero();
43742 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
43743 return TLO.CombineTo(Op, Op0);
43744
43745 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
43746 TLO, Depth + 1))
43747 return true;
43748
43749 assert(!Known.hasConflict() && "Bits known to be one AND zero?");
43750 Known.Zero <<= ShAmt;
43751 Known.One <<= ShAmt;
43752
43753 // Low bits known zero.
43754 Known.Zero.setLowBits(ShAmt);
43755 return false;
43756 }
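// Illustrative sketch, not part of the original source: merging
// ((X >>u C1) << C2) into a single shift only preserves the bits at or above
// bit C2. E.g. with C1 = 3, C2 = 5 the merged form is (X << 2): both forms
// agree on bits 5 and up, but the original has zeros in bits [0,4] while the
// merged form may not, so the fold above requires that the low ShAmt bits are
// not demanded (countr_zero() >= ShAmt).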
43757 case X86ISD::VSRLI: {
43758 unsigned ShAmt = Op.getConstantOperandVal(1);
43759 if (ShAmt >= BitWidth)
43760 break;
43761
43762 APInt DemandedMask = OriginalDemandedBits << ShAmt;
43763
43764 if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask,
43765 OriginalDemandedElts, Known, TLO, Depth + 1))
43766 return true;
43767
43768 assert(!Known.hasConflict() && "Bits known to be one AND zero?");
43769 Known.Zero.lshrInPlace(ShAmt);
43770 Known.One.lshrInPlace(ShAmt);
43771
43772 // High bits known zero.
43773 Known.Zero.setHighBits(ShAmt);
43774 return false;
43775 }
43776 case X86ISD::VSRAI: {
43777 SDValue Op0 = Op.getOperand(0);
43778 SDValue Op1 = Op.getOperand(1);
43779
43780 unsigned ShAmt = cast<ConstantSDNode>(Op1)->getZExtValue();
43781 if (ShAmt >= BitWidth)
43782 break;
43783
43784 APInt DemandedMask = OriginalDemandedBits << ShAmt;
43785
43786 // If we just want the sign bit then we don't need to shift it.
43787 if (OriginalDemandedBits.isSignMask())
43788 return TLO.CombineTo(Op, Op0);
43789
43790 // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
43791 if (Op0.getOpcode() == X86ISD::VSHLI &&
43792 Op.getOperand(1) == Op0.getOperand(1)) {
43793 SDValue Op00 = Op0.getOperand(0);
43794 unsigned NumSignBits =
43795 TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts);
43796 if (ShAmt < NumSignBits)
43797 return TLO.CombineTo(Op, Op00);
43798 }
43799
43800 // If any of the demanded bits are produced by the sign extension, we also
43801 // demand the input sign bit.
43802 if (OriginalDemandedBits.countl_zero() < ShAmt)
43803 DemandedMask.setSignBit();
43804
43805 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
43806 TLO, Depth + 1))
43807 return true;
43808
43809 assert(!Known.hasConflict() && "Bits known to be one AND zero?");
43810 Known.Zero.lshrInPlace(ShAmt);
43811 Known.One.lshrInPlace(ShAmt);
43812
43813 // If the input sign bit is known to be zero, or if none of the top bits
43814 // are demanded, turn this into an unsigned shift right.
43815 if (Known.Zero[BitWidth - ShAmt - 1] ||
43816 OriginalDemandedBits.countl_zero() >= ShAmt)
43817 return TLO.CombineTo(
43818 Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1));
43819
43820 // High bits are known one.
43821 if (Known.One[BitWidth - ShAmt - 1])
43822 Known.One.setHighBits(ShAmt);
43823 return false;
43824 }
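// Illustrative sketch, not part of the original source: once the replicated
// sign bit is known zero, arithmetic and logical right shifts agree. E.g. for
// an 8-bit lane X = 0b0110'1100 and ShAmt = 2:
//   ashr(X, 2) == 0b0001'1011 == lshr(X, 2)
// which is why the case above relaxes VSRAI to VSRLI in that situation.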
43825 case X86ISD::BLENDV: {
43826 SDValue Sel = Op.getOperand(0);
43827 SDValue LHS = Op.getOperand(1);
43828 SDValue RHS = Op.getOperand(2);
43829
43830 APInt SignMask = APInt::getSignMask(BitWidth);
43831 SDValue NewSel = SimplifyMultipleUseDemandedBits(
43832 Sel, SignMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
43833 SDValue NewLHS = SimplifyMultipleUseDemandedBits(
43834 LHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
43835 SDValue NewRHS = SimplifyMultipleUseDemandedBits(
43836 RHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
43837
43838 if (NewSel || NewLHS || NewRHS) {
43839 NewSel = NewSel ? NewSel : Sel;
43840 NewLHS = NewLHS ? NewLHS : LHS;
43841 NewRHS = NewRHS ? NewRHS : RHS;
43842 return TLO.CombineTo(Op, TLO.DAG.getNode(X86ISD::BLENDV, SDLoc(Op), VT,
43843 NewSel, NewLHS, NewRHS));
43844 }
43845 break;
43846 }
43847 case X86ISD::PEXTRB:
43848 case X86ISD::PEXTRW: {
43849 SDValue Vec = Op.getOperand(0);
43850 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
43851 MVT VecVT = Vec.getSimpleValueType();
43852 unsigned NumVecElts = VecVT.getVectorNumElements();
43853
43854 if (CIdx && CIdx->getAPIntValue().ult(NumVecElts)) {
43855 unsigned Idx = CIdx->getZExtValue();
43856 unsigned VecBitWidth = VecVT.getScalarSizeInBits();
43857
43858 // If we demand no bits from the vector then we must have demanded
43859 // bits from the implicit zext - simplify to zero.
43860 APInt DemandedVecBits = OriginalDemandedBits.trunc(VecBitWidth);
43861 if (DemandedVecBits == 0)
43862 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
43863
43864 APInt KnownUndef, KnownZero;
43865 APInt DemandedVecElts = APInt::getOneBitSet(NumVecElts, Idx);
43866 if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef,
43867 KnownZero, TLO, Depth + 1))
43868 return true;
43869
43870 KnownBits KnownVec;
43871 if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts,
43872 KnownVec, TLO, Depth + 1))
43873 return true;
43874
43875 if (SDValue V = SimplifyMultipleUseDemandedBits(
43876 Vec, DemandedVecBits, DemandedVecElts, TLO.DAG, Depth + 1))
43877 return TLO.CombineTo(
43878 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, V, Op.getOperand(1)));
43879
43880 Known = KnownVec.zext(BitWidth);
43881 return false;
43882 }
43883 break;
43884 }
43885 case X86ISD::PINSRB:
43886 case X86ISD::PINSRW: {
43887 SDValue Vec = Op.getOperand(0);
43888 SDValue Scl = Op.getOperand(1);
43889 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
43890 MVT VecVT = Vec.getSimpleValueType();
43891
43892 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) {
43893 unsigned Idx = CIdx->getZExtValue();
43894 if (!OriginalDemandedElts[Idx])
43895 return TLO.CombineTo(Op, Vec);
43896
43897 KnownBits KnownVec;
43898 APInt DemandedVecElts(OriginalDemandedElts);
43899 DemandedVecElts.clearBit(Idx);
43900 if (SimplifyDemandedBits(Vec, OriginalDemandedBits, DemandedVecElts,
43901 KnownVec, TLO, Depth + 1))
43902 return true;
43903
43904 KnownBits KnownScl;
43905 unsigned NumSclBits = Scl.getScalarValueSizeInBits();
43906 APInt DemandedSclBits = OriginalDemandedBits.zext(NumSclBits);
43907 if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1))
43908 return true;
43909
43910 KnownScl = KnownScl.trunc(VecVT.getScalarSizeInBits());
43911 Known = KnownBits::commonBits(KnownVec, KnownScl);
43912 return false;
43913 }
43914 break;
43915 }
43916 case X86ISD::PACKSS:
43917 // PACKSS saturates to MIN/MAX integer values. So if we just want the
43918 // sign bit then we can just ask for the source operands' sign bits.
43919 // TODO - add known bits handling.
43920 if (OriginalDemandedBits.isSignMask()) {
43921 APInt DemandedLHS, DemandedRHS;
43922 getPackDemandedElts(VT, OriginalDemandedElts, DemandedLHS, DemandedRHS);
43923
43924 KnownBits KnownLHS, KnownRHS;
43925 APInt SignMask = APInt::getSignMask(BitWidth * 2);
43926 if (SimplifyDemandedBits(Op.getOperand(0), SignMask, DemandedLHS,
43927 KnownLHS, TLO, Depth + 1))
43928 return true;
43929 if (SimplifyDemandedBits(Op.getOperand(1), SignMask, DemandedRHS,
43930 KnownRHS, TLO, Depth + 1))
43931 return true;
43932
43933 // Attempt to avoid multi-use ops if we don't need anything from them.
43934 SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
43935 Op.getOperand(0), SignMask, DemandedLHS, TLO.DAG, Depth + 1);
43936 SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits(
43937 Op.getOperand(1), SignMask, DemandedRHS, TLO.DAG, Depth + 1);
43938 if (DemandedOp0 || DemandedOp1) {
43939 SDValue Op0 = DemandedOp0 ? DemandedOp0 : Op.getOperand(0);
43940 SDValue Op1 = DemandedOp1 ? DemandedOp1 : Op.getOperand(1);
43941 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, Op0, Op1));
43942 }
43943 }
43944 // TODO - add general PACKSS/PACKUS SimplifyDemandedBits support.
43945 break;
43946 case X86ISD::VBROADCAST: {
43947 SDValue Src = Op.getOperand(0);
43948 MVT SrcVT = Src.getSimpleValueType();
43949 APInt DemandedElts = APInt::getOneBitSet(
43950 SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1, 0);
43951 if (SimplifyDemandedBits(Src, OriginalDemandedBits, DemandedElts, Known,
43952 TLO, Depth + 1))
43953 return true;
43954 // If we don't need the upper bits, attempt to narrow the broadcast source.
43955 // Don't attempt this on AVX512 as it might affect broadcast folding.
43956 // TODO: Should we attempt this for i32/i16 splats? They tend to be slower.
43957 if ((BitWidth == 64) && SrcVT.isScalarInteger() && !Subtarget.hasAVX512() &&
43958 OriginalDemandedBits.countl_zero() >= (BitWidth / 2) &&
43959 Src->hasOneUse()) {
43960 MVT NewSrcVT = MVT::getIntegerVT(BitWidth / 2);
43961 SDValue NewSrc =
43962 TLO.DAG.getNode(ISD::TRUNCATE, SDLoc(Src), NewSrcVT, Src);
43963 MVT NewVT = MVT::getVectorVT(NewSrcVT, VT.getVectorNumElements() * 2);
43964 SDValue NewBcst =
43965 TLO.DAG.getNode(X86ISD::VBROADCAST, SDLoc(Op), NewVT, NewSrc);
43966 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, NewBcst));
43967 }
43968 break;
43969 }
43970 case X86ISD::PCMPGT:
43971 // icmp sgt(0, R) == ashr(R, BitWidth-1).
43972 // iff we only need the sign bit then we can use R directly.
43973 if (OriginalDemandedBits.isSignMask() &&
43974 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
43975 return TLO.CombineTo(Op, Op.getOperand(1));
43976 break;
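// Illustrative sketch, not part of the original source: pcmpgt(0, R) is
// all-ones exactly when a lane of R is negative, i.e. it equals
// ashr(R, BitWidth-1). For an 8-bit lane:
//   R = 0x80 -> pcmpgt(0, R) = 0xFF = ashr(R, 7)
//   R = 0x7F -> pcmpgt(0, R) = 0x00 = ashr(R, 7)
// so if only the sign bit is demanded, R already provides it unchanged.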
43977 case X86ISD::MOVMSK: {
43978 SDValue Src = Op.getOperand(0);
43979 MVT SrcVT = Src.getSimpleValueType();
43980 unsigned SrcBits = SrcVT.getScalarSizeInBits();
43981 unsigned NumElts = SrcVT.getVectorNumElements();
43982
43983 // If we don't need the sign bits at all just return zero.
43984 if (OriginalDemandedBits.countr_zero() >= NumElts)
43985 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
43986
43987 // See if we only demand bits from the lower 128-bit vector.
43988 if (SrcVT.is256BitVector() &&
43989 OriginalDemandedBits.getActiveBits() <= (NumElts / 2)) {
43990 SDValue NewSrc = extract128BitVector(Src, 0, TLO.DAG, SDLoc(Src));
43991 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
43992 }
43993
43994 // Only demand the vector elements of the sign bits we need.
43995 APInt KnownUndef, KnownZero;
43996 APInt DemandedElts = OriginalDemandedBits.zextOrTrunc(NumElts);
43997 if (SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
43998 TLO, Depth + 1))
43999 return true;
44000
44001 Known.Zero = KnownZero.zext(BitWidth);
44002 Known.Zero.setHighBits(BitWidth - NumElts);
44003
44004 // MOVMSK only uses the MSB from each vector element.
44005 KnownBits KnownSrc;
44006 APInt DemandedSrcBits = APInt::getSignMask(SrcBits);
44007 if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, KnownSrc, TLO,
44008 Depth + 1))
44009 return true;
44010
44011 if (KnownSrc.One[SrcBits - 1])
44012 Known.One.setLowBits(NumElts);
44013 else if (KnownSrc.Zero[SrcBits - 1])
44014 Known.Zero.setLowBits(NumElts);
44015
44016 // Attempt to avoid multi-use ops if we don't need anything from them.
44017 if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(
44018 Src, DemandedSrcBits, DemandedElts, TLO.DAG, Depth + 1))
44019 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
44020 return false;
44021 }
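// Illustrative sketch, not part of the original source: MOVMSK collects the
// MSB of each source lane into the low bits of the scalar result, e.g. for a
// v4i32 source whose lane MSBs are [1, 0, 1, 1] the result is 0b1101, with
// all higher result bits zero. Hence only the NumElts low result bits can be
// set and only the lane sign bits are demanded from the source above.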
44022 case X86ISD::TESTP: {
44023 SDValue Op0 = Op.getOperand(0);
44024 SDValue Op1 = Op.getOperand(1);
44025 MVT OpVT = Op0.getSimpleValueType();
44026 assert((OpVT.getVectorElementType() == MVT::f32 ||
44027 OpVT.getVectorElementType() == MVT::f64) &&
44028 "Illegal vector type for X86ISD::TESTP");
44029
44030 // TESTPS/TESTPD only demands the sign bits of ALL the elements.
44031 KnownBits KnownSrc;
44032 APInt SignMask = APInt::getSignMask(OpVT.getScalarSizeInBits());
44033 bool AssumeSingleUse = (Op0 == Op1) && Op->isOnlyUserOf(Op0.getNode());
44034 return SimplifyDemandedBits(Op0, SignMask, KnownSrc, TLO, Depth + 1,
44035 AssumeSingleUse) ||
44036 SimplifyDemandedBits(Op1, SignMask, KnownSrc, TLO, Depth + 1,
44037 AssumeSingleUse);
44038 }
44039 case X86ISD::BEXTR:
44040 case X86ISD::BEXTRI: {
44041 SDValue Op0 = Op.getOperand(0);
44042 SDValue Op1 = Op.getOperand(1);
44043
44044 // Only bottom 16-bits of the control bits are required.
44045 if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
44046 // NOTE: SimplifyDemandedBits won't do this for constants.
44047 uint64_t Val1 = Cst1->getZExtValue();
44048 uint64_t MaskedVal1 = Val1 & 0xFFFF;
44049 if (Opc == X86ISD::BEXTR && MaskedVal1 != Val1) {
44050 SDLoc DL(Op);
44051 return TLO.CombineTo(
44052 Op, TLO.DAG.getNode(X86ISD::BEXTR, DL, VT, Op0,
44053 TLO.DAG.getConstant(MaskedVal1, DL, VT)));
44054 }
44055
44056 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
44057 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
44058
44059 // If the length is 0, the result is 0.
44060 if (Length == 0) {
44061 Known.setAllZero();
44062 return false;
44063 }
44064
44065 if ((Shift + Length) <= BitWidth) {
44066 APInt DemandedMask = APInt::getBitsSet(BitWidth, Shift, Shift + Length);
44067 if (SimplifyDemandedBits(Op0, DemandedMask, Known, TLO, Depth + 1))
44068 return true;
44069
44070 Known = Known.extractBits(Length, Shift);
44071 Known = Known.zextOrTrunc(BitWidth);
44072 return false;
44073 }
44074 } else {
44075 assert(Opc == X86ISD::BEXTR && "Unexpected opcode!");
44076 KnownBits Known1;
44077 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, 16));
44078 if (SimplifyDemandedBits(Op1, DemandedMask, Known1, TLO, Depth + 1))
44079 return true;
44080
44081 // If the length is 0, replace with 0.
44082 KnownBits LengthBits = Known1.extractBits(8, 8);
44083 if (LengthBits.isZero())
44084 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
44085 }
44086
44087 break;
44088 }
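// Illustrative sketch, not part of the original source: the BEXTR control
// operand packs the start bit into bits [7:0] and the length into bits
// [15:8], so only its low 16 bits matter. E.g. control 0x0604 extracts 6 bits
// starting at bit 4:
//   bextr(0x00000FF0, 0x0604) == 0x3F
// which matches the DemandedMask = bits [Shift, Shift + Length) used above.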
44089 case X86ISD::PDEP: {
44090 SDValue Op0 = Op.getOperand(0);
44091 SDValue Op1 = Op.getOperand(1);
44092
44093 unsigned DemandedBitsLZ = OriginalDemandedBits.countl_zero();
44094 APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ);
44095
44096 // If the demanded bits have leading zeroes, we don't demand those from the
44097 // mask.
44098 if (SimplifyDemandedBits(Op1, LoMask, Known, TLO, Depth + 1))
44099 return true;
44100
44101 // The number of possible 1s in the mask determines the number of LSBs of
44102 // operand 0 used. Undemanded bits from the mask don't matter so filter
44103 // them before counting.
44104 KnownBits Known2;
44105 uint64_t Count = (~Known.Zero & LoMask).popcount();
44106 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Count));
44107 if (SimplifyDemandedBits(Op0, DemandedMask, Known2, TLO, Depth + 1))
44108 return true;
44109
44110 // Zeroes are retained from the mask, but not ones.
44111 Known.One.clearAllBits();
44112 // The result will have at least as many trailing zeros as the non-mask
44113 // operand since bits can only map to the same or higher bit position.
44114 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
44115 return false;
44116 }
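// Illustrative sketch, not part of the original source: PDEP scatters the low
// bits of operand 0 to the set bit positions of the mask, e.g.:
//   pdep(src = 0b101, mask = 0b0110'1001) = 0b0010'0001
// Only popcount(mask) low source bits are consumed, and source bit i lands at
// the position of the i-th set mask bit, which is never below i - matching
// the trailing-zero reasoning above.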
44117 }
44118
44119 return TargetLowering::SimplifyDemandedBitsForTargetNode(
44120 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
44121}
44122
44123SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
44124 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
44125 SelectionDAG &DAG, unsigned Depth) const {
44126 int NumElts = DemandedElts.getBitWidth();
44127 unsigned Opc = Op.getOpcode();
44128 EVT VT = Op.getValueType();
44129
44130 switch (Opc) {
44131 case X86ISD::PINSRB:
44132 case X86ISD::PINSRW: {
44133 // If we don't demand the inserted element, return the base vector.
44134 SDValue Vec = Op.getOperand(0);
44135 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
44136 MVT VecVT = Vec.getSimpleValueType();
44137 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) &&
44138 !DemandedElts[CIdx->getZExtValue()])
44139 return Vec;
44140 break;
44141 }
44142 case X86ISD::VSHLI: {
44143 // If we are only demanding sign bits then we can use the shift source
44144 // directly.
44145 SDValue Op0 = Op.getOperand(0);
44146 unsigned ShAmt = Op.getConstantOperandVal(1);
44147 unsigned BitWidth = DemandedBits.getBitWidth();
44148 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1);
44149 unsigned UpperDemandedBits = BitWidth - DemandedBits.countr_zero();
44150 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
44151 return Op0;
44152 break;
44153 }
44154 case X86ISD::VSRAI:
44155 // iff we only need the sign bit then we can use the source directly.
44156 // TODO: generalize where we only demand extended signbits.
44157 if (DemandedBits.isSignMask())
44158 return Op.getOperand(0);
44159 break;
44160 case X86ISD::PCMPGT:
44161 // icmp sgt(0, R) == ashr(R, BitWidth-1).
44162 // iff we only need the sign bit then we can use R directly.
44163 if (DemandedBits.isSignMask() &&
44164 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
44165 return Op.getOperand(1);
44166 break;
44167 case X86ISD::ANDNP: {
44168 // ANDNP = (~LHS & RHS);
44169 SDValue LHS = Op.getOperand(0);
44170 SDValue RHS = Op.getOperand(1);
44171
44172 KnownBits LHSKnown = DAG.computeKnownBits(LHS, DemandedElts, Depth + 1);
44173 KnownBits RHSKnown = DAG.computeKnownBits(RHS, DemandedElts, Depth + 1);
44174
44175 // If all of the demanded bits are known 0 on LHS and known 0 on RHS, then
44176 // the (inverted) LHS bits cannot contribute to the result of the 'andn' in
44177 // this context, so return RHS.
44178 if (DemandedBits.isSubsetOf(RHSKnown.Zero | LHSKnown.Zero))
44179 return RHS;
44180 break;
44181 }
44182 }
44183
44184 APInt ShuffleUndef, ShuffleZero;
44185 SmallVector<int, 16> ShuffleMask;
44186 SmallVector<SDValue, 2> ShuffleOps;
44187 if (getTargetShuffleInputs(Op, DemandedElts, ShuffleOps, ShuffleMask,
44188 ShuffleUndef, ShuffleZero, DAG, Depth, false)) {
44189 // If all the demanded elts are from one operand and are inline,
44190 // then we can use the operand directly.
44191 int NumOps = ShuffleOps.size();
44192 if (ShuffleMask.size() == (unsigned)NumElts &&
44193 llvm::all_of(ShuffleOps, [VT](SDValue V) {
44194 return VT.getSizeInBits() == V.getValueSizeInBits();
44195 })) {
44196
44197 if (DemandedElts.isSubsetOf(ShuffleUndef))
44198 return DAG.getUNDEF(VT);
44199 if (DemandedElts.isSubsetOf(ShuffleUndef | ShuffleZero))
44200 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(Op));
44201
44202 // Bitmask that indicates which ops have only been accessed 'inline'.
44203 APInt IdentityOp = APInt::getAllOnes(NumOps);
44204 for (int i = 0; i != NumElts; ++i) {
44205 int M = ShuffleMask[i];
44206 if (!DemandedElts[i] || ShuffleUndef[i])
44207 continue;
44208 int OpIdx = M / NumElts;
44209 int EltIdx = M % NumElts;
44210 if (M < 0 || EltIdx != i) {
44211 IdentityOp.clearAllBits();
44212 break;
44213 }
44214 IdentityOp &= APInt::getOneBitSet(NumOps, OpIdx);
44215 if (IdentityOp == 0)
44216 break;
44217 }
44218 assert((IdentityOp == 0 || IdentityOp.popcount() == 1) &&
44219 "Multiple identity shuffles detected");
44220
44221 if (IdentityOp != 0)
44222 return DAG.getBitcast(VT, ShuffleOps[IdentityOp.countr_zero()]);
44223 }
44224 }
44225
44226 return TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
44227 Op, DemandedBits, DemandedElts, DAG, Depth);
44228}
44229
44230bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
44231 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
44232 bool PoisonOnly, unsigned Depth) const {
44233 unsigned EltsBits = Op.getScalarValueSizeInBits();
44234 unsigned NumElts = DemandedElts.getBitWidth();
44235
44236 // TODO: Add more target shuffles.
44237 switch (Op.getOpcode()) {
44238 case X86ISD::PSHUFD:
44239 case X86ISD::VPERMILPI: {
44240 SmallVector<int, 8> Mask;
44241 DecodePSHUFMask(NumElts, EltsBits, Op.getConstantOperandVal(1), Mask);
44242
44243 APInt DemandedSrcElts = APInt::getZero(NumElts);
44244 for (unsigned I = 0; I != NumElts; ++I)
44245 if (DemandedElts[I])
44246 DemandedSrcElts.setBit(Mask[I]);
44247
44248 return DAG.isGuaranteedNotToBeUndefOrPoison(
44249 Op.getOperand(0), DemandedSrcElts, PoisonOnly, Depth + 1);
44250 }
44251 }
44252 return TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
44253 Op, DemandedElts, DAG, PoisonOnly, Depth);
44254}
44255
44256bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode(
44257 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
44258 bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
44259
44260 // TODO: Add more target shuffles.
44261 switch (Op.getOpcode()) {
44262 case X86ISD::PSHUFD:
44263 case X86ISD::VPERMILPI:
44264 return false;
44265 }
44266 return TargetLowering::canCreateUndefOrPoisonForTargetNode(
44267 Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
44268}
44269
44270bool X86TargetLowering::isSplatValueForTargetNode(SDValue Op,
44271 const APInt &DemandedElts,
44272 APInt &UndefElts,
44273 const SelectionDAG &DAG,
44274 unsigned Depth) const {
44275 unsigned NumElts = DemandedElts.getBitWidth();
44276 unsigned Opc = Op.getOpcode();
44277
44278 switch (Opc) {
44279 case X86ISD::VBROADCAST:
44280 case X86ISD::VBROADCAST_LOAD:
44281 UndefElts = APInt::getZero(NumElts);
44282 return true;
44283 }
44284
44285 return TargetLowering::isSplatValueForTargetNode(Op, DemandedElts, UndefElts,
44286 DAG, Depth);
44287}
44288
44289// Helper to peek through bitops/trunc/setcc to determine size of source vector.
44290// Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.
44291static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size,
44292 bool AllowTruncate) {
44293 switch (Src.getOpcode()) {
44294 case ISD::TRUNCATE:
44295 if (!AllowTruncate)
44296 return false;
44297 [[fallthrough]];
44298 case ISD::SETCC:
44299 return Src.getOperand(0).getValueSizeInBits() == Size;
44300 case ISD::AND:
44301 case ISD::XOR:
44302 case ISD::OR:
44303 return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate) &&
44304 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate);
44305 case ISD::SELECT:
44306 case ISD::VSELECT:
44307 return Src.getOperand(0).getScalarValueSizeInBits() == 1 &&
44308 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate) &&
44309 checkBitcastSrcVectorSize(Src.getOperand(2), Size, AllowTruncate);
44310 case ISD::BUILD_VECTOR:
44311 return ISD::isBuildVectorAllZeros(Src.getNode()) ||
44312 ISD::isBuildVectorAllOnes(Src.getNode());
44313 }
44314 return false;
44315}
44316
44317// Helper to flip between AND/OR/XOR opcodes and their X86ISD FP equivalents.
44318static unsigned getAltBitOpcode(unsigned Opcode) {
44319 switch(Opcode) {
44320 case ISD::AND: return X86ISD::FAND;
44321 case ISD::OR: return X86ISD::FOR;
44322 case ISD::XOR: return X86ISD::FXOR;
44323 case X86ISD::ANDNP: return X86ISD::FANDN;
44324 }
44325 llvm_unreachable("Unknown bitwise opcode");
44326}
44327
44328// Helper to adjust v4i32 MOVMSK expansion to work with SSE1-only targets.
44329static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src,
44330 const SDLoc &DL) {
44331 EVT SrcVT = Src.getValueType();
44332 if (SrcVT != MVT::v4i1)
44333 return SDValue();
44334
44335 switch (Src.getOpcode()) {
44336 case ISD::SETCC:
44337 if (Src.getOperand(0).getValueType() == MVT::v4i32 &&
44338 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode()) &&
44339 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT) {
44340 SDValue Op0 = Src.getOperand(0);
44341 if (ISD::isNormalLoad(Op0.getNode()))
44342 return DAG.getBitcast(MVT::v4f32, Op0);
44343 if (Op0.getOpcode() == ISD::BITCAST &&
44344 Op0.getOperand(0).getValueType() == MVT::v4f32)
44345 return Op0.getOperand(0);
44346 }
44347 break;
44348 case ISD::AND:
44349 case ISD::XOR:
44350 case ISD::OR: {
44351 SDValue Op0 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(0), DL);
44352 SDValue Op1 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(1), DL);
44353 if (Op0 && Op1)
44354 return DAG.getNode(getAltBitOpcode(Src.getOpcode()), DL, MVT::v4f32, Op0,
44355 Op1);
44356 break;
44357 }
44358 }
44359 return SDValue();
44360}
44361
44362// Helper to push sign extension of vXi1 SETCC result through bitops.
44363static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT,
44364 SDValue Src, const SDLoc &DL) {
44365 switch (Src.getOpcode()) {
44366 case ISD::SETCC:
44367 case ISD::TRUNCATE:
44368 case ISD::BUILD_VECTOR:
44369 return DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
44370 case ISD::AND:
44371 case ISD::XOR:
44372 case ISD::OR:
44373 return DAG.getNode(
44374 Src.getOpcode(), DL, SExtVT,
44375 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(0), DL),
44376 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL));
44377 case ISD::SELECT:
44378 case ISD::VSELECT:
44379 return DAG.getSelect(
44380 DL, SExtVT, Src.getOperand(0),
44381 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL),
44382 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(2), DL));
44383 }
44384 llvm_unreachable("Unexpected node type for vXi1 sign extension");
44385}
44386
44387// Try to match patterns such as
44388// (i16 bitcast (v16i1 x))
44389// ->
44390 // (i16 movmsk (v16i8 sext (v16i1 x)))
44391// before the illegal vector is scalarized on subtargets that don't have legal
44392// vxi1 types.
44393static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
44394 const SDLoc &DL,
44395 const X86Subtarget &Subtarget) {
44396 EVT SrcVT = Src.getValueType();
44397 if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1)
44398 return SDValue();
44399
44400 // Recognize the IR pattern for the movmsk intrinsic under SSE1 before type
44401 // legalization destroys the v4i32 type.
44402 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2()) {
44403 if (SDValue V = adjustBitcastSrcVectorSSE1(DAG, Src, DL)) {
44404 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32,
44405 DAG.getBitcast(MVT::v4f32, V));
44406 return DAG.getZExtOrTrunc(V, DL, VT);
44407 }
44408 }
44409
44410 // If the input is a truncate from v16i8 or v32i8 go ahead and use a
44411 // movmskb even with avx512. This will be better than truncating to vXi1 and
44412 // using a kmov. This can especially help KNL if the input is a v16i8/v32i8
44413 // vpcmpeqb/vpcmpgtb.
44414 bool PreferMovMsk = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
44415 (Src.getOperand(0).getValueType() == MVT::v16i8 ||
44416 Src.getOperand(0).getValueType() == MVT::v32i8 ||
44417 Src.getOperand(0).getValueType() == MVT::v64i8);
44418
44419 // Prefer movmsk for AVX512 for (bitcast (setlt X, 0)) which can be handled
44420 // directly with vpmovmskb/vmovmskps/vmovmskpd.
44421 if (Src.getOpcode() == ISD::SETCC && Src.hasOneUse() &&
44422 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT &&
44423 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode())) {
44424 EVT CmpVT = Src.getOperand(0).getValueType();
44425 EVT EltVT = CmpVT.getVectorElementType();
44426 if (CmpVT.getSizeInBits() <= 256 &&
44427 (EltVT == MVT::i8 || EltVT == MVT::i32 || EltVT == MVT::i64))
44428 PreferMovMsk = true;
44429 }
44430
44431 // With AVX512 vxi1 types are legal and we prefer using k-regs.
44432 // MOVMSK is supported in SSE2 or later.
44433 if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !PreferMovMsk))
44434 return SDValue();
44435
44436 // If the upper ops of a concatenation are undef, then try to bitcast the
44437 // lower op and extend.
44438 SmallVector<SDValue, 4> SubSrcOps;
44439 if (collectConcatOps(Src.getNode(), SubSrcOps, DAG) &&
44440 SubSrcOps.size() >= 2) {
44441 SDValue LowerOp = SubSrcOps[0];
44442 ArrayRef<SDValue> UpperOps(std::next(SubSrcOps.begin()), SubSrcOps.end());
44443 if (LowerOp.getOpcode() == ISD::SETCC &&
44444 all_of(UpperOps, [](SDValue Op) { return Op.isUndef(); })) {
44445 EVT SubVT = VT.getIntegerVT(
44446 *DAG.getContext(), LowerOp.getValueType().getVectorMinNumElements());
44447 if (SDValue V = combineBitcastvxi1(DAG, SubVT, LowerOp, DL, Subtarget)) {
44448 EVT IntVT = VT.getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
44449 return DAG.getBitcast(VT, DAG.getNode(ISD::ANY_EXTEND, DL, IntVT, V));
44450 }
44451 }
44452 }
44453
44454 // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
44455 // v8f64. So all legal 128-bit and 256-bit vectors are covered except for
44456 // v8i16 and v16i16.
44457 // For these two cases, we can shuffle the upper element bytes to a
44458 // consecutive sequence at the start of the vector and treat the results as
44459 // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
44460 // for v16i16 this is not the case, because the shuffle is expensive, so we
44461 // avoid sign-extending to this type entirely.
44462 // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
44463 // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
44464 MVT SExtVT;
44465 bool PropagateSExt = false;
44466 switch (SrcVT.getSimpleVT().SimpleTy) {
44467 default:
44468 return SDValue();
44469 case MVT::v2i1:
44470 SExtVT = MVT::v2i64;
44471 break;
44472 case MVT::v4i1:
44473 SExtVT = MVT::v4i32;
44474 // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
44475 // sign-extend to a 256-bit operation to avoid truncation.
44476 if (Subtarget.hasAVX() &&
44477 checkBitcastSrcVectorSize(Src, 256, Subtarget.hasAVX2())) {
44478 SExtVT = MVT::v4i64;
44479 PropagateSExt = true;
44480 }
44481 break;
44482 case MVT::v8i1:
44483 SExtVT = MVT::v8i16;
44484 // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
44485 // sign-extend to a 256-bit operation to match the compare.
44486 // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
44487 // 256-bit because the shuffle is cheaper than sign extending the result of
44488 // the compare.
44489 if (Subtarget.hasAVX() && (checkBitcastSrcVectorSize(Src, 256, true) ||
44490 checkBitcastSrcVectorSize(Src, 512, true))) {
44491 SExtVT = MVT::v8i32;
44492 PropagateSExt = true;
44493 }
44494 break;
44495 case MVT::v16i1:
44496 SExtVT = MVT::v16i8;
44497 // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
44498 // it is not profitable to sign-extend to 256-bit because this will
44499 // require an extra cross-lane shuffle which is more expensive than
44500 // truncating the result of the compare to 128-bits.
44501 break;
44502 case MVT::v32i1:
44503 SExtVT = MVT::v32i8;
44504 break;
44505 case MVT::v64i1:
44506 // If we have AVX512F but not AVX512BW, and the input is truncated from
44507 // v64i8 (checked earlier), then split the input and make two pmovmskbs.
44508 if (Subtarget.hasAVX512()) {
44509 if (Subtarget.hasBWI())
44510 return SDValue();
44511 SExtVT = MVT::v64i8;
44512 break;
44513 }
44514 // Split if this is a <64 x i8> comparison result.
44515 if (checkBitcastSrcVectorSize(Src, 512, false)) {
44516 SExtVT = MVT::v64i8;
44517 break;
44518 }
44519 return SDValue();
44520 };
44521
44522 SDValue V = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL)
44523 : DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
44524
44525 if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8 || SExtVT == MVT::v64i8) {
44526 V = getPMOVMSKB(DL, V, DAG, Subtarget);
44527 } else {
44528 if (SExtVT == MVT::v8i16)
44529 V = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, V,
44530 DAG.getUNDEF(MVT::v8i16));
44531 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
44532 }
44533
44534 EVT IntVT =
44535 EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements());
44536 V = DAG.getZExtOrTrunc(V, DL, IntVT);
44537 return DAG.getBitcast(VT, V);
44538}
44539
44540// Convert a vXi1 constant build vector to the same width scalar integer.
44541static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
44542 EVT SrcVT = Op.getValueType();
44543 assert(SrcVT.getVectorElementType() == MVT::i1 &&
44544 "Expected a vXi1 vector");
44545 assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
44546 "Expected a constant build vector");
44547
44548 APInt Imm(SrcVT.getVectorNumElements(), 0);
44549 for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) {
44550 SDValue In = Op.getOperand(Idx);
44551 if (!In.isUndef() && (cast<ConstantSDNode>(In)->getZExtValue() & 0x1))
44552 Imm.setBit(Idx);
44553 }
44554 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth());
44555 return DAG.getConstant(Imm, SDLoc(Op), IntVT);
44556}
44557
44558static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
44559 TargetLowering::DAGCombinerInfo &DCI,
44560 const X86Subtarget &Subtarget) {
44561 assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");
44562
44563 if (!DCI.isBeforeLegalizeOps())
44564 return SDValue();
44565
44566 // Only do this if we have k-registers.
44567 if (!Subtarget.hasAVX512())
44568 return SDValue();
44569
44570 EVT DstVT = N->getValueType(0);
44571 SDValue Op = N->getOperand(0);
44572 EVT SrcVT = Op.getValueType();
44573
44574 if (!Op.hasOneUse())
44575 return SDValue();
44576
44577 // Look for logic ops.
44578 if (Op.getOpcode() != ISD::AND &&
44579 Op.getOpcode() != ISD::OR &&
44580 Op.getOpcode() != ISD::XOR)
44581 return SDValue();
44582
44583 // Make sure we have a bitcast between mask registers and a scalar type.
44584 if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
44585 DstVT.isScalarInteger()) &&
44586 !(DstVT.isVector() && DstVT.getVectorElementType() == MVT::i1 &&
44587 SrcVT.isScalarInteger()))
44588 return SDValue();
44589
44590 SDValue LHS = Op.getOperand(0);
44591 SDValue RHS = Op.getOperand(1);
44592
44593 if (LHS.hasOneUse() && LHS.getOpcode() == ISD::BITCAST &&
44594 LHS.getOperand(0).getValueType() == DstVT)
44595 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT, LHS.getOperand(0),
44596 DAG.getBitcast(DstVT, RHS));
44597
44598 if (RHS.hasOneUse() && RHS.getOpcode() == ISD::BITCAST &&
44599 RHS.getOperand(0).getValueType() == DstVT)
44600 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
44601 DAG.getBitcast(DstVT, LHS), RHS.getOperand(0));
44602
44603 // If the RHS is a vXi1 build vector, this is a good reason to flip too.
44604 // Most of these have to move a constant from the scalar domain anyway.
44605 if (ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) {
44606 RHS = combinevXi1ConstantToInteger(RHS, DAG);
44607 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
44608 DAG.getBitcast(DstVT, LHS), RHS);
44609 }
44610
44611 return SDValue();
44612}
44613
44614static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
44615 const X86Subtarget &Subtarget) {
44616 SDLoc DL(BV);
44617 unsigned NumElts = BV->getNumOperands();
44618 SDValue Splat = BV->getSplatValue();
44619
44620 // Build MMX element from integer GPR or SSE float values.
44621 auto CreateMMXElement = [&](SDValue V) {
44622 if (V.isUndef())
44623 return DAG.getUNDEF(MVT::x86mmx);
44624 if (V.getValueType().isFloatingPoint()) {
44625 if (Subtarget.hasSSE1() && !isa<ConstantFPSDNode>(V)) {
44626 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, V);
44627 V = DAG.getBitcast(MVT::v2i64, V);
44628 return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, V);
44629 }
44630 V = DAG.getBitcast(MVT::i32, V);
44631 } else {
44632 V = DAG.getAnyExtOrTrunc(V, DL, MVT::i32);
44633 }
44634 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, V);
44635 };
44636
44637 // Convert build vector ops to MMX data in the bottom elements.
44638 SmallVector<SDValue, 8> Ops;
44639
44640 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44641
44642 // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
44643 if (Splat) {
44644 if (Splat.isUndef())
44645 return DAG.getUNDEF(MVT::x86mmx);
44646
44647 Splat = CreateMMXElement(Splat);
44648
44649 if (Subtarget.hasSSE1()) {
44650 // Unpack v8i8 to splat i8 elements to lowest 16-bits.
44651 if (NumElts == 8)
44652 Splat = DAG.getNode(
44653 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
44654 DAG.getTargetConstant(Intrinsic::x86_mmx_punpcklbw, DL,
44655 TLI.getPointerTy(DAG.getDataLayout())),
44656 Splat, Splat);
44657
44658 // Use PSHUFW to repeat 16-bit elements.
44659 unsigned ShufMask = (NumElts > 2 ? 0 : 0x44);
44660 return DAG.getNode(
44661 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
44662 DAG.getTargetConstant(Intrinsic::x86_sse_pshuf_w, DL,
44663 TLI.getPointerTy(DAG.getDataLayout())),
44664 Splat, DAG.getTargetConstant(ShufMask, DL, MVT::i8));
44665 }
44666 Ops.append(NumElts, Splat);
44667 } else {
44668 for (unsigned i = 0; i != NumElts; ++i)
44669 Ops.push_back(CreateMMXElement(BV->getOperand(i)));
44670 }
44671
44672 // Use tree of PUNPCKLs to build up general MMX vector.
44673 while (Ops.size() > 1) {
44674 unsigned NumOps = Ops.size();
44675 unsigned IntrinOp =
44676 (NumOps == 2 ? Intrinsic::x86_mmx_punpckldq
44677 : (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd
44678 : Intrinsic::x86_mmx_punpcklbw));
44679 SDValue Intrin = DAG.getTargetConstant(
44680 IntrinOp, DL, TLI.getPointerTy(DAG.getDataLayout()));
44681 for (unsigned i = 0; i != NumOps; i += 2)
44682 Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin,
44683 Ops[i], Ops[i + 1]);
44684 Ops.resize(NumOps / 2);
44685 }
44686
44687 return Ops[0];
44688}
44689
44690// Recursive function that attempts to find if a bool vector node was originally
44691// a vector/float/double that got truncated/extended/bitcast to/from a scalar
44692// integer. If so, replace the scalar ops with bool vector equivalents back down
44693// the chain.
44694static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL,
44695 SelectionDAG &DAG,
44696 const X86Subtarget &Subtarget) {
44697 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44698 unsigned Opc = V.getOpcode();
44699 switch (Opc) {
44700 case ISD::BITCAST: {
44701 // Bitcast from a vector/float/double, we can cheaply bitcast to VT.
44702 SDValue Src = V.getOperand(0);
44703 EVT SrcVT = Src.getValueType();
44704 if (SrcVT.isVector() || SrcVT.isFloatingPoint())
44705 return DAG.getBitcast(VT, Src);
44706 break;
44707 }
44708 case ISD::TRUNCATE: {
44709 // If we find a suitable source, a truncated scalar becomes a subvector.
44710 SDValue Src = V.getOperand(0);
44711 EVT NewSrcVT =
44712 EVT::getVectorVT(*DAG.getContext(), MVT::i1, Src.getValueSizeInBits());
44713 if (TLI.isTypeLegal(NewSrcVT))
44714 if (SDValue N0 =
44715 combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
44716 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N0,
44717 DAG.getIntPtrConstant(0, DL));
44718 break;
44719 }
44720 case ISD::ANY_EXTEND:
44721 case ISD::ZERO_EXTEND: {
44722 // If we find a suitable source, an extended scalar becomes a subvector.
44723 SDValue Src = V.getOperand(0);
44724 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
44725 Src.getScalarValueSizeInBits());
44726 if (TLI.isTypeLegal(NewSrcVT))
44727 if (SDValue N0 =
44728 combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
44729 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
44730 Opc == ISD::ANY_EXTEND ? DAG.getUNDEF(VT)
44731 : DAG.getConstant(0, DL, VT),
44732 N0, DAG.getIntPtrConstant(0, DL));
44733 break;
44734 }
44735 case ISD::OR: {
44736 // If we find suitable sources, we can just move an OR to the vector domain.
44737 SDValue Src0 = V.getOperand(0);
44738 SDValue Src1 = V.getOperand(1);
44739 if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
44740 if (SDValue N1 = combineBitcastToBoolVector(VT, Src1, DL, DAG, Subtarget))
44741 return DAG.getNode(Opc, DL, VT, N0, N1);
44742 break;
44743 }
44744 case ISD::SHL: {
44745 // If we find a suitable source, a SHL becomes a KSHIFTL.
44746 SDValue Src0 = V.getOperand(0);
44747 if ((VT == MVT::v8i1 && !Subtarget.hasDQI()) ||
44748 ((VT == MVT::v32i1 || VT == MVT::v64i1) && !Subtarget.hasBWI()))
44749 break;
44750
44751 if (auto *Amt = dyn_cast<ConstantSDNode>(V.getOperand(1)))
44752 if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
44753 return DAG.getNode(
44754 X86ISD::KSHIFTL, DL, VT, N0,
44755 DAG.getTargetConstant(Amt->getZExtValue(), DL, MVT::i8));
44756 break;
44757 }
44758 }
44759 return SDValue();
44760}
44761
44762static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
44763 TargetLowering::DAGCombinerInfo &DCI,
44764 const X86Subtarget &Subtarget) {
44765 SDValue N0 = N->getOperand(0);
44766 EVT VT = N->getValueType(0);
44767 EVT SrcVT = N0.getValueType();
44768 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44769
44770 // Try to match patterns such as
44771 // (i16 bitcast (v16i1 x))
44772 // ->
44773 // (i16 movmsk (16i8 sext (v16i1 x)))
44774 // before the setcc result is scalarized on subtargets that don't have legal
44775 // vxi1 types.
44776 if (DCI.isBeforeLegalize()) {
44777 SDLoc dl(N);
44778 if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget))
44779 return V;
44780
44781 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
44782 // type, widen both sides to avoid a trip through memory.
44783 if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
44784 Subtarget.hasAVX512()) {
44785 N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
44786 N0 = DAG.getBitcast(MVT::v8i1, N0);
44787 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
44788 DAG.getIntPtrConstant(0, dl));
44789 }
44790
44791 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
44792 // type, widen both sides to avoid a trip through memory.
44793 if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
44794 Subtarget.hasAVX512()) {
44795 // Use zeros for the widening if we already have some zeroes. This can
44796 // allow SimplifyDemandedBits to remove scalar ANDs that may be
44797 // downstream of this.
44798 // FIXME: It might make sense to detect a concat_vectors with a mix of
44799 // zeroes and undef and turn it into insert_subvector for i1 vectors as
44800 // a separate combine. What we can't do is canonicalize the operands of
44801 // such a concat or we'll get into a loop with SimplifyDemandedBits.
44802 if (N0.getOpcode() == ISD::CONCAT_VECTORS) {
44803 SDValue LastOp = N0.getOperand(N0.getNumOperands() - 1);
44804 if (ISD::isBuildVectorAllZeros(LastOp.getNode())) {
44805 SrcVT = LastOp.getValueType();
44806 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
44807 SmallVector<SDValue, 4> Ops(N0->op_begin(), N0->op_end());
44808 Ops.resize(NumConcats, DAG.getConstant(0, dl, SrcVT));
44809 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
44810 N0 = DAG.getBitcast(MVT::i8, N0);
44811 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
44812 }
44813 }
44814
44815 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
44816 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
44817 Ops[0] = N0;
44818 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
44819 N0 = DAG.getBitcast(MVT::i8, N0);
44820 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
44821 }
44822 } else {
44823 // If we're bitcasting from iX to vXi1, see if the integer originally
44824 // began as a vXi1 and whether we can remove the bitcast entirely.
44825 if (VT.isVector() && VT.getScalarType() == MVT::i1 &&
44826 SrcVT.isScalarInteger() && TLI.isTypeLegal(VT)) {
44827 if (SDValue V =
44828 combineBitcastToBoolVector(VT, N0, SDLoc(N), DAG, Subtarget))
44829 return V;
44830 }
44831 }
44832
44833 // Look for (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) and
44834 // replace with (i8 (trunc (i16 (bitcast (v16i1 X))))). This can occur
44835 // due to insert_subvector legalization on KNL. By promoting the copy to i16
44836 // we can help with known bits propagation from the vXi1 domain to the
44837 // scalar domain.
44838 if (VT == MVT::i8 && SrcVT == MVT::v8i1 && Subtarget.hasAVX512() &&
44839 !Subtarget.hasDQI() && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
44840 N0.getOperand(0).getValueType() == MVT::v16i1 &&
44841 isNullConstant(N0.getOperand(1)))
44842 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT,
44843 DAG.getBitcast(MVT::i16, N0.getOperand(0)));
44844
44845 // Canonicalize (bitcast (vbroadcast_load)) so that the output of the bitcast
44846 // and the vbroadcast_load are both integer or both fp. In some cases this
44847 // will remove the bitcast entirely.
44848 if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() &&
44849 VT.isFloatingPoint() != SrcVT.isFloatingPoint() && VT.isVector()) {
44850 auto *BCast = cast<MemIntrinsicSDNode>(N0);
44851 unsigned SrcVTSize = SrcVT.getScalarSizeInBits();
44852 unsigned MemSize = BCast->getMemoryVT().getScalarSizeInBits();
44853 // Don't swap i8/i16 since we don't have fp types of that size.
44854 if (MemSize >= 32) {
44855 MVT MemVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(MemSize)
44856 : MVT::getIntegerVT(MemSize);
44857 MVT LoadVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(SrcVTSize)
44858 : MVT::getIntegerVT(SrcVTSize);
44859 LoadVT = MVT::getVectorVT(LoadVT, SrcVT.getVectorNumElements());
44860
44861 SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
44862 SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() };
44863 SDValue ResNode =
44864 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
44865 MemVT, BCast->getMemOperand());
44866 DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1));
44867 return DAG.getBitcast(VT, ResNode);
44868 }
44869 }
44870
44871 // Since MMX types are special and don't usually play with other vector types,
44872 // it's better to handle them early to be sure we emit efficient code by
44873 // avoiding store-load conversions.
44874 if (VT == MVT::x86mmx) {
44875 // Detect MMX constant vectors.
44876 APInt UndefElts;
44877 SmallVector<APInt, 1> EltBits;
44878 if (getTargetConstantBitsFromNode(N0, 64, UndefElts, EltBits)) {
44879 SDLoc DL(N0);
44880 // Handle zero-extension of i32 with MOVD.
44881 if (EltBits[0].countl_zero() >= 32)
44882 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT,
44883 DAG.getConstant(EltBits[0].trunc(32), DL, MVT::i32));
44884 // Else, bitcast to a double.
44885 // TODO - investigate supporting sext 32-bit immediates on x86_64.
44886 APFloat F64(APFloat::IEEEdouble(), EltBits[0]);
44887 return DAG.getBitcast(VT, DAG.getConstantFP(F64, DL, MVT::f64));
44888 }
44889
44890 // Detect bitcasts to x86mmx low word.
44891 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
44892 (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) &&
44893 N0.getOperand(0).getValueType() == SrcVT.getScalarType()) {
44894 bool LowUndef = true, AllUndefOrZero = true;
44895 for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) {
44896 SDValue Op = N0.getOperand(i);
44897 LowUndef &= Op.isUndef() || (i >= e/2);
44898 AllUndefOrZero &= (Op.isUndef() || isNullConstant(Op));
44899 }
44900 if (AllUndefOrZero) {
44901 SDValue N00 = N0.getOperand(0);
44902 SDLoc dl(N00);
44903 N00 = LowUndef ? DAG.getAnyExtOrTrunc(N00, dl, MVT::i32)
44904 : DAG.getZExtOrTrunc(N00, dl, MVT::i32);
44905 return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00);
44906 }
44907 }
44908
44909 // Detect bitcasts of 64-bit build vectors and convert to a
44910 // MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the
44911 // lowest element.
44912 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
44913 (SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
44914 SrcVT == MVT::v8i8))
44915 return createMMXBuildVector(cast<BuildVectorSDNode>(N0), DAG, Subtarget);
44916
44917 // Detect bitcasts from element or subvector extractions to x86mmx.
44918 if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
44919 N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
44920 isNullConstant(N0.getOperand(1))) {
44921 SDValue N00 = N0.getOperand(0);
44922 if (N00.getValueType().is128BitVector())
44923 return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
44924 DAG.getBitcast(MVT::v2i64, N00));
44925 }
44926
44927 // Detect bitcasts from FP_TO_SINT to x86mmx.
44928 if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) {
44929 SDLoc DL(N0);
44930 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
44931 DAG.getUNDEF(MVT::v2i32));
44932 return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
44933 DAG.getBitcast(MVT::v2i64, Res));
44934 }
44935 }
44936
44937 // Try to remove a bitcast of constant vXi1 vector. We have to legalize
44938 // most of these to scalar anyway.
44939 if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
44940 SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
44941 ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
44942 return combinevXi1ConstantToInteger(N0, DAG);
44943 }
44944
44945 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
44946 VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
44947 isa<ConstantSDNode>(N0)) {
44948 auto *C = cast<ConstantSDNode>(N0);
44949 if (C->isAllOnes())
44950 return DAG.getConstant(1, SDLoc(N0), VT);
44951 if (C->isZero())
44952 return DAG.getConstant(0, SDLoc(N0), VT);
44953 }
44954
44955 // Look for MOVMSK that is maybe truncated and then bitcasted to vXi1.
44956 // Turn it into a sign bit compare that produces a k-register. This avoids
44957 // a trip through a GPR.
44958 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
44959 VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
44960 isPowerOf2_32(VT.getVectorNumElements())) {
44961 unsigned NumElts = VT.getVectorNumElements();
44962 SDValue Src = N0;
44963
44964 // Peek through truncate.
44965 if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
44966 Src = N0.getOperand(0);
44967
44968 if (Src.getOpcode() == X86ISD::MOVMSK && Src.hasOneUse()) {
44969 SDValue MovmskIn = Src.getOperand(0);
44970 MVT MovmskVT = MovmskIn.getSimpleValueType();
44971 unsigned MovMskElts = MovmskVT.getVectorNumElements();
44972
44973 // We allow extra bits of the movmsk to be used since they are known zero.
44974 // We can't convert a VPMOVMSKB without avx512bw.
44975 if (MovMskElts <= NumElts &&
44976 (Subtarget.hasBWI() || MovmskVT.getVectorElementType() != MVT::i8)) {
44977 EVT IntVT = EVT(MovmskVT).changeVectorElementTypeToInteger();
44978 MovmskIn = DAG.getBitcast(IntVT, MovmskIn);
44979 SDLoc dl(N);
44980 MVT CmpVT = MVT::getVectorVT(MVT::i1, MovMskElts);
44981 SDValue Cmp = DAG.getSetCC(dl, CmpVT, MovmskIn,
44982 DAG.getConstant(0, dl, IntVT), ISD::SETLT);
44983 if (EVT(CmpVT) == VT)
44984 return Cmp;
44985
44986 // Pad with zeroes up to original VT to replace the zeroes that were
44987 // being used from the MOVMSK.
44988 unsigned NumConcats = NumElts / MovMskElts;
44989 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, CmpVT));
44990 Ops[0] = Cmp;
44991 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Ops);
44992 }
44993 }
44994 }
44995
44996 // Try to remove bitcasts from input and output of mask arithmetic to
44997 // remove GPR<->K-register crossings.
44998 if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
44999 return V;
45000
45001 // Convert a bitcasted integer logic operation that has one bitcasted
45002 // floating-point operand into a floating-point logic operation. This may
45003 // create a load of a constant, but that is cheaper than materializing the
45004 // constant in an integer register and transferring it to an SSE register or
45005 // transferring the SSE operand to integer register and back.
45006 unsigned FPOpcode;
45007 switch (N0.getOpcode()) {
45008 case ISD::AND: FPOpcode = X86ISD::FAND; break;
45009 case ISD::OR: FPOpcode = X86ISD::FOR; break;
45010 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
45011 default: return SDValue();
45012 }
45013
45014 // Check if we have a bitcast from another integer type as well.
45015 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
45016 (Subtarget.hasSSE2() && VT == MVT::f64) ||
45017 (Subtarget.hasFP16() && VT == MVT::f16) ||
45018 (Subtarget.hasSSE2() && VT.isInteger() && VT.isVector() &&
45019 TLI.isTypeLegal(VT))))
45020 return SDValue();
45021
45022 SDValue LogicOp0 = N0.getOperand(0);
45023 SDValue LogicOp1 = N0.getOperand(1);
45024 SDLoc DL0(N0);
45025
45026 // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
45027 if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
45028 LogicOp0.hasOneUse() && LogicOp0.getOperand(0).hasOneUse() &&
45029 LogicOp0.getOperand(0).getValueType() == VT &&
45030 !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
45031 SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
45032 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
45033 return DAG.getNode(Opcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
45034 }
45035 // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
45036 if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
45037 LogicOp1.hasOneUse() && LogicOp1.getOperand(0).hasOneUse() &&
45038 LogicOp1.getOperand(0).getValueType() == VT &&
45039 !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
45040 SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
45041 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
45042 return DAG.getNode(Opcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
45043 }
45044
45045 return SDValue();
45046}
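
The integer-to-FP logic rewrite above keeps bitwise ops in the SSE domain instead of bouncing the value through a GPR. A standalone sketch of the same idea for clearing a sign bit, once through integer bits and once through ANDPS-style FP logic (illustration only, not part of X86ISelLowering.cpp, assuming SSE2 intrinsics; both paths produce the same bits):

#include <emmintrin.h>
#include <cstdint>
#include <cstring>
#include <cstdio>

int main() {
  // Integer AND on the float's bits vs. the FP-domain ANDPS the combine prefers:
  // both clear the sign bit of -3.5f, but the second stays in an XMM register.
  float F = -3.5f;
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof(F));
  Bits &= 0x7FFFFFFFu;                       // integer-domain abs
  float IntDomain;
  std::memcpy(&IntDomain, &Bits, sizeof(F));

  __m128 Mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
  float FpDomain = _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(F), Mask));

  std::printf("%f %f\n", IntDomain, FpDomain); // both 3.5
  return 0;
}
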
45047
45048 // (mul (zext a), (sext b))
45049static bool detectExtMul(SelectionDAG &DAG, const SDValue &Mul, SDValue &Op0,
45050 SDValue &Op1) {
45051 Op0 = Mul.getOperand(0);
45052 Op1 = Mul.getOperand(1);
45053
45054 // Operand 1 should be the sign-extended value.
45055 if (Op0.getOpcode() == ISD::SIGN_EXTEND)
45056 std::swap(Op0, Op1);
45057
45058 auto IsFreeTruncation = [](SDValue &Op) -> bool {
45059 if ((Op.getOpcode() == ISD::ZERO_EXTEND ||
45060 Op.getOpcode() == ISD::SIGN_EXTEND) &&
45061 Op.getOperand(0).getScalarValueSizeInBits() <= 8)
45062 return true;
45063
45064 auto *BV = dyn_cast<BuildVectorSDNode>(Op);
45065 return (BV && BV->isConstant());
45066 };
45067
45068 // (dpbusd (zext a), (sext b)). Since the first operand must be an unsigned
45069 // value, we need to check that Op0 is a zero-extended value. Op1 must be a
45070 // signed value, so we just check its number of significant bits.
45071 if ((IsFreeTruncation(Op0) &&
45072 DAG.computeKnownBits(Op0).countMaxActiveBits() <= 8) &&
45073 (IsFreeTruncation(Op1) && DAG.ComputeMaxSignificantBits(Op1) <= 8))
45074 return true;
45075
45076 return false;
45077}
45078
45079 // Given an ABS node, detect the following pattern:
45080// (ABS (SUB (ZERO_EXTEND a), (ZERO_EXTEND b))).
45081// This is useful as it is the input into a SAD pattern.
45082static bool detectZextAbsDiff(const SDValue &Abs, SDValue &Op0, SDValue &Op1) {
45083 SDValue AbsOp1 = Abs->getOperand(0);
45084 if (AbsOp1.getOpcode() != ISD::SUB)
45085 return false;
45086
45087 Op0 = AbsOp1.getOperand(0);
45088 Op1 = AbsOp1.getOperand(1);
45089
45090 // Check if the operands of the sub are zero-extended from vectors of i8.
45091 if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
45092 Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
45093 Op1.getOpcode() != ISD::ZERO_EXTEND ||
45094 Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
45095 return false;
45096
45097 return true;
45098}
45099
45100static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS,
45101 unsigned &LogBias, const SDLoc &DL,
45102 const X86Subtarget &Subtarget) {
45103 // Extend or truncate to MVT::i8 first.
45104 MVT Vi8VT =
45105 MVT::getVectorVT(MVT::i8, LHS.getValueType().getVectorElementCount());
45106 LHS = DAG.getZExtOrTrunc(LHS, DL, Vi8VT);
45107 RHS = DAG.getSExtOrTrunc(RHS, DL, Vi8VT);
45108
45109 // VPDPBUSD(<16 x i32>C, <16 x i8>A, <16 x i8>B). For each dst element
45110 // C[0] = C[0] + A[0]B[0] + A[1]B[1] + A[2]B[2] + A[3]B[3].
45111 // The src A, B element type is i8, but the dst C element type is i32.
45112 // When we calculate the reduction stages we use the src vector type vXi8,
45113 // so we need a log-bias of 2 to avoid 2 extra stages.
45114 LogBias = 2;
45115
45116 unsigned RegSize = std::max(128u, (unsigned)Vi8VT.getSizeInBits());
45117 if (Subtarget.hasVNNI() && !Subtarget.hasVLX())
45118 RegSize = std::max(512u, RegSize);
45119
45120 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
45121 // fill in the missing vector elements with 0.
45122 unsigned NumConcat = RegSize / Vi8VT.getSizeInBits();
45123 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, Vi8VT));
45124 Ops[0] = LHS;
45125 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
45126 SDValue DpOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
45127 Ops[0] = RHS;
45128 SDValue DpOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
45129
45130 // Actually build the DotProduct, split as 256/512 bits for
45131 // AVXVNNI/AVX512VNNI.
45132 auto DpBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
45133 ArrayRef<SDValue> Ops) {
45134 MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
45135 return DAG.getNode(X86ISD::VPDPBUSD, DL, VT, Ops);
45136 };
45137 MVT DpVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
45138 SDValue Zero = DAG.getConstant(0, DL, DpVT);
45139
45140 return SplitOpsAndApply(DAG, Subtarget, DL, DpVT, {Zero, DpOp0, DpOp1},
45141 DpBuilder, false);
45142}
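
A scalar reference for the VPDPBUSD behaviour described in the comments above (illustration only, not part of X86ISelLowering.cpp, restricted to a single 128-bit group): each i32 accumulator lane absorbs the products of four unsigned bytes with four signed bytes, which is why the reduction starts with a log-bias of 2 (= log2(4)) stages already folded in.

#include <cstdint>
#include <cstdio>

int main() {
  // 16 x u8 and 16 x s8 inputs, 4 x i32 accumulators (one 128-bit "register").
  uint8_t A[16];
  int8_t B[16];
  int32_t C[4] = {0, 0, 0, 0};
  for (int i = 0; i != 16; ++i) { A[i] = uint8_t(i + 1); B[i] = int8_t(i - 8); }

  // C[k] += A[4k+0]*B[4k+0] + ... + A[4k+3]*B[4k+3]  (one vpdpbusd lane).
  for (int k = 0; k != 4; ++k)
    for (int j = 0; j != 4; ++j)
      C[k] += int32_t(A[4 * k + j]) * int32_t(B[4 * k + j]);

  // The full sum now only needs log2(4) = 2 more pairwise additions.
  int32_t Sum = (C[0] + C[1]) + (C[2] + C[3]);
  std::printf("dot product = %d\n", Sum);
  return 0;
}
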
45143
45144// Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
45145// to these zexts.
45146static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
45147 const SDValue &Zext1, const SDLoc &DL,
45148 const X86Subtarget &Subtarget) {
45149 // Find the appropriate width for the PSADBW.
45150 EVT InVT = Zext0.getOperand(0).getValueType();
45151 unsigned RegSize = std::max(128u, (unsigned)InVT.getSizeInBits());
45152
45153 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
45154 // fill in the missing vector elements with 0.
45155 unsigned NumConcat = RegSize / InVT.getSizeInBits();
45156 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
45157 Ops[0] = Zext0.getOperand(0);
45158 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
45159 SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
45160 Ops[0] = Zext1.getOperand(0);
45161 SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
45162
45163 // Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
45164 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
45165 ArrayRef<SDValue> Ops) {
45166 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
45167 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops);
45168 };
45169 MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
45170 return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, { SadOp0, SadOp1 },
45171 PSADBWBuilder);
45172}
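
For reference, PSADBW produces one 64-bit sum of absolute byte differences per 64-bit lane, which is what lets it act as the first step of the wide add reductions below. A standalone check against a scalar loop (illustration only, assuming an x86 compiler where the SSE2 intrinsics are available):

#include <emmintrin.h>
#include <cstdint>
#include <cstdio>

int main() {
  alignas(16) uint8_t A[16], B[16];
  for (int i = 0; i != 16; ++i) { A[i] = uint8_t(3 * i); B[i] = uint8_t(200 - i); }

  // PSADBW: each 64-bit lane holds the sum of |A[i]-B[i]| over its 8 bytes.
  __m128i Sad = _mm_sad_epu8(_mm_load_si128((const __m128i *)A),
                             _mm_load_si128((const __m128i *)B));
  alignas(16) uint64_t Lanes[2];
  _mm_store_si128((__m128i *)Lanes, Sad);

  uint64_t Ref = 0;
  for (int i = 0; i != 16; ++i)
    Ref += (A[i] > B[i]) ? A[i] - B[i] : B[i] - A[i];

  std::printf("psadbw=%llu scalar=%llu\n",
              (unsigned long long)(Lanes[0] + Lanes[1]), (unsigned long long)Ref);
  return 0;
}
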
45173
45174 // Attempt to replace a min/max v8i16/v16i8 horizontal reduction with
45175// PHMINPOSUW.
45176static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG,
45177 const X86Subtarget &Subtarget) {
45178 // Bail without SSE41.
45179 if (!Subtarget.hasSSE41())
45180 return SDValue();
45181
45182 EVT ExtractVT = Extract->getValueType(0);
45183 if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
45184 return SDValue();
45185
45186 // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
45187 ISD::NodeType BinOp;
45188 SDValue Src = DAG.matchBinOpReduction(
45189 Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN}, true);
45190 if (!Src)
45191 return SDValue();
45192
45193 EVT SrcVT = Src.getValueType();
45194 EVT SrcSVT = SrcVT.getScalarType();
45195 if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
45196 return SDValue();
45197
45198 SDLoc DL(Extract);
45199 SDValue MinPos = Src;
45200
45201 // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
45202 while (SrcVT.getSizeInBits() > 128) {
45203 SDValue Lo, Hi;
45204 std::tie(Lo, Hi) = splitVector(MinPos, DAG, DL);
45205 SrcVT = Lo.getValueType();
45206 MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
45207 }
45208 assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||(static_cast <bool> (((SrcVT == MVT::v8i16 && ExtractVT
== MVT::i16) || (SrcVT == MVT::v16i8 && ExtractVT ==
MVT::i8)) && "Unexpected value type") ? void (0) : __assert_fail
("((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) || (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) && \"Unexpected value type\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 45210, __extension__
__PRETTY_FUNCTION__))
45209 (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&(static_cast <bool> (((SrcVT == MVT::v8i16 && ExtractVT
== MVT::i16) || (SrcVT == MVT::v16i8 && ExtractVT ==
MVT::i8)) && "Unexpected value type") ? void (0) : __assert_fail
("((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) || (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) && \"Unexpected value type\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 45210, __extension__
__PRETTY_FUNCTION__))
45210 "Unexpected value type")(static_cast <bool> (((SrcVT == MVT::v8i16 && ExtractVT
== MVT::i16) || (SrcVT == MVT::v16i8 && ExtractVT ==
MVT::i8)) && "Unexpected value type") ? void (0) : __assert_fail
("((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) || (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) && \"Unexpected value type\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 45210, __extension__
__PRETTY_FUNCTION__))
;
45211
45212 // PHMINPOSUW applies to UMIN(v8i16); for SMIN/SMAX/UMAX we must apply a mask
45213 // to flip the value accordingly.
45214 SDValue Mask;
45215 unsigned MaskEltsBits = ExtractVT.getSizeInBits();
45216 if (BinOp == ISD::SMAX)
45217 Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
45218 else if (BinOp == ISD::SMIN)
45219 Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
45220 else if (BinOp == ISD::UMAX)
45221 Mask = DAG.getAllOnesConstant(DL, SrcVT);
45222
45223 if (Mask)
45224 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
45225
45226 // For v16i8 cases we need to perform UMIN on pairs of byte elements,
45227 // shuffling each upper element down and inserting zeros. This means that the
45228 // v16i8 UMIN will leave the upper element as zero, performing zero-extension
45229 // ready for the PHMINPOS.
45230 if (ExtractVT == MVT::i8) {
45231 SDValue Upper = DAG.getVectorShuffle(
45232 SrcVT, DL, MinPos, DAG.getConstant(0, DL, MVT::v16i8),
45233 {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
45234 MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
45235 }
45236
45237 // Perform the PHMINPOS on a v8i16 vector.
45238 MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
45239 MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
45240 MinPos = DAG.getBitcast(SrcVT, MinPos);
45241
45242 if (Mask)
45243 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
45244
45245 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
45246 DAG.getIntPtrConstant(0, DL));
45247}
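
The mask/XOR trick used above can be checked in scalar form: PHMINPOSUW only computes an unsigned minimum, and XOR-ing with the signed-min, signed-max, or all-ones mask maps SMIN, SMAX, and UMAX onto that unsigned minimum and back (illustration only, not part of X86ISelLowering.cpp):

#include <algorithm>
#include <cassert>
#include <cstdint>

int main() {
  int16_t A = -5, B = 7;
  uint16_t UA = uint16_t(A), UB = uint16_t(B);

  // SMIN: flipping the sign bit maps signed order onto unsigned order.
  uint16_t SMin = std::min<uint16_t>(UA ^ 0x8000u, UB ^ 0x8000u) ^ 0x8000u;
  assert(int16_t(SMin) == std::min(A, B));

  // SMAX: XOR with the signed-max mask reverses that mapping.
  uint16_t SMax = std::min<uint16_t>(UA ^ 0x7FFFu, UB ^ 0x7FFFu) ^ 0x7FFFu;
  assert(int16_t(SMax) == std::max(A, B));

  // UMAX: XOR with all-ones reverses the unsigned order.
  uint16_t UMax = std::min<uint16_t>(UA ^ 0xFFFFu, UB ^ 0xFFFFu) ^ 0xFFFFu;
  assert(UMax == std::max(UA, UB));
  return 0;
}
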
45248
45249// Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK.
45250static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG,
45251 const X86Subtarget &Subtarget) {
45252 // Bail without SSE2.
45253 if (!Subtarget.hasSSE2())
45254 return SDValue();
45255
45256 EVT ExtractVT = Extract->getValueType(0);
45257 unsigned BitWidth = ExtractVT.getSizeInBits();
45258 if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
45259 ExtractVT != MVT::i8 && ExtractVT != MVT::i1)
45260 return SDValue();
45261
45262 // Check for OR(any_of)/AND(all_of)/XOR(parity) horizontal reduction patterns.
45263 ISD::NodeType BinOp;
45264 SDValue Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
45265 if (!Match && ExtractVT == MVT::i1)
45266 Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::XOR});
45267 if (!Match)
45268 return SDValue();
45269
45270 // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
45271 // which we can't support here for now.
45272 if (Match.getScalarValueSizeInBits() != BitWidth)
45273 return SDValue();
45274
45275 SDValue Movmsk;
45276 SDLoc DL(Extract);
45277 EVT MatchVT = Match.getValueType();
45278 unsigned NumElts = MatchVT.getVectorNumElements();
45279 unsigned MaxElts = Subtarget.hasInt256() ? 32 : 16;
45280 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45281 LLVMContext &Ctx = *DAG.getContext();
45282
45283 if (ExtractVT == MVT::i1) {
45284 // Special case for (pre-legalization) vXi1 reductions.
45285 if (NumElts > 64 || !isPowerOf2_32(NumElts))
45286 return SDValue();
45287 if (Match.getOpcode() == ISD::SETCC) {
45288 ISD::CondCode CC = cast<CondCodeSDNode>(Match.getOperand(2))->get();
45289 if ((BinOp == ISD::AND && CC == ISD::CondCode::SETEQ) ||
45290 (BinOp == ISD::OR && CC == ISD::CondCode::SETNE)) {
45291 // For all_of(setcc(x,y,eq)) - use (iX)x == (iX)y.
45292 // For any_of(setcc(x,y,ne)) - use (iX)x != (iX)y.
45293 X86::CondCode X86CC;
45294 SDValue LHS = DAG.getFreeze(Match.getOperand(0));
45295 SDValue RHS = DAG.getFreeze(Match.getOperand(1));
45296 APInt Mask = APInt::getAllOnes(LHS.getScalarValueSizeInBits());
45297 if (SDValue V = LowerVectorAllEqual(DL, LHS, RHS, CC, Mask, Subtarget,
45298 DAG, X86CC))
45299 return DAG.getNode(ISD::TRUNCATE, DL, ExtractVT,
45300 getSETCC(X86CC, V, DL, DAG));
45301 }
45302 }
45303 if (TLI.isTypeLegal(MatchVT)) {
45304 // If this is a legal AVX512 predicate type then we can just bitcast.
45305 EVT MovmskVT = EVT::getIntegerVT(Ctx, NumElts);
45306 Movmsk = DAG.getBitcast(MovmskVT, Match);
45307 } else {
45308 // Use combineBitcastvxi1 to create the MOVMSK.
45309 while (NumElts > MaxElts) {
45310 SDValue Lo, Hi;
45311 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
45312 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
45313 NumElts /= 2;
45314 }
45315 EVT MovmskVT = EVT::getIntegerVT(Ctx, NumElts);
45316 Movmsk = combineBitcastvxi1(DAG, MovmskVT, Match, DL, Subtarget);
45317 }
45318 if (!Movmsk)
45319 return SDValue();
45320 Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, NumElts > 32 ? MVT::i64 : MVT::i32);
45321 } else {
45322 // FIXME: Better handling of k-registers or 512-bit vectors?
45323 unsigned MatchSizeInBits = Match.getValueSizeInBits();
45324 if (!(MatchSizeInBits == 128 ||
45325 (MatchSizeInBits == 256 && Subtarget.hasAVX())))
45326 return SDValue();
45327
45328 // Make sure this isn't a vector of 1 element. The perf win from using
45329 // MOVMSK diminishes with fewer elements in the reduction, but it is
45330 // generally better to get the comparison over to the GPRs as soon as
45331 // possible to reduce the number of vector ops.
45332 if (Match.getValueType().getVectorNumElements() < 2)
45333 return SDValue();
45334
45335 // Check that we are extracting a reduction of all sign bits.
45336 if (DAG.ComputeNumSignBits(Match) != BitWidth)
45337 return SDValue();
45338
45339 if (MatchSizeInBits == 256 && BitWidth < 32 && !Subtarget.hasInt256()) {
45340 SDValue Lo, Hi;
45341 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
45342 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
45343 MatchSizeInBits = Match.getValueSizeInBits();
45344 }
45345
45346 // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
45347 MVT MaskSrcVT;
45348 if (64 == BitWidth || 32 == BitWidth)
45349 MaskSrcVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
45350 MatchSizeInBits / BitWidth);
45351 else
45352 MaskSrcVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
45353
45354 SDValue BitcastLogicOp = DAG.getBitcast(MaskSrcVT, Match);
45355 Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget);
45356 NumElts = MaskSrcVT.getVectorNumElements();
45357 }
45358 assert((NumElts <= 32 || NumElts == 64) &&
45359 "Not expecting more than 64 elements");
45360
45361 MVT CmpVT = NumElts == 64 ? MVT::i64 : MVT::i32;
45362 if (BinOp == ISD::XOR) {
45363 // parity -> (PARITY(MOVMSK X))
45364 SDValue Result = DAG.getNode(ISD::PARITY, DL, CmpVT, Movmsk);
45365 return DAG.getZExtOrTrunc(Result, DL, ExtractVT);
45366 }
45367
45368 SDValue CmpC;
45369 ISD::CondCode CondCode;
45370 if (BinOp == ISD::OR) {
45371 // any_of -> MOVMSK != 0
45372 CmpC = DAG.getConstant(0, DL, CmpVT);
45373 CondCode = ISD::CondCode::SETNE;
45374 } else {
45375 // all_of -> MOVMSK == ((1 << NumElts) - 1)
45376 CmpC = DAG.getConstant(APInt::getLowBitsSet(CmpVT.getSizeInBits(), NumElts),
45377 DL, CmpVT);
45378 CondCode = ISD::CondCode::SETEQ;
45379 }
45380
45381 // The setcc produces an i8 of 0/1, so extend that to the result width and
45382 // negate to get the final 0/-1 mask value.
45383 EVT SetccVT = TLI.getSetCCResultType(DAG.getDataLayout(), Ctx, CmpVT);
45384 SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode);
45385 SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT);
45386 SDValue Zero = DAG.getConstant(0, DL, ExtractVT);
45387 return DAG.getNode(ISD::SUB, DL, ExtractVT, Zero, Zext);
45388}
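
A standalone sketch of the MOVMSK-based predicate reductions produced above, for a v4f32 compare (illustration only, assuming SSE intrinsics; __builtin_parity is a GCC/Clang builtin standing in for the PARITY node):

#include <xmmintrin.h>
#include <cstdio>

int main() {
  // A v4f32 compare yields an all-ones/all-zeros element per lane; MOVMSKPS
  // packs the four sign bits into the low bits of a GPR.
  __m128 X = _mm_set_ps(4.0f, -1.0f, 2.0f, 0.5f);
  __m128 Y = _mm_set1_ps(3.0f);
  int Mask = _mm_movemask_ps(_mm_cmplt_ps(X, Y)); // bit i set iff X[i] < Y[i]

  bool AnyOf = Mask != 0;               // OR-reduction of the predicate
  bool AllOf = Mask == 0xF;             // AND-reduction: all four bits set
  bool Parity = __builtin_parity(Mask); // XOR-reduction (GCC/Clang builtin)

  std::printf("mask=0x%x any=%d all=%d parity=%d\n",
              unsigned(Mask), AnyOf, AllOf, Parity);
  return 0;
}
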
45389
45390static SDValue combineVPDPBUSDPattern(SDNode *Extract, SelectionDAG &DAG,
45391 const X86Subtarget &Subtarget) {
45392 if (!Subtarget.hasVNNI() && !Subtarget.hasAVXVNNI())
45393 return SDValue();
45394
45395 EVT ExtractVT = Extract->getValueType(0);
45396 // Verify the type we're extracting is i32, as the output element type of
45397 // vpdpbusd is i32.
45398 if (ExtractVT != MVT::i32)
45399 return SDValue();
45400
45401 EVT VT = Extract->getOperand(0).getValueType();
45402 if (!isPowerOf2_32(VT.getVectorNumElements()))
45403 return SDValue();
45404
45405 // Match shuffle + add pyramid.
45406 ISD::NodeType BinOp;
45407 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
45408
45409 // We can't combine to vpdpbusd for zext, because each of the 4 multiplies
45410 // done by vpdpbusd computes a signed 16-bit product that will be sign extended
45411 // before adding into the accumulator.
45412 // TODO:
45413 // We also need to verify that the multiply has at least 2x the number of bits
45414 // of the input. We shouldn't match
45415 // (sign_extend (mul (vXi9 (zext (vXi8 X))), (vXi9 (zext (vXi8 Y)))).
45416 // if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND))
45417 // Root = Root.getOperand(0);
45418
45419 // If there was a match, we want Root to be a mul.
45420 if (!Root || Root.getOpcode() != ISD::MUL)
45421 return SDValue();
45422
45423 // Check whether we have an extend and mul pattern
45424 SDValue LHS, RHS;
45425 if (!detectExtMul(DAG, Root, LHS, RHS))
45426 return SDValue();
45427
45428 // Create the dot product instruction.
45429 SDLoc DL(Extract);
45430 unsigned StageBias;
45431 SDValue DP = createVPDPBUSD(DAG, LHS, RHS, StageBias, DL, Subtarget);
45432
45433 // If the original vector was wider than 4 elements, sum over the results
45434 // in the DP vector.
45435 unsigned Stages = Log2_32(VT.getVectorNumElements());
45436 EVT DpVT = DP.getValueType();
45437
45438 if (Stages > StageBias) {
45439 unsigned DpElems = DpVT.getVectorNumElements();
45440
45441 for (unsigned i = Stages - StageBias; i > 0; --i) {
45442 SmallVector<int, 16> Mask(DpElems, -1);
45443 for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
45444 Mask[j] = MaskEnd + j;
45445
45446 SDValue Shuffle =
45447 DAG.getVectorShuffle(DpVT, DL, DP, DAG.getUNDEF(DpVT), Mask);
45448 DP = DAG.getNode(ISD::ADD, DL, DpVT, DP, Shuffle);
45449 }
45450 }
45451
45452 // Return the lowest ExtractSizeInBits bits.
45453 EVT ResVT =
45454 EVT::getVectorVT(*DAG.getContext(), ExtractVT,
45455 DpVT.getSizeInBits() / ExtractVT.getSizeInBits());
45456 DP = DAG.getBitcast(ResVT, DP);
45457 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, DP,
45458 Extract->getOperand(1));
45459}
45460
45461static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
45462 const X86Subtarget &Subtarget) {
45463 // PSADBW is only supported on SSE2 and up.
45464 if (!Subtarget.hasSSE2())
1
Taking false branch
45465 return SDValue();
45466
45467 EVT ExtractVT = Extract->getValueType(0);
45468 // Verify the type we're extracting is either i32 or i64.
45469 // FIXME: Could support other types, but this is what we have coverage for.
45470 if (ExtractVT != MVT::i32 && ExtractVT != MVT::i64)
45471 return SDValue();
45472
45473 EVT VT = Extract->getOperand(0).getValueType();
45474 if (!isPowerOf2_32(VT.getVectorNumElements()))
2
Taking false branch
45475 return SDValue();
45476
45477 // Match shuffle + add pyramid.
45478 ISD::NodeType BinOp;
45479 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
45480
45481 // The operand is expected to be zero extended from i8
45482 // (verified in detectZextAbsDiff).
45483 // In order to convert to i64 and above, an additional any/zero/sign
45484 // extend is expected.
45485 // The zero extend from 32 bits has no mathematical effect on the result.
45486 // The sign extend is also effectively a zero extend
45487 // (it extends the sign bit, which is zero).
45488 // So it is correct to skip the sign/zero extend instruction.
45489 if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
3
Assuming the condition is false
45490 Root.getOpcode() == ISD::ZERO_EXTEND ||
4
Assuming the condition is false
45491 Root.getOpcode() == ISD::ANY_EXTEND))
5
Assuming the condition is false
45492 Root = Root.getOperand(0);
45493
45494 // If there was a match, we want Root to be an ABS node that is the root of
45495 // an abs-diff pattern.
45496 if (!Root || Root.getOpcode() != ISD::ABS)
6
Assuming the condition is false
7
Taking false branch
45497 return SDValue();
45498
45499 // Check whether we have an abs-diff pattern feeding into the ABS.
45500 SDValue Zext0, Zext1;
45501 if (!detectZextAbsDiff(Root, Zext0, Zext1))
8
Taking false branch
45502 return SDValue();
45503
45504 // Create the SAD instruction.
45505 SDLoc DL(Extract);
45506 SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL, Subtarget);
45507
45508 // If the original vector was wider than 8 elements, sum over the results
45509 // in the SAD vector.
45510 unsigned Stages = Log2_32(VT.getVectorNumElements());
45511 EVT SadVT = SAD.getValueType();
45512 if (Stages > 3) {
8.1
'Stages' is > 3
9
Taking true branch
45513 unsigned SadElems = SadVT.getVectorNumElements();
45514
45515 for(unsigned i = Stages - 3; i > 0; --i) {
10
Loop condition is true. Entering loop body
45516 SmallVector<int, 16> Mask(SadElems, -1);
45517 for(unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
11
The result of the left shift is undefined due to shifting by '4294967291', which is greater or equal to the width of type 'int'
45518 Mask[j] = MaskEnd + j;
45519
45520 SDValue Shuffle =
45521 DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
45522 SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
45523 }
45524 }
45525
45526 unsigned ExtractSizeInBits = ExtractVT.getSizeInBits();
45527 // Return the lowest ExtractSizeInBits bits.
45528 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), ExtractVT,
45529 SadVT.getSizeInBits() / ExtractSizeInBits);
45530 SAD = DAG.getBitcast(ResVT, SAD);
45531 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, SAD,
45532 Extract->getOperand(1));
45533}
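
On the diagnostic at line 45517 above: Stages comes from Log2_32(VT.getVectorNumElements()), which the analyzer does not bound to 31, so it explores a path where Stages is near UINT_MAX; on that path i - 1 becomes 4294967291 and 1 << (i - 1) shifts a 32-bit int by far more than its width, which is undefined behaviour. Assuming the usual LLVM definitions of isPowerOf2_32 and Log2_32, the earlier power-of-two check keeps the real shift amount below 32, so this looks like an over-approximated path rather than a reachable one. A minimal standalone reproduction of the shift rule itself (illustration only; the guard shown is just one hypothetical way to keep the amount in range, not a proposed fix to the LLVM code):

#include <cstdio>

int main() {
  unsigned i = 4294967292u;            // the value the analyzer assumes for 'i'
  unsigned ShiftAmt = i - 1;           // 4294967291, i.e. >= 32
  // unsigned MaskEnd = 1 << ShiftAmt; // undefined: a 32-bit int shifted by >= 32
  unsigned MaskEnd = 0;
  if (ShiftAmt < 32)                   // hypothetical guard keeping the shift defined
    MaskEnd = 1u << ShiftAmt;
  std::printf("ShiftAmt=%u MaskEnd=%u\n", ShiftAmt, MaskEnd);
  return 0;
}
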
45534
45535// Attempt to peek through a target shuffle and extract the scalar from the
45536// source.
45537static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
45538 TargetLowering::DAGCombinerInfo &DCI,
45539 const X86Subtarget &Subtarget) {
45540 if (DCI.isBeforeLegalizeOps())
45541 return SDValue();
45542
45543 SDLoc dl(N);
45544 SDValue Src = N->getOperand(0);
45545 SDValue Idx = N->getOperand(1);
45546
45547 EVT VT = N->getValueType(0);
45548 EVT SrcVT = Src.getValueType();
45549 EVT SrcSVT = SrcVT.getVectorElementType();
45550 unsigned SrcEltBits = SrcSVT.getSizeInBits();
45551 unsigned NumSrcElts = SrcVT.getVectorNumElements();
45552
45553 // Don't attempt this for boolean mask vectors or unknown extraction indices.
45554 if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
45555 return SDValue();
45556
45557 const APInt &IdxC = N->getConstantOperandAPInt(1);
45558 if (IdxC.uge(NumSrcElts))
45559 return SDValue();
45560
45561 SDValue SrcBC = peekThroughBitcasts(Src);
45562
45563 // Handle extract(bitcast(broadcast(scalar_value))).
45564 if (X86ISD::VBROADCAST == SrcBC.getOpcode()) {
45565 SDValue SrcOp = SrcBC.getOperand(0);
45566 EVT SrcOpVT = SrcOp.getValueType();
45567 if (SrcOpVT.isScalarInteger() && VT.isInteger() &&
45568 (SrcOpVT.getSizeInBits() % SrcEltBits) == 0) {
45569 unsigned Scale = SrcOpVT.getSizeInBits() / SrcEltBits;
45570 unsigned Offset = IdxC.urem(Scale) * SrcEltBits;
45571 // TODO support non-zero offsets.
45572 if (Offset == 0) {
45573 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, SrcVT.getScalarType());
45574 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, VT);
45575 return SrcOp;
45576 }
45577 }
45578 }
45579
45580 // If we're extracting a single element from a broadcast load and there are
45581 // no other users, just create a single load.
45582 if (SrcBC.getOpcode() == X86ISD::VBROADCAST_LOAD && SrcBC.hasOneUse()) {
45583 auto *MemIntr = cast<MemIntrinsicSDNode>(SrcBC);
45584 unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits();
45585 if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth &&
45586 VT.getSizeInBits() == SrcBCWidth && SrcEltBits == SrcBCWidth) {
45587 SDValue Load = DAG.getLoad(VT, dl, MemIntr->getChain(),
45588 MemIntr->getBasePtr(),
45589 MemIntr->getPointerInfo(),
45590 MemIntr->getOriginalAlign(),
45591 MemIntr->getMemOperand()->getFlags());
45592 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
45593 return Load;
45594 }
45595 }
45596
45597 // Handle extract(bitcast(scalar_to_vector(scalar_value))) for integers.
45598 // TODO: Move to DAGCombine?
45599 if (SrcBC.getOpcode() == ISD::SCALAR_TO_VECTOR && VT.isInteger() &&
45600 SrcBC.getValueType().isInteger() &&
45601 (SrcBC.getScalarValueSizeInBits() % SrcEltBits) == 0 &&
45602 SrcBC.getScalarValueSizeInBits() ==
45603 SrcBC.getOperand(0).getValueSizeInBits()) {
45604 unsigned Scale = SrcBC.getScalarValueSizeInBits() / SrcEltBits;
45605 if (IdxC.ult(Scale)) {
45606 unsigned Offset = IdxC.getZExtValue() * SrcVT.getScalarSizeInBits();
45607 SDValue Scl = SrcBC.getOperand(0);
45608 EVT SclVT = Scl.getValueType();
45609 if (Offset) {
45610 Scl = DAG.getNode(ISD::SRL, dl, SclVT, Scl,
45611 DAG.getShiftAmountConstant(Offset, SclVT, dl));
45612 }
45613 Scl = DAG.getZExtOrTrunc(Scl, dl, SrcVT.getScalarType());
45614 Scl = DAG.getZExtOrTrunc(Scl, dl, VT);
45615 return Scl;
45616 }
45617 }
45618
45619 // Handle extract(truncate(x)) for 0'th index.
45620 // TODO: Treat this as a faux shuffle?
45621 // TODO: When can we use this for general indices?
45622 if (ISD::TRUNCATE == Src.getOpcode() && IdxC == 0 &&
45623 (SrcVT.getSizeInBits() % 128) == 0) {
45624 Src = extract128BitVector(Src.getOperand(0), 0, DAG, dl);
45625 MVT ExtractVT = MVT::getVectorVT(SrcSVT.getSimpleVT(), 128 / SrcEltBits);
45626 return DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(ExtractVT, Src),
45627 Idx);
45628 }
45629
45630 // We can only legally extract other elements from 128-bit vectors and in
45631 // certain circumstances, depending on SSE-level.
45632 // TODO: Investigate float/double extraction if it will be just stored.
45633 auto GetLegalExtract = [&Subtarget, &DAG, &dl](SDValue Vec, EVT VecVT,
45634 unsigned Idx) {
45635 EVT VecSVT = VecVT.getScalarType();
45636 if ((VecVT.is256BitVector() || VecVT.is512BitVector()) &&
45637 (VecSVT == MVT::i8 || VecSVT == MVT::i16 || VecSVT == MVT::i32 ||
45638 VecSVT == MVT::i64)) {
45639 unsigned EltSizeInBits = VecSVT.getSizeInBits();
45640 unsigned NumEltsPerLane = 128 / EltSizeInBits;
45641 unsigned LaneOffset = (Idx & ~(NumEltsPerLane - 1)) * EltSizeInBits;
45642 unsigned LaneIdx = LaneOffset / Vec.getScalarValueSizeInBits();
45643 VecVT = EVT::getVectorVT(*DAG.getContext(), VecSVT, NumEltsPerLane);
45644 Vec = extract128BitVector(Vec, LaneIdx, DAG, dl);
45645 Idx &= (NumEltsPerLane - 1);
45646 }
45647 if ((VecVT == MVT::v4i32 || VecVT == MVT::v2i64) &&
45648 ((Idx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
45649 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VecVT.getScalarType(),
45650 DAG.getBitcast(VecVT, Vec),
45651 DAG.getIntPtrConstant(Idx, dl));
45652 }
45653 if ((VecVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
45654 (VecVT == MVT::v16i8 && Subtarget.hasSSE41())) {
45655 unsigned OpCode = (VecVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
45656 return DAG.getNode(OpCode, dl, MVT::i32, DAG.getBitcast(VecVT, Vec),
45657 DAG.getTargetConstant(Idx, dl, MVT::i8));
45658 }
45659 return SDValue();
45660 };
45661
45662 // Resolve the target shuffle inputs and mask.
45663 SmallVector<int, 16> Mask;
45664 SmallVector<SDValue, 2> Ops;
45665 if (!getTargetShuffleInputs(SrcBC, Ops, Mask, DAG))
45666 return SDValue();
45667
45668 // Shuffle inputs must be the same size as the result.
45669 if (llvm::any_of(Ops, [SrcVT](SDValue Op) {
45670 return SrcVT.getSizeInBits() != Op.getValueSizeInBits();
45671 }))
45672 return SDValue();
45673
45674 // Attempt to narrow/widen the shuffle mask to the correct size.
45675 if (Mask.size() != NumSrcElts) {
45676 if ((NumSrcElts % Mask.size()) == 0) {
45677 SmallVector<int, 16> ScaledMask;
45678 int Scale = NumSrcElts / Mask.size();
45679 narrowShuffleMaskElts(Scale, Mask, ScaledMask);
45680 Mask = std::move(ScaledMask);
45681 } else if ((Mask.size() % NumSrcElts) == 0) {
45682 // Simplify Mask based on demanded element.
45683 int ExtractIdx = (int)IdxC.getZExtValue();
45684 int Scale = Mask.size() / NumSrcElts;
45685 int Lo = Scale * ExtractIdx;
45686 int Hi = Scale * (ExtractIdx + 1);
45687 for (int i = 0, e = (int)Mask.size(); i != e; ++i)
45688 if (i < Lo || Hi <= i)
45689 Mask[i] = SM_SentinelUndef;
45690
45691 SmallVector<int, 16> WidenedMask;
45692 while (Mask.size() > NumSrcElts &&
45693 canWidenShuffleElements(Mask, WidenedMask))
45694 Mask = std::move(WidenedMask);
45695 }
45696 }
45697
45698 // If narrowing/widening failed, see if we can extract+zero-extend.
45699 int ExtractIdx;
45700 EVT ExtractVT;
45701 if (Mask.size() == NumSrcElts) {
45702 ExtractIdx = Mask[IdxC.getZExtValue()];
45703 ExtractVT = SrcVT;
45704 } else {
45705 unsigned Scale = Mask.size() / NumSrcElts;
45706 if ((Mask.size() % NumSrcElts) != 0 || SrcVT.isFloatingPoint())
45707 return SDValue();
45708 unsigned ScaledIdx = Scale * IdxC.getZExtValue();
45709 if (!isUndefOrZeroInRange(Mask, ScaledIdx + 1, Scale - 1))
45710 return SDValue();
45711 ExtractIdx = Mask[ScaledIdx];
45712 EVT ExtractSVT = EVT::getIntegerVT(*DAG.getContext(), SrcEltBits / Scale);
45713 ExtractVT = EVT::getVectorVT(*DAG.getContext(), ExtractSVT, Mask.size());
45714 assert(SrcVT.getSizeInBits() == ExtractVT.getSizeInBits() &&
45715 "Failed to widen vector type");
45716 }
45717
45718 // If the shuffle source element is undef/zero then we can just accept it.
45719 if (ExtractIdx == SM_SentinelUndef)
45720 return DAG.getUNDEF(VT);
45721
45722 if (ExtractIdx == SM_SentinelZero)
45723 return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
45724 : DAG.getConstant(0, dl, VT);
45725
45726 SDValue SrcOp = Ops[ExtractIdx / Mask.size()];
45727 ExtractIdx = ExtractIdx % Mask.size();
45728 if (SDValue V = GetLegalExtract(SrcOp, ExtractVT, ExtractIdx))
45729 return DAG.getZExtOrTrunc(V, dl, VT);
45730
45731 return SDValue();
45732}
45733
45734/// Extracting a scalar FP value from vector element 0 is free, so extract each
45735/// operand first, then perform the math as a scalar op.
45736static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG,
45737 const X86Subtarget &Subtarget) {
45738 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract");
45739 SDValue Vec = ExtElt->getOperand(0);
45740 SDValue Index = ExtElt->getOperand(1);
45741 EVT VT = ExtElt->getValueType(0);
45742 EVT VecVT = Vec.getValueType();
45743
45744 // TODO: If this is a unary/expensive/expand op, allow extraction from a
45745 // non-zero element because the shuffle+scalar op will be cheaper?
45746 if (!Vec.hasOneUse() || !isNullConstant(Index) || VecVT.getScalarType() != VT)
45747 return SDValue();
45748
45749 // Vector FP compares don't fit the pattern of FP math ops (propagate, not
45750 // extract, the condition code), so deal with those as a special-case.
45751 if (Vec.getOpcode() == ISD::SETCC && VT == MVT::i1) {
45752 EVT OpVT = Vec.getOperand(0).getValueType().getScalarType();
45753 if (OpVT != MVT::f32 && OpVT != MVT::f64)
45754 return SDValue();
45755
45756 // extract (setcc X, Y, CC), 0 --> setcc (extract X, 0), (extract Y, 0), CC
45757 SDLoc DL(ExtElt);
45758 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
45759 Vec.getOperand(0), Index);
45760 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
45761 Vec.getOperand(1), Index);
45762 return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1, Vec.getOperand(2));
45763 }
45764
45765 if (!(VT == MVT::f16 && Subtarget.hasFP16()) && VT != MVT::f32 &&
45766 VT != MVT::f64)
45767 return SDValue();
45768
45769 // Vector FP selects don't fit the pattern of FP math ops (because the
45770 // condition has a different type and we have to change the opcode), so deal
45771 // with those here.
45772 // FIXME: This is restricted to pre type legalization by ensuring the setcc
45773 // has i1 elements. If we loosen this we need to convert vector bool to a
45774 // scalar bool.
45775 if (Vec.getOpcode() == ISD::VSELECT &&
45776 Vec.getOperand(0).getOpcode() == ISD::SETCC &&
45777 Vec.getOperand(0).getValueType().getScalarType() == MVT::i1 &&
45778 Vec.getOperand(0).getOperand(0).getValueType() == VecVT) {
45779 // ext (sel Cond, X, Y), 0 --> sel (ext Cond, 0), (ext X, 0), (ext Y, 0)
45780 SDLoc DL(ExtElt);
45781 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
45782 Vec.getOperand(0).getValueType().getScalarType(),
45783 Vec.getOperand(0), Index);
45784 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
45785 Vec.getOperand(1), Index);
45786 SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
45787 Vec.getOperand(2), Index);
45788 return DAG.getNode(ISD::SELECT, DL, VT, Ext0, Ext1, Ext2);
45789 }
45790
45791 // TODO: This switch could include FNEG and the x86-specific FP logic ops
45792 // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid
45793 // missed load folding and fma+fneg combining.
45794 switch (Vec.getOpcode()) {
45795 case ISD::FMA: // Begin 3 operands
45796 case ISD::FMAD:
45797 case ISD::FADD: // Begin 2 operands
45798 case ISD::FSUB:
45799 case ISD::FMUL:
45800 case ISD::FDIV:
45801 case ISD::FREM:
45802 case ISD::FCOPYSIGN:
45803 case ISD::FMINNUM:
45804 case ISD::FMAXNUM:
45805 case ISD::FMINNUM_IEEE:
45806 case ISD::FMAXNUM_IEEE:
45807 case ISD::FMAXIMUM:
45808 case ISD::FMINIMUM:
45809 case X86ISD::FMAX:
45810 case X86ISD::FMIN:
45811 case ISD::FABS: // Begin 1 operand
45812 case ISD::FSQRT:
45813 case ISD::FRINT:
45814 case ISD::FCEIL:
45815 case ISD::FTRUNC:
45816 case ISD::FNEARBYINT:
45817 case ISD::FROUND:
45818 case ISD::FFLOOR:
45819 case X86ISD::FRCP:
45820 case X86ISD::FRSQRT: {
45821 // extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ...
45822 SDLoc DL(ExtElt);
45823 SmallVector<SDValue, 4> ExtOps;
45824 for (SDValue Op : Vec->ops())
45825 ExtOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op, Index));
45826 return DAG.getNode(Vec.getOpcode(), DL, VT, ExtOps);
45827 }
45828 default:
45829 return SDValue();
45830 }
45831 llvm_unreachable("All opcodes should return within switch");
45832}
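
The scalarization above relies on element 0 of an SSE register being readable as a scalar for free, so extracting the operands first and doing scalar math costs no more than doing the vector op and extracting afterwards. A standalone illustration for FADD (illustration only, assuming SSE intrinsics):

#include <xmmintrin.h>
#include <cstdio>

int main() {
  __m128 X = _mm_set_ps(8.0f, 7.0f, 6.0f, 1.5f);  // element 0 is 1.5f
  __m128 Y = _mm_set_ps(4.0f, 3.0f, 2.0f, 0.25f); // element 0 is 0.25f

  // extract (fadd X, Y), 0
  float VecThenExtract = _mm_cvtss_f32(_mm_add_ps(X, Y));
  // fadd (extract X, 0), (extract Y, 0) -- the form the combine produces
  float ExtractThenAdd = _mm_cvtss_f32(X) + _mm_cvtss_f32(Y);

  std::printf("%f %f\n", VecThenExtract, ExtractThenAdd); // both 1.75
  return 0;
}
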
45833
45834/// Try to convert a vector reduction sequence composed of binops and shuffles
45835/// into horizontal ops.
45836static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
45837 const X86Subtarget &Subtarget) {
45838 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
45839
45841 // We need at least SSE2 to do anything here.
45841 if (!Subtarget.hasSSE2())
45842 return SDValue();
45843
45844 ISD::NodeType Opc;
45845 SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc,
45846 {ISD::ADD, ISD::MUL, ISD::FADD}, true);
45847 if (!Rdx)
45848 return SDValue();
45849
45850 SDValue Index = ExtElt->getOperand(1);
45851 assert(isNullConstant(Index) &&
45852 "Reduction doesn't end in an extract from index 0");
45853
45854 EVT VT = ExtElt->getValueType(0);
45855 EVT VecVT = Rdx.getValueType();
45856 if (VecVT.getScalarType() != VT)
45857 return SDValue();
45858
45859 SDLoc DL(ExtElt);
45860 unsigned NumElts = VecVT.getVectorNumElements();
45861 unsigned EltSizeInBits = VecVT.getScalarSizeInBits();
45862
45863 // Extend v4i8/v8i8 vector to v16i8, with undef upper 64-bits.
45864 auto WidenToV16I8 = [&](SDValue V, bool ZeroExtend) {
45865 if (V.getValueType() == MVT::v4i8) {
45866 if (ZeroExtend && Subtarget.hasSSE41()) {
45867 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
45868 DAG.getConstant(0, DL, MVT::v4i32),
45869 DAG.getBitcast(MVT::i32, V),
45870 DAG.getIntPtrConstant(0, DL));
45871 return DAG.getBitcast(MVT::v16i8, V);
45872 }
45873 V = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, V,
45874 ZeroExtend ? DAG.getConstant(0, DL, MVT::v4i8)
45875 : DAG.getUNDEF(MVT::v4i8));
45876 }
45877 return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V,
45878 DAG.getUNDEF(MVT::v8i8));
45879 };
45880
45881 // vXi8 mul reduction - promote to vXi16 mul reduction.
45882 if (Opc == ISD::MUL) {
45883 if (VT != MVT::i8 || NumElts < 4 || !isPowerOf2_32(NumElts))
45884 return SDValue();
45885 if (VecVT.getSizeInBits() >= 128) {
45886 EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts / 2);
45887 SDValue Lo = getUnpackl(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
45888 SDValue Hi = getUnpackh(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
45889 Lo = DAG.getBitcast(WideVT, Lo);
45890 Hi = DAG.getBitcast(WideVT, Hi);
45891 Rdx = DAG.getNode(Opc, DL, WideVT, Lo, Hi);
45892 while (Rdx.getValueSizeInBits() > 128) {
45893 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
45894 Rdx = DAG.getNode(Opc, DL, Lo.getValueType(), Lo, Hi);
45895 }
45896 } else {
45897 Rdx = WidenToV16I8(Rdx, false);
45898 Rdx = getUnpackl(DAG, DL, MVT::v16i8, Rdx, DAG.getUNDEF(MVT::v16i8));
45899 Rdx = DAG.getBitcast(MVT::v8i16, Rdx);
45900 }
45901 if (NumElts >= 8)
45902 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
45903 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
45904 {4, 5, 6, 7, -1, -1, -1, -1}));
45905 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
45906 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
45907 {2, 3, -1, -1, -1, -1, -1, -1}));
45908 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
45909 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
45910 {1, -1, -1, -1, -1, -1, -1, -1}));
45911 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
45912 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
45913 }
45914
45915 // vXi8 add reduction - sub-128-bit vector.
45916 if (VecVT == MVT::v4i8 || VecVT == MVT::v8i8) {
45917 Rdx = WidenToV16I8(Rdx, true);
45918 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
45919 DAG.getConstant(0, DL, MVT::v16i8));
45920 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
45921 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
45922 }
45923
45924 // Must be a >=128-bit vector with pow2 elements.
45925 if ((VecVT.getSizeInBits() % 128) != 0 || !isPowerOf2_32(NumElts))
45926 return SDValue();
45927
45928 // vXi8 add reduction - sum lo/hi halves then use PSADBW.
45929 if (VT == MVT::i8) {
45930 while (Rdx.getValueSizeInBits() > 128) {
45931 SDValue Lo, Hi;
45932 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
45933 VecVT = Lo.getValueType();
45934 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
45935 }
45936 assert(VecVT == MVT::v16i8 && "v16i8 reduction expected");
45937
45938 SDValue Hi = DAG.getVectorShuffle(
45939 MVT::v16i8, DL, Rdx, Rdx,
45940 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
45941 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v16i8, Rdx, Hi);
45942 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
45943 getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
45944 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
45945 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
45946 }
45947
45948 // See if we can use vXi8 PSADBW add reduction for larger zext types.
45949 // If the source vector values are 0-255, then we can use PSADBW to
45950 // sum+zext v8i8 subvectors to vXi64, then perform the reduction.
45951 // TODO: See if it's worth avoiding vXi16/i32 truncations?
45952 if (Opc == ISD::ADD && NumElts >= 4 && EltSizeInBits >= 16 &&
45953 DAG.computeKnownBits(Rdx).getMaxValue().ule(255) &&
45954 (EltSizeInBits == 16 || Rdx.getOpcode() == ISD::ZERO_EXTEND ||
45955 Subtarget.hasAVX512())) {
45956 EVT ByteVT = VecVT.changeVectorElementType(MVT::i8);
45957 Rdx = DAG.getNode(ISD::TRUNCATE, DL, ByteVT, Rdx);
45958 if (ByteVT.getSizeInBits() < 128)
45959 Rdx = WidenToV16I8(Rdx, true);
45960
45961 // Build the PSADBW, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
45962 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
45963 ArrayRef<SDValue> Ops) {
45964 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
45965 SDValue Zero = DAG.getConstant(0, DL, Ops[0].getValueType());
45966 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops[0], Zero);
45967 };
45968 MVT SadVT = MVT::getVectorVT(MVT::i64, Rdx.getValueSizeInBits() / 64);
45969 Rdx = SplitOpsAndApply(DAG, Subtarget, DL, SadVT, {Rdx}, PSADBWBuilder);
45970
45971 // TODO: We could truncate to vXi16/vXi32 before performing the reduction.
45972 while (Rdx.getValueSizeInBits() > 128) {
45973 SDValue Lo, Hi;
45974 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
45975 VecVT = Lo.getValueType();
45976 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
45977 }
45978 assert(Rdx.getValueType() == MVT::v2i64 && "v2i64 reduction expected");
45979
45980 if (NumElts > 8) {
45981 SDValue RdxHi = DAG.getVectorShuffle(MVT::v2i64, DL, Rdx, Rdx, {1, -1});
45982 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v2i64, Rdx, RdxHi);
45983 }
45984
45985 VecVT = MVT::getVectorVT(VT.getSimpleVT(), 128 / VT.getSizeInBits());
45986 Rdx = DAG.getBitcast(VecVT, Rdx);
45987 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
45988 }
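Both PSADBW-based reductions above rely on the same fact: a sum of absolute differences against an all-zero vector is simply the sum of the unsigned bytes in each 8-byte group. A minimal standalone sketch of that idea (illustrative C++ only, not part of the LLVM source; the helper name psadbwAgainstZero is made up):

    #include <array>
    #include <cstdint>
    #include <cstdio>

    // Model of PSADBW against zero: each 8-byte group collapses to one u64 sum.
    static std::array<uint64_t, 2> psadbwAgainstZero(const std::array<uint8_t, 16> &V) {
      std::array<uint64_t, 2> Out{0, 0};
      for (int i = 0; i < 16; ++i)
        Out[i / 8] += V[i]; // |v - 0| == v for unsigned bytes
      return Out;
    }

    int main() {
      std::array<uint8_t, 16> V{};
      for (int i = 0; i < 16; ++i)
        V[i] = uint8_t(i + 1);
      auto Sad = psadbwAgainstZero(V);
      // Final reduction step: add the two 64-bit partial sums.
      printf("sum = %llu\n", (unsigned long long)(Sad[0] + Sad[1])); // 136
      return 0;
    }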
45989
45990 // Only use (F)HADD opcodes if they aren't microcoded or we are minimizing codesize.
45991 if (!shouldUseHorizontalOp(true, DAG, Subtarget))
45992 return SDValue();
45993
45994 unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;
45995
45996 // 256-bit horizontal instructions operate on 128-bit chunks rather than
45997 // across the whole vector, so we need an extract + hop preliminary stage.
45998 // This is the only step where the operands of the hop are not the same value.
45999 // TODO: We could extend this to handle 512-bit or even longer vectors.
46000 if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) ||
46001 ((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) {
46002 unsigned NumElts = VecVT.getVectorNumElements();
46003 SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL);
46004 SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);
46005 Rdx = DAG.getNode(HorizOpcode, DL, Lo.getValueType(), Hi, Lo);
46006 VecVT = Rdx.getValueType();
46007 }
46008 if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&
46009 !((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3()))
46010 return SDValue();
46011
46012 // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
46013 unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());
46014 for (unsigned i = 0; i != ReductionSteps; ++i)
46015 Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);
46016
46017 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
46018}
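The (F)HADD path at the end of combineArithReduction works because Log2_32(NumElts) rounds of hadd(X, X) leave the full sum in lane 0. A standalone sketch of that shape, assuming plain C++ (haddSelf is an illustrative name, and this models only the 128-bit HADD form, not the per-128-bit-lane behaviour of the 256-bit instructions):

    #include <cstdio>
    #include <vector>

    // One HADD(x, x) step: lane i of the low half is x[2i] + x[2i+1], and the
    // high half repeats the same values (only lane 0 is needed at the end).
    static std::vector<int> haddSelf(const std::vector<int> &X) {
      std::vector<int> R(X.size());
      for (size_t i = 0; i < X.size() / 2; ++i)
        R[i] = R[i + X.size() / 2] = X[2 * i] + X[2 * i + 1];
      return R;
    }

    int main() {
      std::vector<int> V{1, 2, 3, 4, 5, 6, 7, 8}; // v8i32-style input
      unsigned Steps = 3;                         // Log2_32(8) reduction steps
      for (unsigned i = 0; i != Steps; ++i)
        V = haddSelf(V);
      printf("reduced sum in lane 0: %d\n", V[0]); // 36
      return 0;
    }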
46019
46020/// Detect vector gather/scatter index generation and convert it from being a
46021/// bunch of shuffles and extracts into a somewhat faster sequence.
46022/// For i686, the best sequence is apparently storing the value and loading
46023/// scalars back, while for x64 we should use 64-bit extracts and shifts.
46024static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
46025 TargetLowering::DAGCombinerInfo &DCI,
46026 const X86Subtarget &Subtarget) {
46027 if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
46028 return NewOp;
46029
46030 SDValue InputVector = N->getOperand(0);
46031 SDValue EltIdx = N->getOperand(1);
46032 auto *CIdx = dyn_cast<ConstantSDNode>(EltIdx);
46033
46034 EVT SrcVT = InputVector.getValueType();
46035 EVT VT = N->getValueType(0);
46036 SDLoc dl(InputVector);
46037 bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT;
46038 unsigned NumSrcElts = SrcVT.getVectorNumElements();
46039 unsigned NumEltBits = VT.getScalarSizeInBits();
46040 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46041
46042 if (CIdx && CIdx->getAPIntValue().uge(NumSrcElts))
46043 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
46044
46045 // Integer Constant Folding.
46046 if (CIdx && VT.isInteger()) {
46047 APInt UndefVecElts;
46048 SmallVector<APInt, 16> EltBits;
46049 unsigned VecEltBitWidth = SrcVT.getScalarSizeInBits();
46050 if (getTargetConstantBitsFromNode(InputVector, VecEltBitWidth, UndefVecElts,
46051 EltBits, true, false)) {
46052 uint64_t Idx = CIdx->getZExtValue();
46053 if (UndefVecElts[Idx])
46054 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
46055 return DAG.getConstant(EltBits[Idx].zext(NumEltBits), dl, VT);
46056 }
46057
46058 // Convert extract_element(bitcast(<X x i1>)) -> bitcast(extract_subvector()).
46059 // Improves lowering of bool masks in Rust, which splits them into a byte array.
46060 if (InputVector.getOpcode() == ISD::BITCAST && (NumEltBits % 8) == 0) {
46061 SDValue Src = peekThroughBitcasts(InputVector);
46062 if (Src.getValueType().getScalarType() == MVT::i1 &&
46063 TLI.isTypeLegal(Src.getValueType())) {
46064 MVT SubVT = MVT::getVectorVT(MVT::i1, NumEltBits);
46065 SDValue Sub = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, Src,
46066 DAG.getIntPtrConstant(CIdx->getZExtValue() * NumEltBits, dl));
46067 return DAG.getBitcast(VT, Sub);
46068 }
46069 }
46070 }
46071
46072 if (IsPextr) {
46073 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumEltBits),
46074 DCI))
46075 return SDValue(N, 0);
46076
46077 // PEXTR*(PINSR*(v, s, c), c) -> s (with implicit zext handling).
46078 if ((InputVector.getOpcode() == X86ISD::PINSRB ||
46079 InputVector.getOpcode() == X86ISD::PINSRW) &&
46080 InputVector.getOperand(2) == EltIdx) {
46081 assert(SrcVT == InputVector.getOperand(0).getValueType() &&
46082 "Vector type mismatch");
46083 SDValue Scl = InputVector.getOperand(1);
46084 Scl = DAG.getNode(ISD::TRUNCATE, dl, SrcVT.getScalarType(), Scl);
46085 return DAG.getZExtOrTrunc(Scl, dl, VT);
46086 }
46087
46088 // TODO - Remove this once we can handle the implicit zero-extension of
46089 // X86ISD::PEXTRW/X86ISD::PEXTRB in combinePredicateReduction and
46090 // combineBasicSADPattern.
46091 return SDValue();
46092 }
46093
46094 // Detect mmx extraction of all bits as an i64. It works better as a bitcast.
46095 if (VT == MVT::i64 && SrcVT == MVT::v1i64 &&
46096 InputVector.getOpcode() == ISD::BITCAST &&
46097 InputVector.getOperand(0).getValueType() == MVT::x86mmx &&
46098 isNullConstant(EltIdx) && InputVector.hasOneUse())
46099 return DAG.getBitcast(VT, InputVector);
46100
46101 // Detect mmx to i32 conversion through a v2i32 elt extract.
46102 if (VT == MVT::i32 && SrcVT == MVT::v2i32 &&
46103 InputVector.getOpcode() == ISD::BITCAST &&
46104 InputVector.getOperand(0).getValueType() == MVT::x86mmx &&
46105 isNullConstant(EltIdx) && InputVector.hasOneUse())
46106 return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32,
46107 InputVector.getOperand(0));
46108
46109 // Check whether this extract is the root of a sum of absolute differences
46110 // pattern. This has to be done here because we really want it to happen
46111 // pre-legalization.
46112 if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
46113 return SAD;
46114
46115 if (SDValue VPDPBUSD = combineVPDPBUSDPattern(N, DAG, Subtarget))
46116 return VPDPBUSD;
46117
46118 // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
46119 if (SDValue Cmp = combinePredicateReduction(N, DAG, Subtarget))
46120 return Cmp;
46121
46122 // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
46123 if (SDValue MinMax = combineMinMaxReduction(N, DAG, Subtarget))
46124 return MinMax;
46125
46126 // Attempt to optimize ADD/FADD/MUL reductions with HADD, promotion, etc.
46127 if (SDValue V = combineArithReduction(N, DAG, Subtarget))
46128 return V;
46129
46130 if (SDValue V = scalarizeExtEltFP(N, DAG, Subtarget))
46131 return V;
46132
46133 // Attempt to extract an i1 element by using MOVMSK to extract the sign bits
46134 // and then testing the relevant element.
46135 //
46136 // Note that we only combine extracts on the *same* result number, i.e.
46137 // t0 = merge_values a0, a1, a2, a3
46138 // i1 = extract_vector_elt t0, Constant:i64<2>
46139 // i1 = extract_vector_elt t0, Constant:i64<3>
46140 // but not
46141 // i1 = extract_vector_elt t0:1, Constant:i64<2>
46142 // since the latter would need its own MOVMSK.
46143 if (SrcVT.getScalarType() == MVT::i1) {
46144 bool IsVar = !CIdx;
46145 SmallVector<SDNode *, 16> BoolExtracts;
46146 unsigned ResNo = InputVector.getResNo();
46147 auto IsBoolExtract = [&BoolExtracts, &ResNo, &IsVar](SDNode *Use) {
46148 if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
46149 Use->getOperand(0).getResNo() == ResNo &&
46150 Use->getValueType(0) == MVT::i1) {
46151 BoolExtracts.push_back(Use);
46152 IsVar |= !isa<ConstantSDNode>(Use->getOperand(1));
46153 return true;
46154 }
46155 return false;
46156 };
46157 // TODO: Can we drop the oneuse check for constant extracts?
46158 if (all_of(InputVector->uses(), IsBoolExtract) &&
46159 (IsVar || BoolExtracts.size() > 1)) {
46160 EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts);
46161 if (SDValue BC =
46162 combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) {
46163 for (SDNode *Use : BoolExtracts) {
46164 // extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask
46165 // Mask = 1 << MaskIdx
46166 SDValue MaskIdx = DAG.getZExtOrTrunc(Use->getOperand(1), dl, MVT::i8);
46167 SDValue MaskBit = DAG.getConstant(1, dl, BCVT);
46168 SDValue Mask = DAG.getNode(ISD::SHL, dl, BCVT, MaskBit, MaskIdx);
46169 SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask);
46170 Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ);
46171 DCI.CombineTo(Use, Res);
46172 }
46173 return SDValue(N, 0);
46174 }
46175 }
46176 }
46177
46178 // If this extract is from a loaded vector value and will be used as an
46179 // integer, that requires a potentially expensive XMM -> GPR transfer.
46180 // Additionally, if we can convert to a scalar integer load, that will likely
46181 // be folded into a subsequent integer op.
46182 // Note: Unlike the related fold for this in DAGCombiner, this is not limited
46183 // to a single-use of the loaded vector. For the reasons above, we
46184 // expect this to be profitable even if it creates an extra load.
46185 bool LikelyUsedAsVector = any_of(N->uses(), [](SDNode *Use) {
46186 return Use->getOpcode() == ISD::STORE ||
46187 Use->getOpcode() == ISD::INSERT_VECTOR_ELT ||
46188 Use->getOpcode() == ISD::SCALAR_TO_VECTOR;
46189 });
46190 auto *LoadVec = dyn_cast<LoadSDNode>(InputVector);
46191 if (LoadVec && CIdx && ISD::isNormalLoad(LoadVec) && VT.isInteger() &&
46192 SrcVT.getVectorElementType() == VT && DCI.isAfterLegalizeDAG() &&
46193 !LikelyUsedAsVector && LoadVec->isSimple()) {
46194 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46195 SDValue NewPtr =
46196 TLI.getVectorElementPointer(DAG, LoadVec->getBasePtr(), SrcVT, EltIdx);
46197 unsigned PtrOff = VT.getSizeInBits() * CIdx->getZExtValue() / 8;
46198 MachinePointerInfo MPI = LoadVec->getPointerInfo().getWithOffset(PtrOff);
46199 Align Alignment = commonAlignment(LoadVec->getAlign(), PtrOff);
46200 SDValue Load =
46201 DAG.getLoad(VT, dl, LoadVec->getChain(), NewPtr, MPI, Alignment,
46202 LoadVec->getMemOperand()->getFlags(), LoadVec->getAAInfo());
46203 DAG.makeEquivalentMemoryOrdering(LoadVec, Load);
46204 return Load;
46205 }
46206
46207 return SDValue();
46208}
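The MOVMSK-based i1 extraction in combineExtractVectorElt boils down to packing one sign bit per lane into a scalar mask and then doing a bit test, i.e. ((movmsk X) & (1 << Idx)) == (1 << Idx). A small sketch using the SSE2 intrinsics, under the assumption that the bool vector comes from a signed greater-than compare (extractBoolElt is an illustrative helper):

    #include <emmintrin.h> // SSE2: _mm_movemask_epi8, _mm_cmpgt_epi8, ...
    #include <cstdio>

    // Extract "element Idx" of a v16i8 compare result as a bool via MOVMSK.
    static bool extractBoolElt(__m128i CmpResult, int Idx) {
      int Msk = _mm_movemask_epi8(CmpResult); // one sign bit per byte lane
      int Bit = 1 << Idx;
      return (Msk & Bit) == Bit;
    }

    int main() {
      __m128i A = _mm_setr_epi8(5, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
      __m128i Gt = _mm_cmpgt_epi8(A, _mm_setzero_si128()); // all-ones where A[i] > 0
      printf("%d %d\n", extractBoolElt(Gt, 0), extractBoolElt(Gt, 1)); // 1 0
      return 0;
    }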
46209
46210// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
46211// This is more or less the reverse of combineBitcastvxi1.
46212static SDValue combineToExtendBoolVectorInReg(
46213 unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N0, SelectionDAG &DAG,
46214 TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) {
46215 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
46216 Opcode != ISD::ANY_EXTEND)
46217 return SDValue();
46218 if (!DCI.isBeforeLegalizeOps())
46219 return SDValue();
46220 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
46221 return SDValue();
46222
46223 EVT SVT = VT.getScalarType();
46224 EVT InSVT = N0.getValueType().getScalarType();
46225 unsigned EltSizeInBits = SVT.getSizeInBits();
46226
46227 // Input type must be extending a bool vector (bit-casted from a scalar
46228 // integer) to legal integer types.
46229 if (!VT.isVector())
46230 return SDValue();
46231 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
46232 return SDValue();
46233 if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
46234 return SDValue();
46235
46236 SDValue N00 = N0.getOperand(0);
46237 EVT SclVT = N00.getValueType();
46238 if (!SclVT.isScalarInteger())
46239 return SDValue();
46240
46241 SDValue Vec;
46242 SmallVector<int> ShuffleMask;
46243 unsigned NumElts = VT.getVectorNumElements();
46244 assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");
46245
46246 // Broadcast the scalar integer to the vector elements.
46247 if (NumElts > EltSizeInBits) {
46248 // If the scalar integer is greater than the vector element size, then we
46249 // must split it down into sub-sections for broadcasting. For example:
46250 // i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
46251 // i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
46252 assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
46253 unsigned Scale = NumElts / EltSizeInBits;
46254 EVT BroadcastVT = EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
46255 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
46256 Vec = DAG.getBitcast(VT, Vec);
46257
46258 for (unsigned i = 0; i != Scale; ++i)
46259 ShuffleMask.append(EltSizeInBits, i);
46260 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
46261 } else if (Subtarget.hasAVX2() && NumElts < EltSizeInBits &&
46262 (SclVT == MVT::i8 || SclVT == MVT::i16 || SclVT == MVT::i32)) {
46263 // If we have register broadcast instructions, use the scalar size as the
46264 // element type for the shuffle. Then cast to the wider element type. The
46265 // widened bits won't be used, and this might allow the use of a broadcast
46266 // load.
46267 assert((EltSizeInBits % NumElts) == 0 && "Unexpected integer scale");
46268 unsigned Scale = EltSizeInBits / NumElts;
46269 EVT BroadcastVT =
46270 EVT::getVectorVT(*DAG.getContext(), SclVT, NumElts * Scale);
46271 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
46272 ShuffleMask.append(NumElts * Scale, 0);
46273 Vec = DAG.getVectorShuffle(BroadcastVT, DL, Vec, Vec, ShuffleMask);
46274 Vec = DAG.getBitcast(VT, Vec);
46275 } else {
46276 // For a smaller scalar integer, we can simply any-extend it to the vector
46277 // element size (we don't care about the upper bits) and broadcast it to all
46278 // elements.
46279 SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT);
46280 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
46281 ShuffleMask.append(NumElts, 0);
46282 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
46283 }
46284
46285 // Now, mask the relevant bit in each element.
46286 SmallVector<SDValue, 32> Bits;
46287 for (unsigned i = 0; i != NumElts; ++i) {
46288 int BitIdx = (i % EltSizeInBits);
46289 APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
46290 Bits.push_back(DAG.getConstant(Bit, DL, SVT));
46291 }
46292 SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
46293 Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);
46294
46295 // Compare against the bitmask and extend the result.
46296 EVT CCVT = VT.changeVectorElementType(MVT::i1);
46297 Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
46298 Vec = DAG.getSExtOrTrunc(Vec, DL, VT);
46299
46300 // For SEXT, this is now done, otherwise shift the result down for
46301 // zero-extension.
46302 if (Opcode == ISD::SIGN_EXTEND)
46303 return Vec;
46304 return DAG.getNode(ISD::SRL, DL, VT, Vec,
46305 DAG.getConstant(EltSizeInBits - 1, DL, VT));
46306}
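combineToExtendBoolVectorInReg broadcasts the scalar, ANDs each lane with its own bit, compares for equality, and sign-extends (with a final shift right for zero-extension). A scalar model of that per-lane computation, assuming an 8-bit mask expanded to eight i32 lanes (extendBoolVector is an illustrative name, not part of the LLVM source):

    #include <array>
    #include <cstdint>
    #include <cstdio>

    // Expand an 8-bit mask into 8 x i32 lanes: -1/0 for sign-extend, 1/0 for
    // zero-extend (the shift-right-by-(EltSizeInBits-1) at the end above).
    static std::array<int32_t, 8> extendBoolVector(uint8_t Mask, bool SignExt) {
      std::array<int32_t, 8> Out{};
      for (int i = 0; i < 8; ++i) {
        int32_t Bit = int32_t(1) << i;           // per-lane bit mask
        int32_t Masked = (int32_t)Mask & Bit;    // broadcast + AND
        int32_t Lane = (Masked == Bit) ? -1 : 0; // SETEQ + sign-extend
        Out[i] = SignExt ? Lane : int32_t((uint32_t)Lane >> 31); // SRL for zext
      }
      return Out;
    }

    int main() {
      auto S = extendBoolVector(0b00000101, /*SignExt=*/true);
      auto Z = extendBoolVector(0b00000101, /*SignExt=*/false);
      printf("%d %d %d | %d %d %d\n", S[0], S[1], S[2], Z[0], Z[1], Z[2]); // -1 0 -1 | 1 0 1
      return 0;
    }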
46307
46308/// If a vector select has an operand that is -1 or 0, try to simplify the
46309/// select to a bitwise logic operation.
46310/// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()?
46311static SDValue
46312combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
46313 TargetLowering::DAGCombinerInfo &DCI,
46314 const X86Subtarget &Subtarget) {
46315 SDValue Cond = N->getOperand(0);
46316 SDValue LHS = N->getOperand(1);
46317 SDValue RHS = N->getOperand(2);
46318 EVT VT = LHS.getValueType();
46319 EVT CondVT = Cond.getValueType();
46320 SDLoc DL(N);
46321 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46322
46323 if (N->getOpcode() != ISD::VSELECT)
46324 return SDValue();
46325
46326 assert(CondVT.isVector() && "Vector select expects a vector selector!");
46327
46328 // TODO: Use isNullOrNullSplat() to distinguish constants with undefs?
46329 // TODO: Can we assert that both operands are not zeros (because that should
46330 // get simplified at node creation time)?
46331 bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
46332 bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
46333
46334 // If both inputs are 0/undef, create a complete zero vector.
46335 // FIXME: As noted above this should be handled by DAGCombiner/getNode.
46336 if (TValIsAllZeros && FValIsAllZeros) {
46337 if (VT.isFloatingPoint())
46338 return DAG.getConstantFP(0.0, DL, VT);
46339 return DAG.getConstant(0, DL, VT);
46340 }
46341
46342 // To use the condition operand as a bitwise mask, it must have elements that
46343 // are the same size as the select elements. Ie, the condition operand must
46344 // have already been promoted from the IR select condition type <N x i1>.
46345 // Don't check if the types themselves are equal because that excludes
46346 // vector floating-point selects.
46347 if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
46348 return SDValue();
46349
46350 // Try to invert the condition if true value is not all 1s and false value is
46351 // not all 0s. Only do this if the condition has one use.
46352 bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
46353 if (!TValIsAllOnes && !FValIsAllZeros && Cond.hasOneUse() &&
46354 // Check if the selector will be produced by CMPP*/PCMP*.
46355 Cond.getOpcode() == ISD::SETCC &&
46356 // Check if SETCC has already been promoted.
46357 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
46358 CondVT) {
46359 bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
46360
46361 if (TValIsAllZeros || FValIsAllOnes) {
46362 SDValue CC = Cond.getOperand(2);
46363 ISD::CondCode NewCC = ISD::getSetCCInverse(
46364 cast<CondCodeSDNode>(CC)->get(), Cond.getOperand(0).getValueType());
46365 Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
46366 NewCC);
46367 std::swap(LHS, RHS);
46368 TValIsAllOnes = FValIsAllOnes;
46369 FValIsAllZeros = TValIsAllZeros;
46370 }
46371 }
46372
46373 // Cond value must be 'sign splat' to be converted to a logical op.
46374 if (DAG.ComputeNumSignBits(Cond) != CondVT.getScalarSizeInBits())
46375 return SDValue();
46376
46377 // vselect Cond, 111..., 000... -> Cond
46378 if (TValIsAllOnes && FValIsAllZeros)
46379 return DAG.getBitcast(VT, Cond);
46380
46381 if (!TLI.isTypeLegal(CondVT))
46382 return SDValue();
46383
46384 // vselect Cond, 111..., X -> or Cond, X
46385 if (TValIsAllOnes) {
46386 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
46387 SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
46388 return DAG.getBitcast(VT, Or);
46389 }
46390
46391 // vselect Cond, X, 000... -> and Cond, X
46392 if (FValIsAllZeros) {
46393 SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
46394 SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
46395 return DAG.getBitcast(VT, And);
46396 }
46397
46398 // vselect Cond, 000..., X -> andn Cond, X
46399 if (TValIsAllZeros) {
46400 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
46401 SDValue AndN;
46402 // The canonical form differs for i1 vectors - X86ISD::ANDNP is not used.
46403 if (CondVT.getScalarType() == MVT::i1)
46404 AndN = DAG.getNode(ISD::AND, DL, CondVT, DAG.getNOT(DL, Cond, CondVT),
46405 CastRHS);
46406 else
46407 AndN = DAG.getNode(X86ISD::ANDNP, DL, CondVT, Cond, CastRHS);
46408 return DAG.getBitcast(VT, AndN);
46409 }
46410
46411 return SDValue();
46412}
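When the condition lanes are known to be 0 or all-ones, the three special selects handled above reduce to single bitwise ops. A per-lane sketch in plain C++ (the helper names are illustrative only):

    #include <cstdint>
    #include <cstdio>

    // With a lane-wide (0 or all-ones) condition, each special select collapses
    // to one bitwise op, mirroring the combine above.
    static uint32_t selOnesX(uint32_t C, uint32_t X) { return C | X; }  // select C, -1, X
    static uint32_t selXZero(uint32_t C, uint32_t X) { return C & X; }  // select C, X, 0
    static uint32_t selZeroX(uint32_t C, uint32_t X) { return ~C & X; } // select C, 0, X

    int main() {
      for (uint32_t C : {0u, ~0u}) {
        uint32_t X = 0xDEADBEEF;
        uint32_t Ref1 = C ? ~0u : X, Ref2 = C ? X : 0u, Ref3 = C ? 0u : X;
        printf("%d %d %d\n", selOnesX(C, X) == Ref1, selXZero(C, X) == Ref2,
               selZeroX(C, X) == Ref3); // 1 1 1 for both condition values
      }
      return 0;
    }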
46413
46414/// If both arms of a vector select are concatenated vectors, split the select,
46415/// and concatenate the result to eliminate a wide (256-bit) vector instruction:
46416/// vselect Cond, (concat T0, T1), (concat F0, F1) -->
46417/// concat (vselect (split Cond), T0, F0), (vselect (split Cond), T1, F1)
46418static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG,
46419 const X86Subtarget &Subtarget) {
46420 unsigned Opcode = N->getOpcode();
46421 if (Opcode != X86ISD::BLENDV && Opcode != ISD::VSELECT)
46422 return SDValue();
46423
46424 // TODO: Split 512-bit vectors too?
46425 EVT VT = N->getValueType(0);
46426 if (!VT.is256BitVector())
46427 return SDValue();
46428
46429 // TODO: Split as long as any 2 of the 3 operands are concatenated?
46430 SDValue Cond = N->getOperand(0);
46431 SDValue TVal = N->getOperand(1);
46432 SDValue FVal = N->getOperand(2);
46433 SmallVector<SDValue, 4> CatOpsT, CatOpsF;
46434 if (!TVal.hasOneUse() || !FVal.hasOneUse() ||
46435 !collectConcatOps(TVal.getNode(), CatOpsT, DAG) ||
46436 !collectConcatOps(FVal.getNode(), CatOpsF, DAG))
46437 return SDValue();
46438
46439 auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL,
46440 ArrayRef<SDValue> Ops) {
46441 return DAG.getNode(Opcode, DL, Ops[1].getValueType(), Ops);
46442 };
46443 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { Cond, TVal, FVal },
46444 makeBlend, /*CheckBWI*/ false);
46445}
46446
46447static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
46448 SDValue Cond = N->getOperand(0);
46449 SDValue LHS = N->getOperand(1);
46450 SDValue RHS = N->getOperand(2);
46451 SDLoc DL(N);
46452
46453 auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
46454 auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
46455 if (!TrueC || !FalseC)
46456 return SDValue();
46457
46458 // Don't do this for crazy integer types.
46459 EVT VT = N->getValueType(0);
46460 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
46461 return SDValue();
46462
46463 // We're going to use the condition bit in math or logic ops. We could allow
46464 // this with a wider condition value (post-legalization it becomes an i8),
46465 // but if nothing is creating selects that late, it doesn't matter.
46466 if (Cond.getValueType() != MVT::i1)
46467 return SDValue();
46468
46469 // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
46470 // 3, 5, or 9 with i32/i64, so those get transformed too.
46471 // TODO: For constants that overflow or do not differ by power-of-2 or small
46472 // multiplier, convert to 'and' + 'add'.
46473 const APInt &TrueVal = TrueC->getAPIntValue();
46474 const APInt &FalseVal = FalseC->getAPIntValue();
46475
46476 // We have a more efficient lowering for "(X == 0) ? Y : -1" using SBB.
46477 if ((TrueVal.isAllOnes() || FalseVal.isAllOnes()) &&
46478 Cond.getOpcode() == ISD::SETCC && isNullConstant(Cond.getOperand(1))) {
46479 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
46480 if (CC == ISD::SETEQ || CC == ISD::SETNE)
46481 return SDValue();
46482 }
46483
46484 bool OV;
46485 APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
46486 if (OV)
46487 return SDValue();
46488
46489 APInt AbsDiff = Diff.abs();
46490 if (AbsDiff.isPowerOf2() ||
46491 ((VT == MVT::i32 || VT == MVT::i64) &&
46492 (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {
46493
46494 // We need a positive multiplier constant for shift/LEA codegen. The 'not'
46495 // of the condition can usually be folded into a compare predicate, but even
46496 // without that, the sequence should be cheaper than a CMOV alternative.
46497 if (TrueVal.slt(FalseVal)) {
46498 Cond = DAG.getNOT(DL, Cond, MVT::i1);
46499 std::swap(TrueC, FalseC);
46500 }
46501
46502 // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
46503 SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
46504
46505 // Multiply condition by the difference if non-one.
46506 if (!AbsDiff.isOne())
46507 R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));
46508
46509 // Add the base if non-zero.
46510 if (!FalseC->isZero())
46511 R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));
46512
46513 return R;
46514 }
46515
46516 return SDValue();
46517}
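The arithmetic identity behind combineSelectOfTwoConstants is select Cond, TC, FC == zext(Cond) * (TC - FC) + FC, which then lowers to a shift or LEA when the difference is a power of two or 3/5/9. A minimal check of the identity, assuming no overflow (selectViaMath is an illustrative name):

    #include <cassert>
    #include <cstdint>

    // select Cond, TC, FC --> zext(Cond) * (TC - FC) + FC, with Cond in {0, 1}.
    static int64_t selectViaMath(bool Cond, int64_t TC, int64_t FC) {
      return int64_t(Cond) * (TC - FC) + FC;
    }

    int main() {
      for (bool C : {false, true}) {
        assert(selectViaMath(C, 7, 3) == (C ? 7 : 3));   // diff 4: a shift
        assert(selectViaMath(C, 12, 3) == (C ? 12 : 3)); // diff 9: an LEA
      }
      return 0;
    }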
46518
46519/// If this is a *dynamic* select (non-constant condition) and we can match
46520/// this node with one of the variable blend instructions, restructure the
46521/// condition so that blends can use the high (sign) bit of each element.
46522/// This function will also call SimplifyDemandedBits on already created
46523/// BLENDV to perform additional simplifications.
46524static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
46525 TargetLowering::DAGCombinerInfo &DCI,
46526 const X86Subtarget &Subtarget) {
46527 SDValue Cond = N->getOperand(0);
46528 if ((N->getOpcode() != ISD::VSELECT &&
46529 N->getOpcode() != X86ISD::BLENDV) ||
46530 ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
46531 return SDValue();
46532
46533 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46534 unsigned BitWidth = Cond.getScalarValueSizeInBits();
46535 EVT VT = N->getValueType(0);
46536
46537 // We can only handle the cases where VSELECT is directly legal on the
46538 // subtarget. We custom lower VSELECT nodes with constant conditions and
46539 // this makes it hard to see whether a dynamic VSELECT will correctly
46540 // lower, so we both check the operation's status and explicitly handle the
46541 // cases where a *dynamic* blend will fail even though a constant-condition
46542 // blend could be custom lowered.
46543 // FIXME: We should find a better way to handle this class of problems.
46544 // Potentially, we should combine constant-condition vselect nodes
46545 // pre-legalization into shuffles and not mark as many types as custom
46546 // lowered.
46547 if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
46548 return SDValue();
46549 // FIXME: We don't support i16-element blends currently. We could and
46550 // should support them by making *all* the bits in the condition be set
46551 // rather than just the high bit and using an i8-element blend.
46552 if (VT.getVectorElementType() == MVT::i16)
46553 return SDValue();
46554 // Dynamic blending was only available from SSE4.1 onward.
46555 if (VT.is128BitVector() && !Subtarget.hasSSE41())
46556 return SDValue();
46557 // Byte blends are only available in AVX2
46558 if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
46559 return SDValue();
46560 // There are no 512-bit blend instructions that use sign bits.
46561 if (VT.is512BitVector())
46562 return SDValue();
46563
46564 // Don't optimize before the condition has been transformed to a legal type
46565 // and don't ever optimize vector selects that map to AVX512 mask-registers.
46566 if (BitWidth < 8 || BitWidth > 64)
46567 return SDValue();
46568
46569 auto OnlyUsedAsSelectCond = [](SDValue Cond) {
46570 for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
46571 UI != UE; ++UI)
46572 if ((UI->getOpcode() != ISD::VSELECT &&
46573 UI->getOpcode() != X86ISD::BLENDV) ||
46574 UI.getOperandNo() != 0)
46575 return false;
46576
46577 return true;
46578 };
46579
46580 APInt DemandedBits(APInt::getSignMask(BitWidth));
46581
46582 if (OnlyUsedAsSelectCond(Cond)) {
46583 KnownBits Known;
46584 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
46585 !DCI.isBeforeLegalizeOps());
46586 if (!TLI.SimplifyDemandedBits(Cond, DemandedBits, Known, TLO, 0, true))
46587 return SDValue();
46588
46589 // If we changed the computation somewhere in the DAG, this change will
46590 // affect all users of Cond. Update all the nodes so that we do not use
46591 // the generic VSELECT anymore. Otherwise, we may perform wrong
46592 // optimizations as we messed with the actual expectation for the vector
46593 // boolean values.
46594 for (SDNode *U : Cond->uses()) {
46595 if (U->getOpcode() == X86ISD::BLENDV)
46596 continue;
46597
46598 SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0),
46599 Cond, U->getOperand(1), U->getOperand(2));
46600 DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
46601 DCI.AddToWorklist(U);
46602 }
46603 DCI.CommitTargetLoweringOpt(TLO);
46604 return SDValue(N, 0);
46605 }
46606
46607 // Otherwise we can still at least try to simplify multiple use bits.
46608 if (SDValue V = TLI.SimplifyMultipleUseDemandedBits(Cond, DemandedBits, DAG))
46609 return DAG.getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0), V,
46610 N->getOperand(1), N->getOperand(2));
46611
46612 return SDValue();
46613}
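combineVSelectToBLENDV can restrict SimplifyDemandedBits to the sign mask because a variable blend reads nothing but the sign bit of each condition lane. A per-byte model of PBLENDVB-style selection (blendvLane is an illustrative helper; the operand order follows _mm_blendv_epi8, where a set sign bit picks the second source):

    #include <cstdint>
    #include <cstdio>

    // Only the sign bit of the mask byte decides which source byte is taken.
    static uint8_t blendvLane(uint8_t Mask, uint8_t A, uint8_t B) {
      return (Mask & 0x80) ? B : A;
    }

    int main() {
      // 0x7F has every low bit set but the result is still A; 0x80 has only the
      // sign bit set and flips the result to B.
      printf("%u %u\n", blendvLane(0x7F, 1, 2), blendvLane(0x80, 1, 2)); // 1 2
      return 0;
    }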
46614
46615// Try to match:
46616// (or (and (M, (sub 0, X)), (pandn M, X)))
46617// which is a special case of:
46618// (select M, (sub 0, X), X)
46619// Per:
46620// http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
46621// We know that, if fNegate is 0 or 1:
46622// (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
46623//
46624// Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
46625// ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
46626// ( M ? -X : X) == ((X ^ M ) + (M & 1))
46627// This lets us transform our vselect to:
46628// (add (xor X, M), (and M, 1))
46629// And further to:
46630// (sub (xor X, M), M)
46631static SDValue combineLogicBlendIntoConditionalNegate(
46632 EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL,
46633 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
46634 EVT MaskVT = Mask.getValueType();
46635 assert(MaskVT.isInteger() &&
46636 DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() &&
46637 "Mask must be zero/all-bits");
46638
46639 if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT)
46640 return SDValue();
46641 if (!DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT))
46642 return SDValue();
46643
46644 auto IsNegV = [](SDNode *N, SDValue V) {
46645 return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
46646 ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
46647 };
46648
46649 SDValue V;
46650 if (IsNegV(Y.getNode(), X))
46651 V = X;
46652 else if (IsNegV(X.getNode(), Y))
46653 V = Y;
46654 else
46655 return SDValue();
46656
46657 SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
46658 SDValue SubOp2 = Mask;
46659
46660 // If the negate was on the false side of the select, then
46661 // the operands of the SUB need to be swapped. PR 27251.
46662 // This is because the pattern being matched above is
46663 // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)
46664 // but if the pattern matched was
46665 // (vselect M, X, (sub (0, X))), that is really negation of the pattern
46666 // above, -(vselect M, (sub 0, X), X), and therefore the replacement
46667 // pattern also needs to be a negation of the replacement pattern above.
46668 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
46669 // sub accomplishes the negation of the replacement pattern.
46670 if (V == Y)
46671 std::swap(SubOp1, SubOp2);
46672
46673 SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
46674 return DAG.getBitcast(VT, Res);
46675}
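The conditional-negate identity used above, ((X ^ M) - M) == (M ? -X : X) for M in {0, -1}, can be checked directly on scalars. A minimal sketch (condNegate is an illustrative name; INT_MIN is deliberately avoided since negating it would overflow):

    #include <cassert>
    #include <cstdint>

    // (M ? -X : X) == ((X ^ M) - M) when M is 0 or all-ones: with M == -1 this
    // is (~X + 1), i.e. two's-complement negation.
    static int32_t condNegate(int32_t X, int32_t M) {
      return (X ^ M) - M;
    }

    int main() {
      for (int32_t X : {0, 1, -7, 42}) {
        assert(condNegate(X, 0) == X);   // mask clear: unchanged
        assert(condNegate(X, -1) == -X); // mask all-ones: negated
      }
      return 0;
    }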
46676
46677/// Do target-specific dag combines on SELECT and VSELECT nodes.
46678static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
46679 TargetLowering::DAGCombinerInfo &DCI,
46680 const X86Subtarget &Subtarget) {
46681 SDLoc DL(N);
46682 SDValue Cond = N->getOperand(0);
46683 SDValue LHS = N->getOperand(1);
46684 SDValue RHS = N->getOperand(2);
46685
46686 // Try simplification again because we use this function to optimize
46687 // BLENDV nodes that are not handled by the generic combiner.
46688 if (SDValue V = DAG.simplifySelect(Cond, LHS, RHS))
46689 return V;
46690
46691 EVT VT = LHS.getValueType();
46692 EVT CondVT = Cond.getValueType();
46693 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46694 bool CondConstantVector = ISD::isBuildVectorOfConstantSDNodes(Cond.getNode());
46695
46696 // Attempt to combine (select M, (sub 0, X), X) -> (sub (xor X, M), M).
46697 // Limit this to cases of non-constant masks that createShuffleMaskFromVSELECT
46698 // can't catch, plus vXi8 cases where we'd likely end up with BLENDV.
46699 if (CondVT.isVector() && CondVT.isInteger() &&
46700 CondVT.getScalarSizeInBits() == VT.getScalarSizeInBits() &&
46701 (!CondConstantVector || CondVT.getScalarType() == MVT::i8) &&
46702 DAG.ComputeNumSignBits(Cond) == CondVT.getScalarSizeInBits())
46703 if (SDValue V = combineLogicBlendIntoConditionalNegate(VT, Cond, RHS, LHS,
46704 DL, DAG, Subtarget))
46705 return V;
46706
46707 // Convert vselects with constant condition into shuffles.
46708 if (CondConstantVector && DCI.isBeforeLegalizeOps() &&
46709 (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::BLENDV)) {
46710 SmallVector<int, 64> Mask;
46711 if (createShuffleMaskFromVSELECT(Mask, Cond,
46712 N->getOpcode() == X86ISD::BLENDV))
46713 return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask);
46714 }
46715
46716 // fold vselect(cond, pshufb(x), pshufb(y)) -> or (pshufb(x), pshufb(y))
46717 // by forcing the unselected elements to zero.
46718 // TODO: Can we handle more shuffles with this?
46719 if (N->getOpcode() == ISD::VSELECT && CondVT.isVector() &&
46720 LHS.getOpcode() == X86ISD::PSHUFB && RHS.getOpcode() == X86ISD::PSHUFB &&
46721 LHS.hasOneUse() && RHS.hasOneUse()) {
46722 MVT SimpleVT = VT.getSimpleVT();
46723 SmallVector<SDValue, 1> LHSOps, RHSOps;
46724 SmallVector<int, 64> LHSMask, RHSMask, CondMask;
46725 if (createShuffleMaskFromVSELECT(CondMask, Cond) &&
46726 getTargetShuffleMask(LHS.getNode(), SimpleVT, true, LHSOps, LHSMask) &&
46727 getTargetShuffleMask(RHS.getNode(), SimpleVT, true, RHSOps, RHSMask)) {
46728 int NumElts = VT.getVectorNumElements();
46729 for (int i = 0; i != NumElts; ++i) {
46730 // getConstVector sets negative shuffle mask values as undef, so ensure
46731 // we hardcode SM_SentinelZero values to zero (0x80).
46732 if (CondMask[i] < NumElts) {
46733 LHSMask[i] = isUndefOrZero(LHSMask[i]) ? 0x80 : LHSMask[i];
46734 RHSMask[i] = 0x80;
46735 } else {
46736 LHSMask[i] = 0x80;
46737 RHSMask[i] = isUndefOrZero(RHSMask[i]) ? 0x80 : RHSMask[i];
46738 }
46739 }
46740 LHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, LHS.getOperand(0),
46741 getConstVector(LHSMask, SimpleVT, DAG, DL, true));
46742 RHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, RHS.getOperand(0),
46743 getConstVector(RHSMask, SimpleVT, DAG, DL, true));
46744 return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
46745 }
46746 }
46747
46748 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
46749 // instructions match the semantics of the common C idiom x<y?x:y but not
46750 // x<=y?x:y, because of how they handle negative zero (which can be
46751 // ignored in unsafe-math mode).
46752 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
46753 if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
46754 VT != MVT::f80 && VT != MVT::f128 && !isSoftFP16(VT, Subtarget) &&
46755 (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
46756 (Subtarget.hasSSE2() ||
46757 (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
46758 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
46759
46760 unsigned Opcode = 0;
46761 // Check for x CC y ? x : y.
46762 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
46763 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
46764 switch (CC) {
46765 default: break;
46766 case ISD::SETULT:
46767 // Converting this to a min would handle NaNs incorrectly, and swapping
46768 // the operands would cause it to handle comparisons between positive
46769 // and negative zero incorrectly.
46770 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
46771 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
46772 !(DAG.isKnownNeverZeroFloat(LHS) ||
46773 DAG.isKnownNeverZeroFloat(RHS)))
46774 break;
46775 std::swap(LHS, RHS);
46776 }
46777 Opcode = X86ISD::FMIN;
46778 break;
46779 case ISD::SETOLE:
46780 // Converting this to a min would handle comparisons between positive
46781 // and negative zero incorrectly.
46782 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
46783 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
46784 break;
46785 Opcode = X86ISD::FMIN;
46786 break;
46787 case ISD::SETULE:
46788 // Converting this to a min would handle both negative zeros and NaNs
46789 // incorrectly, but we can swap the operands to fix both.
46790 std::swap(LHS, RHS);
46791 [[fallthrough]];
46792 case ISD::SETOLT:
46793 case ISD::SETLT:
46794 case ISD::SETLE:
46795 Opcode = X86ISD::FMIN;
46796 break;
46797
46798 case ISD::SETOGE:
46799 // Converting this to a max would handle comparisons between positive
46800 // and negative zero incorrectly.
46801 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
46802 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
46803 break;
46804 Opcode = X86ISD::FMAX;
46805 break;
46806 case ISD::SETUGT:
46807 // Converting this to a max would handle NaNs incorrectly, and swapping
46808 // the operands would cause it to handle comparisons between positive
46809 // and negative zero incorrectly.
46810 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
46811 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
46812 !(DAG.isKnownNeverZeroFloat(LHS) ||
46813 DAG.isKnownNeverZeroFloat(RHS)))
46814 break;
46815 std::swap(LHS, RHS);
46816 }
46817 Opcode = X86ISD::FMAX;
46818 break;
46819 case ISD::SETUGE:
46820 // Converting this to a max would handle both negative zeros and NaNs
46821 // incorrectly, but we can swap the operands to fix both.
46822 std::swap(LHS, RHS);
46823 [[fallthrough]];
46824 case ISD::SETOGT:
46825 case ISD::SETGT:
46826 case ISD::SETGE:
46827 Opcode = X86ISD::FMAX;
46828 break;
46829 }
46830 // Check for x CC y ? y : x -- a min/max with reversed arms.
46831 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
46832 DAG.isEqualTo(RHS, Cond.getOperand(0))) {
46833 switch (CC) {
46834 default: break;
46835 case ISD::SETOGE:
46836 // Converting this to a min would handle comparisons between positive
46837 // and negative zero incorrectly, and swapping the operands would
46838 // cause it to handle NaNs incorrectly.
46839 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
46840 !(DAG.isKnownNeverZeroFloat(LHS) ||
46841 DAG.isKnownNeverZeroFloat(RHS))) {
46842 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
46843 break;
46844 std::swap(LHS, RHS);
46845 }
46846 Opcode = X86ISD::FMIN;
46847 break;
46848 case ISD::SETUGT:
46849 // Converting this to a min would handle NaNs incorrectly.
46850 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
46851 break;
46852 Opcode = X86ISD::FMIN;
46853 break;
46854 case ISD::SETUGE:
46855 // Converting this to a min would handle both negative zeros and NaNs
46856 // incorrectly, but we can swap the operands to fix both.
46857 std::swap(LHS, RHS);
46858 [[fallthrough]];
46859 case ISD::SETOGT:
46860 case ISD::SETGT:
46861 case ISD::SETGE:
46862 Opcode = X86ISD::FMIN;
46863 break;
46864
46865 case ISD::SETULT:
46866 // Converting this to a max would handle NaNs incorrectly.
46867 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
46868 break;
46869 Opcode = X86ISD::FMAX;
46870 break;
46871 case ISD::SETOLE:
46872 // Converting this to a max would handle comparisons between positive
46873 // and negative zero incorrectly, and swapping the operands would
46874 // cause it to handle NaNs incorrectly.
46875 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
46876 !DAG.isKnownNeverZeroFloat(LHS) &&
46877 !DAG.isKnownNeverZeroFloat(RHS)) {
46878 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
46879 break;
46880 std::swap(LHS, RHS);
46881 }
46882 Opcode = X86ISD::FMAX;
46883 break;
46884 case ISD::SETULE:
46885 // Converting this to a max would handle both negative zeros and NaNs
46886 // incorrectly, but we can swap the operands to fix both.
46887 std::swap(LHS, RHS);
46888 [[fallthrough]];
46889 case ISD::SETOLT:
46890 case ISD::SETLT:
46891 case ISD::SETLE:
46892 Opcode = X86ISD::FMAX;
46893 break;
46894 }
46895 }
46896
46897 if (Opcode)
46898 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
46899 }
46900
46901 // Some mask scalar intrinsics rely on checking if only one bit is set
46902 // and implement it in C code like this:
46903 // A[0] = (U & 1) ? A[0] : W[0];
46904 // This creates some redundant instructions that break pattern matching.
46905 // fold (select (setcc (and (X, 1), 0, seteq), Y, Z)) -> select(and(X, 1),Z,Y)
46906 if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&
46907 Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) {
46908 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
46909 SDValue AndNode = Cond.getOperand(0);
46910 if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ &&
46911 isNullConstant(Cond.getOperand(1)) &&
46912 isOneConstant(AndNode.getOperand(1))) {
46913 // LHS and RHS swapped due to
46914 // setcc outputting 1 when AND resulted in 0 and vice versa.
46915 AndNode = DAG.getZExtOrTrunc(AndNode, DL, MVT::i8);
46916 return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS);
46917 }
46918 }
46919
46920 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
46921 // lowering on KNL. In this case we convert it to
46922 // v16i8 (select v16i8, v16i8, v16i8) and use an AVX instruction.
46923 // The same applies to all i8 and i16 vector types without BWI.
46924 // Make sure we extend these even before type legalization gets a chance to
46925 // split wide vectors.
46926 // Since SKX these selects have a proper lowering.
46927 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&
46928 CondVT.getVectorElementType() == MVT::i1 &&
46929 (VT.getVectorElementType() == MVT::i8 ||
46930 VT.getVectorElementType() == MVT::i16)) {
46931 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
46932 return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
46933 }
46934
46935 // AVX512 - Extend select with zero to merge with target shuffle.
46936 // select(mask, extract_subvector(shuffle(x)), zero) -->
46937 // extract_subvector(select(insert_subvector(mask), shuffle(x), zero))
46938 // TODO - support non target shuffles as well.
46939 if (Subtarget.hasAVX512() && CondVT.isVector() &&
46940 CondVT.getVectorElementType() == MVT::i1) {
46941 auto SelectableOp = [&TLI](SDValue Op) {
46942 return Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
46943 isTargetShuffle(Op.getOperand(0).getOpcode()) &&
46944 isNullConstant(Op.getOperand(1)) &&
46945 TLI.isTypeLegal(Op.getOperand(0).getValueType()) &&
46946 Op.hasOneUse() && Op.getOperand(0).hasOneUse();
46947 };
46948
46949 bool SelectableLHS = SelectableOp(LHS);
46950 bool SelectableRHS = SelectableOp(RHS);
46951 bool ZeroLHS = ISD::isBuildVectorAllZeros(LHS.getNode());
46952 bool ZeroRHS = ISD::isBuildVectorAllZeros(RHS.getNode());
46953
46954 if ((SelectableLHS && ZeroRHS) || (SelectableRHS && ZeroLHS)) {
46955 EVT SrcVT = SelectableLHS ? LHS.getOperand(0).getValueType()
46956 : RHS.getOperand(0).getValueType();
46957 EVT SrcCondVT = SrcVT.changeVectorElementType(MVT::i1);
46958 LHS = insertSubVector(DAG.getUNDEF(SrcVT), LHS, 0, DAG, DL,
46959 VT.getSizeInBits());
46960 RHS = insertSubVector(DAG.getUNDEF(SrcVT), RHS, 0, DAG, DL,
46961 VT.getSizeInBits());
46962 Cond = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SrcCondVT,
46963 DAG.getUNDEF(SrcCondVT), Cond,
46964 DAG.getIntPtrConstant(0, DL));
46965 SDValue Res = DAG.getSelect(DL, SrcVT, Cond, LHS, RHS);
46966 return extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
46967 }
46968 }
46969
46970 if (SDValue V = combineSelectOfTwoConstants(N, DAG))
46971 return V;
46972
46973 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
46974 Cond.hasOneUse()) {
46975 EVT CondVT = Cond.getValueType();
46976 SDValue Cond0 = Cond.getOperand(0);
46977 SDValue Cond1 = Cond.getOperand(1);
46978 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
46979
46980 // Canonicalize min/max:
46981 // (x > 0) ? x : 0 -> (x >= 0) ? x : 0
46982 // (x < -1) ? x : -1 -> (x <= -1) ? x : -1
46983 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
46984 // the need for an extra compare against zero. e.g.
46985 // (a - b) > 0 ? (a - b) : 0 -> (a - b) >= 0 ? (a - b) : 0
46986 // subl %esi, %edi
46987 // testl %edi, %edi
46988 // movl $0, %eax
46989 // cmovgl %edi, %eax
46990 // =>
46991 // xorl %eax, %eax
46992 // subl %esi, %edi
46993 // cmovsl %eax, %edi
46994 //
46995 // We can also canonicalize
46996 // (x s> 1) ? x : 1 -> (x s>= 1) ? x : 1 -> (x s> 0) ? x : 1
46997 // (x u> 1) ? x : 1 -> (x u>= 1) ? x : 1 -> (x != 0) ? x : 1
46998 // This allows the use of a test instruction for the compare.
46999 if (LHS == Cond0 && RHS == Cond1) {
47000 if ((CC == ISD::SETGT && (isNullConstant(RHS) || isOneConstant(RHS))) ||
47001 (CC == ISD::SETLT && isAllOnesConstant(RHS))) {
47002 ISD::CondCode NewCC = CC == ISD::SETGT ? ISD::SETGE : ISD::SETLE;
47003 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
47004 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
47005 }
47006 if (CC == ISD::SETUGT && isOneConstant(RHS)) {
47007 ISD::CondCode NewCC = ISD::SETUGE;
47008 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
47009 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
47010 }
47011 }
47012
47013 // Similar to DAGCombine's select(or(CC0,CC1),X,Y) fold but for legal types.
47014 // fold eq + gt/lt nested selects into ge/le selects
47015 // select (cmpeq Cond0, Cond1), LHS, (select (cmpugt Cond0, Cond1), LHS, Y)
47016 // --> (select (cmpuge Cond0, Cond1), LHS, Y)
47017 // select (cmpslt Cond0, Cond1), LHS, (select (cmpeq Cond0, Cond1), LHS, Y)
47018 // --> (select (cmpsle Cond0, Cond1), LHS, Y)
47019 // .. etc ..
47020 if (RHS.getOpcode() == ISD::SELECT && RHS.getOperand(1) == LHS &&
47021 RHS.getOperand(0).getOpcode() == ISD::SETCC) {
47022 SDValue InnerSetCC = RHS.getOperand(0);
47023 ISD::CondCode InnerCC =
47024 cast<CondCodeSDNode>(InnerSetCC.getOperand(2))->get();
47025 if ((CC == ISD::SETEQ || InnerCC == ISD::SETEQ) &&
47026 Cond0 == InnerSetCC.getOperand(0) &&
47027 Cond1 == InnerSetCC.getOperand(1)) {
47028 ISD::CondCode NewCC;
47029 switch (CC == ISD::SETEQ ? InnerCC : CC) {
47030 case ISD::SETGT: NewCC = ISD::SETGE; break;
47031 case ISD::SETLT: NewCC = ISD::SETLE; break;
47032 case ISD::SETUGT: NewCC = ISD::SETUGE; break;
47033 case ISD::SETULT: NewCC = ISD::SETULE; break;
47034 default: NewCC = ISD::SETCC_INVALID; break;
47035 }
47036 if (NewCC != ISD::SETCC_INVALID) {
47037 Cond = DAG.getSetCC(DL, CondVT, Cond0, Cond1, NewCC);
47038 return DAG.getSelect(DL, VT, Cond, LHS, RHS.getOperand(2));
47039 }
47040 }
47041 }
47042 }
47043
47044 // Check if the first operand is all zeros and Cond type is vXi1.
47045 // If this is an AVX512 target, we can improve the use of zero masking by
47046 // swapping the operands and inverting the condition.
47047 if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() &&
47048 Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 &&
47049 ISD::isBuildVectorAllZeros(LHS.getNode()) &&
47050 !ISD::isBuildVectorAllZeros(RHS.getNode())) {
47051 // Invert the cond to not(cond) : xor(op,allones)=not(op)
47052 SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
47053 // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
47054 return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
47055 }
47056
47057 // Attempt to convert a (vXi1 bitcast(iX Cond)) selection mask before it might
47058 // get split by legalization.
47059 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::BITCAST &&
47060 CondVT.getVectorElementType() == MVT::i1 &&
47061 TLI.isTypeLegal(VT.getScalarType())) {
47062 EVT ExtCondVT = VT.changeVectorElementTypeToInteger();
47063 if (SDValue ExtCond = combineToExtendBoolVectorInReg(
47064 ISD::SIGN_EXTEND, DL, ExtCondVT, Cond, DAG, DCI, Subtarget)) {
47065 ExtCond = DAG.getNode(ISD::TRUNCATE, DL, CondVT, ExtCond);
47066 return DAG.getSelect(DL, VT, ExtCond, LHS, RHS);
47067 }
47068 }
47069
47070 // Early exit check
47071 if (!TLI.isTypeLegal(VT) || isSoftFP16(VT, Subtarget))
47072 return SDValue();
47073
47074 if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
47075 return V;
47076
47077 if (SDValue V = combineVSelectToBLENDV(N, DAG, DCI, Subtarget))
47078 return V;
47079
47080 if (SDValue V = narrowVectorSelect(N, DAG, Subtarget))
47081 return V;
47082
47083 // select(~Cond, X, Y) -> select(Cond, Y, X)
47084 if (CondVT.getScalarType() != MVT::i1) {
47085 if (SDValue CondNot = IsNOT(Cond, DAG))
47086 return DAG.getNode(N->getOpcode(), DL, VT,
47087 DAG.getBitcast(CondVT, CondNot), RHS, LHS);
47088
47089 // pcmpgt(X, -1) -> pcmpgt(0, X) to help select/blendv just use the
47090 // signbit.
47091 if (Cond.getOpcode() == X86ISD::PCMPGT &&
47092 ISD::isBuildVectorAllOnes(Cond.getOperand(1).getNode()) &&
47093 Cond.hasOneUse()) {
47094 Cond = DAG.getNode(X86ISD::PCMPGT, DL, CondVT,
47095 DAG.getConstant(0, DL, CondVT), Cond.getOperand(0));
47096 return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
47097 }
47098 }
47099
47100 // Try to optimize vXi1 selects if both operands are either all constants or
47101 // bitcasts from scalar integer type. In that case we can convert the operands
47102 // to integer and use an integer select which will be converted to a CMOV.
47103 // We need to take a little bit of care to avoid creating an i64 type after
47104 // type legalization.
47105 if (N->getOpcode() == ISD::SELECT && VT.isVector() &&
47106 VT.getVectorElementType() == MVT::i1 &&
47107 (DCI.isBeforeLegalize() || (VT != MVT::v64i1 || Subtarget.is64Bit()))) {
47108 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
47109 if (DCI.isBeforeLegalize() || TLI.isTypeLegal(IntVT)) {
47110 bool LHSIsConst = ISD::isBuildVectorOfConstantSDNodes(LHS.getNode());
47111 bool RHSIsConst = ISD::isBuildVectorOfConstantSDNodes(RHS.getNode());
47112
47113 if ((LHSIsConst || (LHS.getOpcode() == ISD::BITCAST &&
47114 LHS.getOperand(0).getValueType() == IntVT)) &&
47115 (RHSIsConst || (RHS.getOpcode() == ISD::BITCAST &&
47116 RHS.getOperand(0).getValueType() == IntVT))) {
47117 if (LHSIsConst)
47118 LHS = combinevXi1ConstantToInteger(LHS, DAG);
47119 else
47120 LHS = LHS.getOperand(0);
47121
47122 if (RHSIsConst)
47123 RHS = combinevXi1ConstantToInteger(RHS, DAG);
47124 else
47125 RHS = RHS.getOperand(0);
47126
47127 SDValue Select = DAG.getSelect(DL, IntVT, Cond, LHS, RHS);
47128 return DAG.getBitcast(VT, Select);
47129 }
47130 }
47131 }
47132
47133 // If this is "((X & C) == 0) ? Y : Z" and C is a constant mask vector of
47134 // single bits, then invert the predicate and swap the select operands.
47135 // This can lower using a vector shift bit-hack rather than mask and compare.
47136 if (DCI.isBeforeLegalize() && !Subtarget.hasAVX512() &&
47137 N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
47138 Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1 &&
47139 Cond.getOperand(0).getOpcode() == ISD::AND &&
47140 isNullOrNullSplat(Cond.getOperand(1)) &&
47141 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
47142 Cond.getOperand(0).getValueType() == VT) {
47143 // The 'and' mask must be composed of power-of-2 constants.
47144 SDValue And = Cond.getOperand(0);
47145 auto *C = isConstOrConstSplat(And.getOperand(1));
47146 if (C && C->getAPIntValue().isPowerOf2()) {
47147 // vselect (X & C == 0), LHS, RHS --> vselect (X & C != 0), RHS, LHS
47148 SDValue NotCond =
47149 DAG.getSetCC(DL, CondVT, And, Cond.getOperand(1), ISD::SETNE);
47150 return DAG.getSelect(DL, VT, NotCond, RHS, LHS);
47151 }
47152
47153 // If we have a non-splat but still powers-of-2 mask, AVX1 can use pmulld
47154 // and AVX2 can use vpsllv{dq}. 8-bit lacks a proper shift or multiply.
47155 // 16-bit lacks a proper blendv.
47156 unsigned EltBitWidth = VT.getScalarSizeInBits();
47157 bool CanShiftBlend =
47158 TLI.isTypeLegal(VT) && ((Subtarget.hasAVX() && EltBitWidth == 32) ||
47159 (Subtarget.hasAVX2() && EltBitWidth == 64) ||
47160 (Subtarget.hasXOP()));
47161 if (CanShiftBlend &&
47162 ISD::matchUnaryPredicate(And.getOperand(1), [](ConstantSDNode *C) {
47163 return C->getAPIntValue().isPowerOf2();
47164 })) {
47165 // Create a left-shift constant to get the mask bits over to the sign-bit.
47166 SDValue Mask = And.getOperand(1);
47167 SmallVector<int, 32> ShlVals;
47168 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
47169 auto *MaskVal = cast<ConstantSDNode>(Mask.getOperand(i));
47170 ShlVals.push_back(EltBitWidth - 1 -
47171 MaskVal->getAPIntValue().exactLogBase2());
47172 }
47173 // vsel ((X & C) == 0), LHS, RHS --> vsel ((shl X, C') < 0), RHS, LHS
47174 SDValue ShlAmt = getConstVector(ShlVals, VT.getSimpleVT(), DAG, DL);
47175 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And.getOperand(0), ShlAmt);
47176 SDValue NewCond =
47177 DAG.getSetCC(DL, CondVT, Shl, Cond.getOperand(1), ISD::SETLT);
47178 return DAG.getSelect(DL, VT, NewCond, RHS, LHS);
47179 }
47180 }
47181
47182 return SDValue();
47183}
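The final combine in combineSelect replaces a single-bit mask test with a shift that moves the tested bit into the sign position, inverting the predicate and swapping the select operands. A per-lane scalar model of that equivalence (the helper names are illustrative):

    #include <cassert>
    #include <cstdint>

    // Original form: vselect ((X & C) == 0), L, R.
    static int32_t selectByMaskBit(int32_t X, uint32_t C, int32_t L, int32_t R) {
      return ((uint32_t)X & C) == 0 ? L : R;
    }

    // Rewritten form: vselect ((shl X, C') < 0), R, L, where C' puts the tested
    // bit into the sign position.
    static int32_t selectByShiftedSign(int32_t X, unsigned ShlAmt, int32_t L, int32_t R) {
      int32_t Shifted = (int32_t)((uint32_t)X << ShlAmt);
      return Shifted < 0 ? R : L;
    }

    int main() {
      const uint32_t C = 1u << 5;     // power-of-2 mask bit
      const unsigned ShlAmt = 31 - 5; // EltBitWidth - 1 - log2(C)
      for (int32_t X : {0, 32, 33, -1})
        assert(selectByMaskBit(X, C, 10, 20) == selectByShiftedSign(X, ShlAmt, 10, 20));
      return 0;
    }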
47184
47185/// Combine:
47186/// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
47187/// to:
47188/// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
47189/// i.e., reusing the EFLAGS produced by the LOCKed instruction.
47190/// Note that this is only legal for some op/cc combinations.
47191static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
47192 SelectionDAG &DAG,
47193 const X86Subtarget &Subtarget) {
47194 // This combine only operates on CMP-like nodes.
47195 if (!(Cmp.getOpcode() == X86ISD::CMP ||
47196 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
47197 return SDValue();
47198
47199 // Can't replace the cmp if it has more uses than the one we're looking at.
47200 // FIXME: We would like to be able to handle this, but would need to make sure
47201 // all uses were updated.
47202 if (!Cmp.hasOneUse())
47203 return SDValue();
47204
47205 // This only applies to variations of the common case:
47206 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
47207 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
47208 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
47209 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
47210 // Using the proper condcodes (see below), overflow is checked for.
47211
47212 // FIXME: We can generalize both constraints:
47213 // - XOR/OR/AND (if they were made to survive AtomicExpand)
47214 // - LHS != 1
47215 // if the result is compared.
47216
47217 SDValue CmpLHS = Cmp.getOperand(0);
47218 SDValue CmpRHS = Cmp.getOperand(1);
47219 EVT CmpVT = CmpLHS.getValueType();
47220
47221 if (!CmpLHS.hasOneUse())
47222 return SDValue();
47223
47224 unsigned Opc = CmpLHS.getOpcode();
47225 if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
47226 return SDValue();
47227
47228 SDValue OpRHS = CmpLHS.getOperand(2);
47229 auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
47230 if (!OpRHSC)
47231 return SDValue();
47232
47233 APInt Addend = OpRHSC->getAPIntValue();
47234 if (Opc == ISD::ATOMIC_LOAD_SUB)
47235 Addend = -Addend;
47236
47237 auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
47238 if (!CmpRHSC)
47239 return SDValue();
47240
47241 APInt Comparison = CmpRHSC->getAPIntValue();
47242 APInt NegAddend = -Addend;
47243
47244 // See if we can adjust the CC to make the comparison match the negated
47245 // addend.
47246 if (Comparison != NegAddend) {
47247 APInt IncComparison = Comparison + 1;
47248 if (IncComparison == NegAddend) {
47249 if (CC == X86::COND_A && !Comparison.isMaxValue()) {
47250 Comparison = IncComparison;
47251 CC = X86::COND_AE;
47252 } else if (CC == X86::COND_LE && !Comparison.isMaxSignedValue()) {
47253 Comparison = IncComparison;
47254 CC = X86::COND_L;
47255 }
47256 }
47257 APInt DecComparison = Comparison - 1;
47258 if (DecComparison == NegAddend) {
47259 if (CC == X86::COND_AE && !Comparison.isMinValue()) {
47260 Comparison = DecComparison;
47261 CC = X86::COND_A;
47262 } else if (CC == X86::COND_L && !Comparison.isMinSignedValue()) {
47263 Comparison = DecComparison;
47264 CC = X86::COND_LE;
47265 }
47266 }
47267 }
47268
47269 // If the addend is the negation of the comparison value, then we can do
47270 // a full comparison by emitting the atomic arithmetic as a locked sub.
47271 if (Comparison == NegAddend) {
47272 // The CC is fine, but we need to rewrite the LHS of the comparison as an
47273 // atomic sub.
47274 auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
47275 auto AtomicSub = DAG.getAtomic(
47276 ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpVT,
47277 /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
47278 /*RHS*/ DAG.getConstant(NegAddend, SDLoc(CmpRHS), CmpVT),
47279 AN->getMemOperand());
47280 auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget);
47281 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
47282 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
47283 return LockOp;
47284 }
47285
47286 // We can handle comparisons with zero in a number of cases by manipulating
47287 // the CC used.
47288 if (!Comparison.isZero())
47289 return SDValue();
47290
47291 if (CC == X86::COND_S && Addend == 1)
47292 CC = X86::COND_LE;
47293 else if (CC == X86::COND_NS && Addend == 1)
47294 CC = X86::COND_G;
47295 else if (CC == X86::COND_G && Addend == -1)
47296 CC = X86::COND_GE;
47297 else if (CC == X86::COND_LE && Addend == -1)
47298 CC = X86::COND_L;
47299 else
47300 return SDValue();
47301
47302 SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
47303 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
47304 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
47305 return LockOp;
47306}
47307
47308// Check whether a boolean test is testing a boolean value generated by
47309// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
47310// code.
47311//
47312// Simplify the following patterns:
47313// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
47314// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
47315// to (Op EFLAGS Cond)
47316//
47317// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
47318// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
47319// to (Op EFLAGS !Cond)
47320//
47321// where Op could be BRCOND or CMOV.
47322//
47323static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
47324 // This combine only operates on CMP-like nodes.
47325 if (!(Cmp.getOpcode() == X86ISD::CMP ||
47326 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
47327 return SDValue();
47328
47329 // Quit if not used as a boolean value.
47330 if (CC != X86::COND_E && CC != X86::COND_NE)
47331 return SDValue();
47332
47333 // Check CMP operands. One of them should be 0 or 1 and the other should be
47334 // a SetCC or extended from it.
47335 SDValue Op1 = Cmp.getOperand(0);
47336 SDValue Op2 = Cmp.getOperand(1);
47337
47338 SDValue SetCC;
47339 const ConstantSDNode* C = nullptr;
47340 bool needOppositeCond = (CC == X86::COND_E);
47341 bool checkAgainstTrue = false; // Is it a comparison against 1?
47342
47343 if ((C = dyn_cast<ConstantSDNode>(Op1)))
47344 SetCC = Op2;
47345 else if ((C = dyn_cast<ConstantSDNode>(Op2)))
47346 SetCC = Op1;
47347 else // Quit if neither operand is a constant.
47348 return SDValue();
47349
47350 if (C->getZExtValue() == 1) {
47351 needOppositeCond = !needOppositeCond;
47352 checkAgainstTrue = true;
47353 } else if (C->getZExtValue() != 0)
47354 // Quit if the constant is neither 0 nor 1.
47355 return SDValue();
47356
47357 bool truncatedToBoolWithAnd = false;
47358 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
47359 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
47360 SetCC.getOpcode() == ISD::TRUNCATE ||
47361 SetCC.getOpcode() == ISD::AND) {
47362 if (SetCC.getOpcode() == ISD::AND) {
47363 int OpIdx = -1;
47364 if (isOneConstant(SetCC.getOperand(0)))
47365 OpIdx = 1;
47366 if (isOneConstant(SetCC.getOperand(1)))
47367 OpIdx = 0;
47368 if (OpIdx < 0)
47369 break;
47370 SetCC = SetCC.getOperand(OpIdx);
47371 truncatedToBoolWithAnd = true;
47372 } else
47373 SetCC = SetCC.getOperand(0);
47374 }
47375
47376 switch (SetCC.getOpcode()) {
47377 case X86ISD::SETCC_CARRY:
47378 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
47379 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
47380 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
47381 // truncated to i1 using 'and'.
47382 if (checkAgainstTrue && !truncatedToBoolWithAnd)
47383 break;
47384 assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
47385        "Invalid use of SETCC_CARRY!");
47386 [[fallthrough]];
47387 case X86ISD::SETCC:
47388 // Set the condition code or opposite one if necessary.
47389 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
47390 if (needOppositeCond)
47391 CC = X86::GetOppositeBranchCondition(CC);
47392 return SetCC.getOperand(1);
47393 case X86ISD::CMOV: {
47394 // Check whether false/true value has canonical one, i.e. 0 or 1.
47395 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
47396 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
47397 // Quit if true value is not a constant.
47398 if (!TVal)
47399 return SDValue();
47400 // Quit if false value is not a constant.
47401 if (!FVal) {
47402 SDValue Op = SetCC.getOperand(0);
47403 // Skip 'zext' or 'trunc' node.
47404 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
47405 Op.getOpcode() == ISD::TRUNCATE)
47406 Op = Op.getOperand(0);
47407 // A special case for rdrand/rdseed, where 0 is set if the false condition
47408 // is found.
47409 if ((Op.getOpcode() != X86ISD::RDRAND &&
47410 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
47411 return SDValue();
47412 }
47413 // Quit if false value is not the constant 0 or 1.
47414 bool FValIsFalse = true;
47415 if (FVal && FVal->getZExtValue() != 0) {
47416 if (FVal->getZExtValue() != 1)
47417 return SDValue();
47418 // If FVal is 1, opposite cond is needed.
47419 needOppositeCond = !needOppositeCond;
47420 FValIsFalse = false;
47421 }
47422 // Quit if TVal is not the constant opposite of FVal.
47423 if (FValIsFalse && TVal->getZExtValue() != 1)
47424 return SDValue();
47425 if (!FValIsFalse && TVal->getZExtValue() != 0)
47426 return SDValue();
47427 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
47428 if (needOppositeCond)
47429 CC = X86::GetOppositeBranchCondition(CC);
47430 return SetCC.getOperand(3);
47431 }
47432 }
47433
47434 return SDValue();
47435}
47436
47437/// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
47438/// Match:
47439/// (X86or (X86setcc) (X86setcc))
47440/// (X86cmp (and (X86setcc) (X86setcc)), 0)
47441static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
47442 X86::CondCode &CC1, SDValue &Flags,
47443 bool &isAnd) {
47444 if (Cond->getOpcode() == X86ISD::CMP) {
47445 if (!isNullConstant(Cond->getOperand(1)))
47446 return false;
47447
47448 Cond = Cond->getOperand(0);
47449 }
47450
47451 isAnd = false;
47452
47453 SDValue SetCC0, SetCC1;
47454 switch (Cond->getOpcode()) {
47455 default: return false;
47456 case ISD::AND:
47457 case X86ISD::AND:
47458 isAnd = true;
47459 [[fallthrough]];
47460 case ISD::OR:
47461 case X86ISD::OR:
47462 SetCC0 = Cond->getOperand(0);
47463 SetCC1 = Cond->getOperand(1);
47464 break;
47465 };
47466
47467 // Make sure we have SETCC nodes, using the same flags value.
47468 if (SetCC0.getOpcode() != X86ISD::SETCC ||
47469 SetCC1.getOpcode() != X86ISD::SETCC ||
47470 SetCC0->getOperand(1) != SetCC1->getOperand(1))
47471 return false;
47472
47473 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
47474 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
47475 Flags = SetCC0->getOperand(1);
47476 return true;
47477}
47478
47479 // When legalizing carry, we create carries via add X, -1.
47480 // If that comes from an actual carry, via setcc, we use the
47481 // carry directly.
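// For example (illustrative): the carry-out of
//   (X86ISD::ADD (zext (setcc COND_B, Flags)), -1)
// is set exactly when the setcc produced 1, i.e. when CF was set in Flags,
// so a consumer of that carry can read Flags directly instead of
// re-materializing the bit and adding -1.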
47482static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) {
47483 if (EFLAGS.getOpcode() == X86ISD::ADD) {
47484 if (isAllOnesConstant(EFLAGS.getOperand(1))) {
47485 bool FoundAndLSB = false;
47486 SDValue Carry = EFLAGS.getOperand(0);
47487 while (Carry.getOpcode() == ISD::TRUNCATE ||
47488 Carry.getOpcode() == ISD::ZERO_EXTEND ||
47489 (Carry.getOpcode() == ISD::AND &&
47490 isOneConstant(Carry.getOperand(1)))) {
47491 FoundAndLSB |= Carry.getOpcode() == ISD::AND;
47492 Carry = Carry.getOperand(0);
47493 }
47494 if (Carry.getOpcode() == X86ISD::SETCC ||
47495 Carry.getOpcode() == X86ISD::SETCC_CARRY) {
47496 // TODO: Merge this code with equivalent in combineAddOrSubToADCOrSBB?
47497 uint64_t CarryCC = Carry.getConstantOperandVal(0);
47498 SDValue CarryOp1 = Carry.getOperand(1);
47499 if (CarryCC == X86::COND_B)
47500 return CarryOp1;
47501 if (CarryCC == X86::COND_A) {
47502 // Try to convert COND_A into COND_B in an attempt to facilitate
47503 // materializing "setb reg".
47504 //
47505 // Do not flip "e > c", where "c" is a constant, because Cmp
47506 // instruction cannot take an immediate as its first operand.
47507 //
47508 if (CarryOp1.getOpcode() == X86ISD::SUB &&
47509 CarryOp1.getNode()->hasOneUse() &&
47510 CarryOp1.getValueType().isInteger() &&
47511 !isa<ConstantSDNode>(CarryOp1.getOperand(1))) {
47512 SDValue SubCommute =
47513 DAG.getNode(X86ISD::SUB, SDLoc(CarryOp1), CarryOp1->getVTList(),
47514 CarryOp1.getOperand(1), CarryOp1.getOperand(0));
47515 return SDValue(SubCommute.getNode(), CarryOp1.getResNo());
47516 }
47517 }
47518 // If this is a check of the z flag of an add with 1, switch to the
47519 // C flag.
47520 if (CarryCC == X86::COND_E &&
47521 CarryOp1.getOpcode() == X86ISD::ADD &&
47522 isOneConstant(CarryOp1.getOperand(1)))
47523 return CarryOp1;
47524 } else if (FoundAndLSB) {
47525 SDLoc DL(Carry);
47526 SDValue BitNo = DAG.getConstant(0, DL, Carry.getValueType());
47527 if (Carry.getOpcode() == ISD::SRL) {
47528 BitNo = Carry.getOperand(1);
47529 Carry = Carry.getOperand(0);
47530 }
47531 return getBT(Carry, BitNo, DL, DAG);
47532 }
47533 }
47534 }
47535
47536 return SDValue();
47537}
47538
47539 /// If we are inverting a PTEST/TESTP operand, attempt to adjust the CC
47540/// to avoid the inversion.
47541static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,
47542 SelectionDAG &DAG,
47543 const X86Subtarget &Subtarget) {
47544 // TODO: Handle X86ISD::KTEST/X86ISD::KORTEST.
47545 if (EFLAGS.getOpcode() != X86ISD::PTEST &&
47546 EFLAGS.getOpcode() != X86ISD::TESTP)
47547 return SDValue();
47548
47549 // PTEST/TESTP sets EFLAGS as:
47550 // TESTZ: ZF = (Op0 & Op1) == 0
47551 // TESTC: CF = (~Op0 & Op1) == 0
47552 // TESTNZC: ZF == 0 && CF == 0
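 // For example (illustrative): if Op0 is known to be ~X, then
 //   TESTC(~X, Y): CF = (~~X & Y) == 0 = (X & Y) == 0,
 // which is exactly the ZF that TESTZ(X, Y) produces, so a COND_B (CF) test
 // of the original node becomes a COND_E (ZF) test of the un-inverted one.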
47553 MVT VT = EFLAGS.getSimpleValueType();
47554 SDValue Op0 = EFLAGS.getOperand(0);
47555 SDValue Op1 = EFLAGS.getOperand(1);
47556 MVT OpVT = Op0.getSimpleValueType();
47557
47558 // TEST*(~X,Y) == TEST*(X,Y)
47559 if (SDValue NotOp0 = IsNOT(Op0, DAG)) {
47560 X86::CondCode InvCC;
47561 switch (CC) {
47562 case X86::COND_B:
47563 // testc -> testz.
47564 InvCC = X86::COND_E;
47565 break;
47566 case X86::COND_AE:
47567 // !testc -> !testz.
47568 InvCC = X86::COND_NE;
47569 break;
47570 case X86::COND_E:
47571 // testz -> testc.
47572 InvCC = X86::COND_B;
47573 break;
47574 case X86::COND_NE:
47575 // !testz -> !testc.
47576 InvCC = X86::COND_AE;
47577 break;
47578 case X86::COND_A:
47579 case X86::COND_BE:
47580 // testnzc -> testnzc (no change).
47581 InvCC = CC;
47582 break;
47583 default:
47584 InvCC = X86::COND_INVALID;
47585 break;
47586 }
47587
47588 if (InvCC != X86::COND_INVALID) {
47589 CC = InvCC;
47590 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
47591 DAG.getBitcast(OpVT, NotOp0), Op1);
47592 }
47593 }
47594
47595 if (CC == X86::COND_B || CC == X86::COND_AE) {
47596 // TESTC(X,~X) == TESTC(X,-1)
47597 if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
47598 if (peekThroughBitcasts(NotOp1) == peekThroughBitcasts(Op0)) {
47599 SDLoc DL(EFLAGS);
47600 return DAG.getNode(
47601 EFLAGS.getOpcode(), DL, VT, DAG.getBitcast(OpVT, NotOp1),
47602 DAG.getBitcast(OpVT,
47603 DAG.getAllOnesConstant(DL, NotOp1.getValueType())));
47604 }
47605 }
47606 }
47607
47608 if (CC == X86::COND_E || CC == X86::COND_NE) {
47609 // TESTZ(X,~Y) == TESTC(Y,X)
47610 if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
47611 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
47612 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
47613 DAG.getBitcast(OpVT, NotOp1), Op0);
47614 }
47615
47616 if (Op0 == Op1) {
47617 SDValue BC = peekThroughBitcasts(Op0);
47618 EVT BCVT = BC.getValueType();
47619
47620 // TESTZ(AND(X,Y),AND(X,Y)) == TESTZ(X,Y)
47621 if (BC.getOpcode() == ISD::AND || BC.getOpcode() == X86ISD::FAND) {
47622 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
47623 DAG.getBitcast(OpVT, BC.getOperand(0)),
47624 DAG.getBitcast(OpVT, BC.getOperand(1)));
47625 }
47626
47627 // TESTZ(AND(~X,Y),AND(~X,Y)) == TESTC(X,Y)
47628 if (BC.getOpcode() == X86ISD::ANDNP || BC.getOpcode() == X86ISD::FANDN) {
47629 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
47630 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
47631 DAG.getBitcast(OpVT, BC.getOperand(0)),
47632 DAG.getBitcast(OpVT, BC.getOperand(1)));
47633 }
47634
47635 // If every element is an all-sign value, see if we can use TESTP/MOVMSK
47636 // to more efficiently extract the sign bits and compare that.
47637 // TODO: Handle TESTC with comparison inversion.
47638 // TODO: Can we remove SimplifyMultipleUseDemandedBits and rely on
47639 // TESTP/MOVMSK combines to make sure its never worse than PTEST?
47640 if (BCVT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(BCVT)) {
47641 unsigned EltBits = BCVT.getScalarSizeInBits();
47642 if (DAG.ComputeNumSignBits(BC) == EltBits) {
47643 assert(VT == MVT::i32 && "Expected i32 EFLAGS comparison result");
47644 APInt SignMask = APInt::getSignMask(EltBits);
47645 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47646 if (SDValue Res =
47647 TLI.SimplifyMultipleUseDemandedBits(BC, SignMask, DAG)) {
47648 // For vXi16 cases we need to use pmovmskb and extract every other
47649 // sign bit.
47650 SDLoc DL(EFLAGS);
47651 if ((EltBits == 32 || EltBits == 64) && Subtarget.hasAVX()) {
47652 MVT FloatSVT = MVT::getFloatingPointVT(EltBits);
47653 MVT FloatVT =
47654 MVT::getVectorVT(FloatSVT, OpVT.getSizeInBits() / EltBits);
47655 Res = DAG.getBitcast(FloatVT, Res);
47656 return DAG.getNode(X86ISD::TESTP, SDLoc(EFLAGS), VT, Res, Res);
47657 } else if (EltBits == 16) {
47658 MVT MovmskVT = BCVT.is128BitVector() ? MVT::v16i8 : MVT::v32i8;
47659 Res = DAG.getBitcast(MovmskVT, Res);
47660 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
47661 Res = DAG.getNode(ISD::AND, DL, MVT::i32, Res,
47662 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
47663 } else {
47664 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
47665 }
47666 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Res,
47667 DAG.getConstant(0, DL, MVT::i32));
47668 }
47669 }
47670 }
47671 }
47672
47673 // TESTZ(-1,X) == TESTZ(X,X)
47674 if (ISD::isBuildVectorAllOnes(Op0.getNode()))
47675 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op1, Op1);
47676
47677 // TESTZ(X,-1) == TESTZ(X,X)
47678 if (ISD::isBuildVectorAllOnes(Op1.getNode()))
47679 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0);
47680
47681 // TESTZ(OR(LO(X),HI(X)),OR(LO(Y),HI(Y))) -> TESTZ(X,Y)
47682 // TODO: Add COND_NE handling?
47683 if (CC == X86::COND_E && OpVT.is128BitVector() && Subtarget.hasAVX()) {
47684 SDValue Src0 = peekThroughBitcasts(Op0);
47685 SDValue Src1 = peekThroughBitcasts(Op1);
47686 if (Src0.getOpcode() == ISD::OR && Src1.getOpcode() == ISD::OR) {
47687 Src0 = getSplitVectorSrc(peekThroughBitcasts(Src0.getOperand(0)),
47688 peekThroughBitcasts(Src0.getOperand(1)), true);
47689 Src1 = getSplitVectorSrc(peekThroughBitcasts(Src1.getOperand(0)),
47690 peekThroughBitcasts(Src1.getOperand(1)), true);
47691 if (Src0 && Src1) {
47692 MVT OpVT2 = OpVT.getDoubleNumVectorElementsVT();
47693 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
47694 DAG.getBitcast(OpVT2, Src0),
47695 DAG.getBitcast(OpVT2, Src1));
47696 }
47697 }
47698 }
47699 }
47700
47701 return SDValue();
47702}
47703
47704// Attempt to simplify the MOVMSK input based on the comparison type.
47705static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
47706 SelectionDAG &DAG,
47707 const X86Subtarget &Subtarget) {
47708 // Handle eq/ne against zero (any_of).
47709 // Handle eq/ne against -1 (all_of).
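 // For example (illustrative): MOVMSK packs one sign bit per element, so
 // "MOVMSK(V) == 0" means no element has its sign bit set (an any_of-style
 // test when paired with COND_NE), while "MOVMSK(V) == (1 << NumElts) - 1"
 // means every element does (an all_of-style test).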
47710 if (!(CC == X86::COND_E || CC == X86::COND_NE))
47711 return SDValue();
47712 if (EFLAGS.getValueType() != MVT::i32)
47713 return SDValue();
47714 unsigned CmpOpcode = EFLAGS.getOpcode();
47715 if (CmpOpcode != X86ISD::CMP && CmpOpcode != X86ISD::SUB)
47716 return SDValue();
47717 auto *CmpConstant = dyn_cast<ConstantSDNode>(EFLAGS.getOperand(1));
47718 if (!CmpConstant)
47719 return SDValue();
47720 const APInt &CmpVal = CmpConstant->getAPIntValue();
47721
47722 SDValue CmpOp = EFLAGS.getOperand(0);
47723 unsigned CmpBits = CmpOp.getValueSizeInBits();
47724 assert(CmpBits == CmpVal.getBitWidth() && "Value size mismatch");
47725
47726 // Peek through any truncate.
47727 if (CmpOp.getOpcode() == ISD::TRUNCATE)
47728 CmpOp = CmpOp.getOperand(0);
47729
47730 // Bail if we don't find a MOVMSK.
47731 if (CmpOp.getOpcode() != X86ISD::MOVMSK)
47732 return SDValue();
47733
47734 SDValue Vec = CmpOp.getOperand(0);
47735 MVT VecVT = Vec.getSimpleValueType();
47736 assert((VecVT.is128BitVector() || VecVT.is256BitVector()) &&
47737        "Unexpected MOVMSK operand");
47738 unsigned NumElts = VecVT.getVectorNumElements();
47739 unsigned NumEltBits = VecVT.getScalarSizeInBits();
47740
47741 bool IsAnyOf = CmpOpcode == X86ISD::CMP && CmpVal.isZero();
47742 bool IsAllOf = (CmpOpcode == X86ISD::SUB || CmpOpcode == X86ISD::CMP) &&
47743 NumElts <= CmpBits && CmpVal.isMask(NumElts);
47744 if (!IsAnyOf && !IsAllOf)
47745 return SDValue();
47746
47747 // TODO: Check more combining cases.
47748 // Here we check the number of uses of the cmp to decide whether to combine.
47749 // Currently only the "MOVMSK(CONCAT(..))" and "MOVMSK(PCMPEQ(..))" combines
47750 // are limited by this one-use constraint.
47751 bool IsOneUse = CmpOp.getNode()->hasOneUse();
47752
47753 // See if we can peek through to a vector with a wider element type, if the
47754 // signbits extend down to all the sub-elements as well.
47755 // Calling MOVMSK with the wider type, avoiding the bitcast, helps expose
47756 // potential SimplifyDemandedBits/Elts cases.
47757 // If we looked through a truncate that discarded bits, we can't do this
47758 // transform.
47759 // FIXME: We could do this transform for truncates that discarded bits by
47760 // inserting an AND mask between the new MOVMSK and the CMP.
47761 if (Vec.getOpcode() == ISD::BITCAST && NumElts <= CmpBits) {
47762 SDValue BC = peekThroughBitcasts(Vec);
47763 MVT BCVT = BC.getSimpleValueType();
47764 unsigned BCNumElts = BCVT.getVectorNumElements();
47765 unsigned BCNumEltBits = BCVT.getScalarSizeInBits();
47766 if ((BCNumEltBits == 32 || BCNumEltBits == 64) &&
47767 BCNumEltBits > NumEltBits &&
47768 DAG.ComputeNumSignBits(BC) > (BCNumEltBits - NumEltBits)) {
47769 SDLoc DL(EFLAGS);
47770 APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : BCNumElts);
47771 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
47772 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, BC),
47773 DAG.getConstant(CmpMask, DL, MVT::i32));
47774 }
47775 }
47776
47777 // MOVMSK(CONCAT(X,Y)) == 0 -> MOVMSK(OR(X,Y)).
47778 // MOVMSK(CONCAT(X,Y)) != 0 -> MOVMSK(OR(X,Y)).
47779 // MOVMSK(CONCAT(X,Y)) == -1 -> MOVMSK(AND(X,Y)).
47780 // MOVMSK(CONCAT(X,Y)) != -1 -> MOVMSK(AND(X,Y)).
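 // (Illustrative reasoning: the sign bit of OR(X,Y) is the OR of the two
 // sign bits and the sign bit of AND(X,Y) is their AND, so a zero/all-ones
 // test of the concatenated mask is equivalent to a single half-width MOVMSK
 // of the OR/AND of the two halves.)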
47781 if (VecVT.is256BitVector() && NumElts <= CmpBits && IsOneUse) {
47782 SmallVector<SDValue> Ops;
47783 if (collectConcatOps(peekThroughBitcasts(Vec).getNode(), Ops, DAG) &&
47784 Ops.size() == 2) {
47785 SDLoc DL(EFLAGS);
47786 EVT SubVT = Ops[0].getValueType().changeTypeToInteger();
47787 APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : NumElts / 2);
47788 SDValue V = DAG.getNode(IsAnyOf ? ISD::OR : ISD::AND, DL, SubVT,
47789 DAG.getBitcast(SubVT, Ops[0]),
47790 DAG.getBitcast(SubVT, Ops[1]));
47791 V = DAG.getBitcast(VecVT.getHalfNumVectorElementsVT(), V);
47792 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
47793 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V),
47794 DAG.getConstant(CmpMask, DL, MVT::i32));
47795 }
47796 }
47797
47798 // MOVMSK(PCMPEQ(X,0)) == -1 -> PTESTZ(X,X).
47799 // MOVMSK(PCMPEQ(X,0)) != -1 -> !PTESTZ(X,X).
47800 // MOVMSK(PCMPEQ(X,Y)) == -1 -> PTESTZ(XOR(X,Y),XOR(X,Y)).
47801 // MOVMSK(PCMPEQ(X,Y)) != -1 -> !PTESTZ(XOR(X,Y),XOR(X,Y)).
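 // (Illustrative reasoning: every lane of PCMPEQ(X,Y) is all-ones iff X == Y
 // elementwise, iff XOR(X,Y) is all-zero, which is precisely the ZF condition
 // PTEST reports when both of its operands are that XOR.)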
47802 if (IsAllOf && Subtarget.hasSSE41() && IsOneUse) {
47803 MVT TestVT = VecVT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
47804 SDValue BC = peekThroughBitcasts(Vec);
47805 // Ensure MOVMSK was testing every signbit of BC.
47806 if (BC.getValueType().getVectorNumElements() <= NumElts) {
47807 if (BC.getOpcode() == X86ISD::PCMPEQ) {
47808 SDValue V = DAG.getNode(ISD::XOR, SDLoc(BC), BC.getValueType(),
47809 BC.getOperand(0), BC.getOperand(1));
47810 V = DAG.getBitcast(TestVT, V);
47811 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
47812 }
47813 // Check for 256-bit split vector cases.
47814 if (BC.getOpcode() == ISD::AND &&
47815 BC.getOperand(0).getOpcode() == X86ISD::PCMPEQ &&
47816 BC.getOperand(1).getOpcode() == X86ISD::PCMPEQ) {
47817 SDValue LHS = BC.getOperand(0);
47818 SDValue RHS = BC.getOperand(1);
47819 LHS = DAG.getNode(ISD::XOR, SDLoc(LHS), LHS.getValueType(),
47820 LHS.getOperand(0), LHS.getOperand(1));
47821 RHS = DAG.getNode(ISD::XOR, SDLoc(RHS), RHS.getValueType(),
47822 RHS.getOperand(0), RHS.getOperand(1));
47823 LHS = DAG.getBitcast(TestVT, LHS);
47824 RHS = DAG.getBitcast(TestVT, RHS);
47825 SDValue V = DAG.getNode(ISD::OR, SDLoc(EFLAGS), TestVT, LHS, RHS);
47826 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
47827 }
47828 }
47829 }
47830
47831 // See if we can avoid a PACKSS by calling MOVMSK on the sources.
47832 // For vXi16 cases we can use a v2Xi8 PMOVMSKB. We must mask out
47833 // sign bits prior to the comparison with zero unless we know that
47834 // the vXi16 splats the sign bit down to the lower i8 half.
47835 // TODO: Handle all_of patterns.
47836 if (Vec.getOpcode() == X86ISD::PACKSS && VecVT == MVT::v16i8) {
47837 SDValue VecOp0 = Vec.getOperand(0);
47838 SDValue VecOp1 = Vec.getOperand(1);
47839 bool SignExt0 = DAG.ComputeNumSignBits(VecOp0) > 8;
47840 bool SignExt1 = DAG.ComputeNumSignBits(VecOp1) > 8;
47841 // PMOVMSKB(PACKSSBW(X, undef)) -> PMOVMSKB(BITCAST_v16i8(X)) & 0xAAAA.
47842 if (IsAnyOf && CmpBits == 8 && VecOp1.isUndef()) {
47843 SDLoc DL(EFLAGS);
47844 SDValue Result = DAG.getBitcast(MVT::v16i8, VecOp0);
47845 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
47846 Result = DAG.getZExtOrTrunc(Result, DL, MVT::i16);
47847 if (!SignExt0) {
47848 Result = DAG.getNode(ISD::AND, DL, MVT::i16, Result,
47849 DAG.getConstant(0xAAAA, DL, MVT::i16));
47850 }
47851 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
47852 DAG.getConstant(0, DL, MVT::i16));
47853 }
47854 // PMOVMSKB(PACKSSBW(LO(X), HI(X)))
47855 // -> PMOVMSKB(BITCAST_v32i8(X)) & 0xAAAAAAAA.
47856 if (CmpBits >= 16 && Subtarget.hasInt256() &&
47857 (IsAnyOf || (SignExt0 && SignExt1))) {
47858 if (SDValue Src = getSplitVectorSrc(VecOp0, VecOp1, true)) {
47859 SDLoc DL(EFLAGS);
47860 SDValue Result = peekThroughBitcasts(Src);
47861 if (IsAllOf && Result.getOpcode() == X86ISD::PCMPEQ &&
47862 Result.getValueType().getVectorNumElements() <= NumElts) {
47863 SDValue V = DAG.getNode(ISD::XOR, DL, Result.getValueType(),
47864 Result.getOperand(0), Result.getOperand(1));
47865 V = DAG.getBitcast(MVT::v4i64, V);
47866 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
47867 }
47868 Result = DAG.getBitcast(MVT::v32i8, Result);
47869 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
47870 unsigned CmpMask = IsAnyOf ? 0 : 0xFFFFFFFF;
47871 if (!SignExt0 || !SignExt1) {
47872 assert(IsAnyOf &&
47873        "Only perform v16i16 signmasks for any_of patterns");
47874 Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result,
47875 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
47876 }
47877 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
47878 DAG.getConstant(CmpMask, DL, MVT::i32));
47879 }
47880 }
47881 }
47882
47883 // MOVMSK(SHUFFLE(X,u)) -> MOVMSK(X) iff every element is referenced.
47884 SmallVector<int, 32> ShuffleMask;
47885 SmallVector<SDValue, 2> ShuffleInputs;
47886 if (NumElts <= CmpBits &&
47887 getTargetShuffleInputs(peekThroughBitcasts(Vec), ShuffleInputs,
47888 ShuffleMask, DAG) &&
47889 ShuffleInputs.size() == 1 && !isAnyZeroOrUndef(ShuffleMask) &&
47890 ShuffleInputs[0].getValueSizeInBits() == VecVT.getSizeInBits()) {
47891 unsigned NumShuffleElts = ShuffleMask.size();
47892 APInt DemandedElts = APInt::getZero(NumShuffleElts);
47893 for (int M : ShuffleMask) {
47894 assert(0 <= M && M < (int)NumShuffleElts && "Bad unary shuffle index");
47895 DemandedElts.setBit(M);
47896 }
47897 if (DemandedElts.isAllOnes()) {
47898 SDLoc DL(EFLAGS);
47899 SDValue Result = DAG.getBitcast(VecVT, ShuffleInputs[0]);
47900 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
47901 Result =
47902 DAG.getZExtOrTrunc(Result, DL, EFLAGS.getOperand(0).getValueType());
47903 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
47904 EFLAGS.getOperand(1));
47905 }
47906 }
47907
47908 // MOVMSKPS(V) !=/== 0 -> TESTPS(V,V)
47909 // MOVMSKPD(V) !=/== 0 -> TESTPD(V,V)
47910 // MOVMSKPS(V) !=/== -1 -> TESTPS(V,V)
47911 // MOVMSKPD(V) !=/== -1 -> TESTPD(V,V)
47912 // iff every element is referenced.
47913 if (NumElts <= CmpBits && Subtarget.hasAVX() && IsOneUse &&
47914 (NumEltBits == 32 || NumEltBits == 64)) {
47915 SDLoc DL(EFLAGS);
47916 MVT FloatSVT = MVT::getFloatingPointVT(NumEltBits);
47917 MVT FloatVT = MVT::getVectorVT(FloatSVT, NumElts);
47918 MVT IntVT = FloatVT.changeVectorElementTypeToInteger();
47919 SDValue LHS = Vec;
47920 SDValue RHS = IsAnyOf ? Vec : DAG.getAllOnesConstant(DL, IntVT);
47921 CC = IsAnyOf ? CC : (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
47922 return DAG.getNode(X86ISD::TESTP, DL, MVT::i32,
47923 DAG.getBitcast(FloatVT, LHS),
47924 DAG.getBitcast(FloatVT, RHS));
47925 }
47926
47927 return SDValue();
47928}
47929
47930/// Optimize an EFLAGS definition used according to the condition code \p CC
47931/// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
47932/// uses of chain values.
47933static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
47934 SelectionDAG &DAG,
47935 const X86Subtarget &Subtarget) {
47936 if (CC == X86::COND_B)
47937 if (SDValue Flags = combineCarryThroughADD(EFLAGS, DAG))
47938 return Flags;
47939
47940 if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
47941 return R;
47942
47943 if (SDValue R = combinePTESTCC(EFLAGS, CC, DAG, Subtarget))
47944 return R;
47945
47946 if (SDValue R = combineSetCCMOVMSK(EFLAGS, CC, DAG, Subtarget))
47947 return R;
47948
47949 return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
47950}
47951
47952/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
47953static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
47954 TargetLowering::DAGCombinerInfo &DCI,
47955 const X86Subtarget &Subtarget) {
47956 SDLoc DL(N);
47957
47958 SDValue FalseOp = N->getOperand(0);
47959 SDValue TrueOp = N->getOperand(1);
47960 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
47961 SDValue Cond = N->getOperand(3);
47962
47963 // cmov X, X, ?, ? --> X
47964 if (TrueOp == FalseOp)
47965 return TrueOp;
47966
47967 // Try to simplify the EFLAGS and condition code operands.
47968 // We can't always do this as FCMOV only supports a subset of X86 cond.
47969 if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
47970 if (!(FalseOp.getValueType() == MVT::f80 ||
47971 (FalseOp.getValueType() == MVT::f64 && !Subtarget.hasSSE2()) ||
47972 (FalseOp.getValueType() == MVT::f32 && !Subtarget.hasSSE1())) ||
47973 !Subtarget.canUseCMOV() || hasFPCMov(CC)) {
47974 SDValue Ops[] = {FalseOp, TrueOp, DAG.getTargetConstant(CC, DL, MVT::i8),
47975 Flags};
47976 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
47977 }
47978 }
47979
47980 // If this is a select between two integer constants, try to do some
47981 // optimizations. Note that the operands are ordered the opposite of SELECT
47982 // operands.
47983 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
47984 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
47985 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
47986 // larger than FalseC (the false value).
47987 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
47988 CC = X86::GetOppositeBranchCondition(CC);
47989 std::swap(TrueC, FalseC);
47990 std::swap(TrueOp, FalseOp);
47991 }
47992
47993 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
47994 // This is efficient for any integer data type (including i8/i16) and
47995 // shift amount.
47996 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
47997 Cond = getSETCC(CC, Cond, DL, DAG);
47998
47999 // Zero extend the condition if needed.
48000 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
48001
48002 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
48003 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
48004 DAG.getConstant(ShAmt, DL, MVT::i8));
48005 return Cond;
48006 }
48007
48008 // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst. This is efficient
48009 // for any integer data type, including i8/i16.
48010 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
48011 Cond = getSETCC(CC, Cond, DL, DAG);
48012
48013 // Zero extend the condition if needed.
48014 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
48015 FalseC->getValueType(0), Cond);
48016 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
48017 SDValue(FalseC, 0));
48018 return Cond;
48019 }
48020
48021 // Optimize cases that will turn into an LEA instruction. This requires
48022 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
48023 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
48024 APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
48025 assert(Diff.getBitWidth() == N->getValueType(0).getSizeInBits() &&
48026        "Implicit constant truncation");
48027
48028 bool isFastMultiplier = false;
48029 if (Diff.ult(10)) {
48030 switch (Diff.getZExtValue()) {
48031 default: break;
48032 case 1: // result = add base, cond
48033 case 2: // result = lea base( , cond*2)
48034 case 3: // result = lea base(cond, cond*2)
48035 case 4: // result = lea base( , cond*4)
48036 case 5: // result = lea base(cond, cond*4)
48037 case 8: // result = lea base( , cond*8)
48038 case 9: // result = lea base(cond, cond*8)
48039 isFastMultiplier = true;
48040 break;
48041 }
48042 }
48043
48044 if (isFastMultiplier) {
48045 Cond = getSETCC(CC, Cond, DL, DAG);
48046 // Zero extend the condition if needed.
48047 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
48048 Cond);
48049 // Scale the condition by the difference.
48050 if (Diff != 1)
48051 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
48052 DAG.getConstant(Diff, DL, Cond.getValueType()));
48053
48054 // Add the base if non-zero.
48055 if (FalseC->getAPIntValue() != 0)
48056 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
48057 SDValue(FalseC, 0));
48058 return Cond;
48059 }
48060 }
48061 }
48062 }
48063
48064 // Handle these cases:
48065 // (select (x != c), e, c) -> (select (x != c), e, x),
48066 // (select (x == c), c, e) -> (select (x == c), x, e)
48067 // where c is an integer constant, and the "select" is the combination
48068 // of CMOV and CMP.
48069 //
48070 // The rationale for this change is that the conditional-move from a constant
48071 // needs two instructions, however, conditional-move from a register needs
48072 // only one instruction.
48073 //
48074 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
48075 // some instruction-combining opportunities. This opt needs to be
48076 // postponed as late as possible.
48077 //
48078 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
48079 // the DCI.xxxx conditions are provided to postpone the optimization as
48080 // late as possible.
48081
48082 ConstantSDNode *CmpAgainst = nullptr;
48083 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
48084 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
48085 !isa<ConstantSDNode>(Cond.getOperand(0))) {
48086
48087 if (CC == X86::COND_NE &&
48088 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
48089 CC = X86::GetOppositeBranchCondition(CC);
48090 std::swap(TrueOp, FalseOp);
48091 }
48092
48093 if (CC == X86::COND_E &&
48094 CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
48095 SDValue Ops[] = {FalseOp, Cond.getOperand(0),
48096 DAG.getTargetConstant(CC, DL, MVT::i8), Cond};
48097 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
48098 }
48099 }
48100 }
48101
48102 // Transform:
48103 //
48104 // (cmov 1 T (uge T 2))
48105 //
48106 // to:
48107 //
48108 // (adc T 0 (sub T 1))
48109 if (CC == X86::COND_AE && isOneConstant(FalseOp) &&
48110 Cond.getOpcode() == X86ISD::SUB && Cond->hasOneUse()) {
48111 SDValue Cond0 = Cond.getOperand(0);
48112 if (Cond0.getOpcode() == ISD::TRUNCATE)
48113 Cond0 = Cond0.getOperand(0);
48114 auto *Sub1C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
48115 if (Cond0 == TrueOp && Sub1C && Sub1C->getZExtValue() == 2) {
48116 EVT CondVT = Cond->getValueType(0);
48117 EVT OuterVT = N->getValueType(0);
48118 // Subtract 1 and generate a carry.
48119 SDValue NewSub =
48120 DAG.getNode(X86ISD::SUB, DL, Cond->getVTList(), Cond.getOperand(0),
48121 DAG.getConstant(1, DL, CondVT));
48122 SDValue EFLAGS(NewSub.getNode(), 1);
48123 return DAG.getNode(X86ISD::ADC, DL, DAG.getVTList(OuterVT, MVT::i32),
48124 TrueOp, DAG.getConstant(0, DL, OuterVT), EFLAGS);
48125 }
48126 }
48127
48128 // Fold and/or of setcc's to double CMOV:
48129 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
48130 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
48131 //
48132 // This combine lets us generate:
48133 // cmovcc1 (jcc1 if we don't have CMOV)
48134 // cmovcc2 (same)
48135 // instead of:
48136 // setcc1
48137 // setcc2
48138 // and/or
48139 // cmovne (jne if we don't have CMOV)
48140 // When we can't use the CMOV instruction, it might increase branch
48141 // mispredicts.
48142 // When we can use CMOV, or when there is no mispredict, this improves
48143 // throughput and reduces register pressure.
48144 //
48145 if (CC == X86::COND_NE) {
48146 SDValue Flags;
48147 X86::CondCode CC0, CC1;
48148 bool isAndSetCC;
48149 if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
48150 if (isAndSetCC) {
48151 std::swap(FalseOp, TrueOp);
48152 CC0 = X86::GetOppositeBranchCondition(CC0);
48153 CC1 = X86::GetOppositeBranchCondition(CC1);
48154 }
48155
48156 SDValue LOps[] = {FalseOp, TrueOp,
48157 DAG.getTargetConstant(CC0, DL, MVT::i8), Flags};
48158 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps);
48159 SDValue Ops[] = {LCMOV, TrueOp, DAG.getTargetConstant(CC1, DL, MVT::i8),
48160 Flags};
48161 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
48162 return CMOV;
48163 }
48164 }
48165
48166 // Fold (CMOV C1, (ADD (CTTZ X), C2), (X != 0)) ->
48167 // (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2)
48168 // Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) ->
48169 // (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2)
48170 if ((CC == X86::COND_NE || CC == X86::COND_E) &&
48171 Cond.getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1))) {
48172 SDValue Add = TrueOp;
48173 SDValue Const = FalseOp;
48174 // Canonicalize the condition code for easier matching and output.
48175 if (CC == X86::COND_E)
48176 std::swap(Add, Const);
48177
48178 // We might have replaced the constant in the cmov with the LHS of the
48179 // compare. If so change it to the RHS of the compare.
48180 if (Const == Cond.getOperand(0))
48181 Const = Cond.getOperand(1);
48182
48183 // Ok, now make sure that Add is (add (cttz X), C2) and Const is a constant.
48184 if (isa<ConstantSDNode>(Const) && Add.getOpcode() == ISD::ADD &&
48185 Add.hasOneUse() && isa<ConstantSDNode>(Add.getOperand(1)) &&
48186 (Add.getOperand(0).getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
48187 Add.getOperand(0).getOpcode() == ISD::CTTZ) &&
48188 Add.getOperand(0).getOperand(0) == Cond.getOperand(0)) {
48189 EVT VT = N->getValueType(0);
48190 // This should constant fold.
48191 SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1));
48192 SDValue CMov =
48193 DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0),
48194 DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8), Cond);
48195 return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1));
48196 }
48197 }
48198
48199 return SDValue();
48200}
48201
48202/// Different mul shrinking modes.
48203enum class ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
48204
48205static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
48206 EVT VT = N->getOperand(0).getValueType();
48207 if (VT.getScalarSizeInBits() != 32)
48208 return false;
48209
48210 assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
48211 unsigned SignBits[2] = {1, 1};
48212 bool IsPositive[2] = {false, false};
48213 for (unsigned i = 0; i < 2; i++) {
48214 SDValue Opd = N->getOperand(i);
48215
48216 SignBits[i] = DAG.ComputeNumSignBits(Opd);
48217 IsPositive[i] = DAG.SignBitIsZero(Opd);
48218 }
48219
48220 bool AllPositive = IsPositive[0] && IsPositive[1];
48221 unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
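 // (Illustrative note on the thresholds below: an i32 value in [-128, 127]
 // has at least 25 identical leading sign bits and one in [-32768, 32767]
 // has at least 17; the unsigned ranges [0, 255] and [0, 65535] give at
 // least 24 and 16 respectively, plus the requirement that the sign bit is
 // known zero.)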
48222 // When ranges are from -128 ~ 127, use MULS8 mode.
48223 if (MinSignBits >= 25)
48224 Mode = ShrinkMode::MULS8;
48225 // When ranges are from 0 ~ 255, use MULU8 mode.
48226 else if (AllPositive && MinSignBits >= 24)
48227 Mode = ShrinkMode::MULU8;
48228 // When ranges are from -32768 ~ 32767, use MULS16 mode.
48229 else if (MinSignBits >= 17)
48230 Mode = ShrinkMode::MULS16;
48231 // When ranges are from 0 ~ 65535, use MULU16 mode.
48232 else if (AllPositive && MinSignBits >= 16)
48233 Mode = ShrinkMode::MULU16;
48234 else
48235 return false;
48236 return true;
48237}
48238
48239/// When the operands of vector mul are extended from smaller size values,
48240 /// like i8 and i16, the type of mul may be shrunk to generate more
48241/// efficient code. Two typical patterns are handled:
48242/// Pattern1:
48243/// %2 = sext/zext <N x i8> %1 to <N x i32>
48244/// %4 = sext/zext <N x i8> %3 to <N x i32>
48245 /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
48246/// %5 = mul <N x i32> %2, %4
48247///
48248/// Pattern2:
48249/// %2 = zext/sext <N x i16> %1 to <N x i32>
48250/// %4 = zext/sext <N x i16> %3 to <N x i32>
48251/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
48252/// %5 = mul <N x i32> %2, %4
48253///
48254/// There are four mul shrinking modes:
48255/// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
48256 /// -128 to 127, and the scalar value range of %4 is also -128 to 127,
48257/// generate pmullw+sext32 for it (MULS8 mode).
48258/// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
48259/// 0 to 255, and the scalar value range of %4 is also 0 to 255,
48260/// generate pmullw+zext32 for it (MULU8 mode).
48261/// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
48262/// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
48263/// generate pmullw+pmulhw for it (MULS16 mode).
48264/// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
48265/// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
48266/// generate pmullw+pmulhuw for it (MULU16 mode).
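/// For example (illustrative), in MULU16 mode each i32 product is rebuilt as
///   lo16 = pmullw(a, b), hi16 = pmulhuw(a, b),
///   result[i] = zext(lo16[i]) | (zext(hi16[i]) << 16),
/// and the punpcklwd/punpckhwd-style shuffles emitted below interleave the
/// low and high halves back into full i32 lanes.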
48267static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
48268 const X86Subtarget &Subtarget) {
48269 // Check for legality
48270 // pmullw/pmulhw are not available before SSE2.
48271 if (!Subtarget.hasSSE2())
48272 return SDValue();
48273
48274 // Check for profitability
48275 // pmulld is supported since SSE41. It is better to use pmulld
48276 // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
48277 // the expansion.
48278 bool OptForMinSize = DAG.getMachineFunction().getFunction().hasMinSize();
48279 if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
48280 return SDValue();
48281
48282 ShrinkMode Mode;
48283 if (!canReduceVMulWidth(N, DAG, Mode))
48284 return SDValue();
48285
48286 SDLoc DL(N);
48287 SDValue N0 = N->getOperand(0);
48288 SDValue N1 = N->getOperand(1);
48289 EVT VT = N->getOperand(0).getValueType();
48290 unsigned NumElts = VT.getVectorNumElements();
48291 if ((NumElts % 2) != 0)
48292 return SDValue();
48293
48294 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
48295
48296 // Shrink the operands of mul.
48297 SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
48298 SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
48299
48300 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
48301 // lower part is needed.
48302 SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
48303 if (Mode == ShrinkMode::MULU8 || Mode == ShrinkMode::MULS8)
48304 return DAG.getNode((Mode == ShrinkMode::MULU8) ? ISD::ZERO_EXTEND
48305 : ISD::SIGN_EXTEND,
48306 DL, VT, MulLo);
48307
48308 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts / 2);
48309 // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
48310 // the higher part is also needed.
48311 SDValue MulHi =
48312 DAG.getNode(Mode == ShrinkMode::MULS16 ? ISD::MULHS : ISD::MULHU, DL,
48313 ReducedVT, NewN0, NewN1);
48314
48315 // Repack the lower part and higher part result of mul into a wider
48316 // result.
48317 // Generate shuffle functioning as punpcklwd.
48318 SmallVector<int, 16> ShuffleMask(NumElts);
48319 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
48320 ShuffleMask[2 * i] = i;
48321 ShuffleMask[2 * i + 1] = i + NumElts;
48322 }
48323 SDValue ResLo =
48324 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
48325 ResLo = DAG.getBitcast(ResVT, ResLo);
48326 // Generate shuffle functioning as punpckhwd.
48327 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
48328 ShuffleMask[2 * i] = i + NumElts / 2;
48329 ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
48330 }
48331 SDValue ResHi =
48332 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
48333 ResHi = DAG.getBitcast(ResVT, ResHi);
48334 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
48335}
48336
48337static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
48338 EVT VT, const SDLoc &DL) {
48339
48340 auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
48341 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
48342 DAG.getConstant(Mult, DL, VT));
48343 Result = DAG.getNode(ISD::SHL, DL, VT, Result,
48344 DAG.getConstant(Shift, DL, MVT::i8));
48345 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
48346 N->getOperand(0));
48347 return Result;
48348 };
48349
48350 auto combineMulMulAddOrSub = [&](int Mul1, int Mul2, bool isAdd) {
48351 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
48352 DAG.getConstant(Mul1, DL, VT));
48353 Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, Result,
48354 DAG.getConstant(Mul2, DL, VT));
48355 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
48356 N->getOperand(0));
48357 return Result;
48358 };
48359
48360 switch (MulAmt) {
48361 default:
48362 break;
48363 case 11:
48364 // mul x, 11 => add ((shl (mul x, 5), 1), x)
48365 return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
48366 case 21:
48367 // mul x, 21 => add ((shl (mul x, 5), 2), x)
48368 return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
48369 case 41:
48370 // mul x, 41 => add ((shl (mul x, 5), 3), x)
48371 return combineMulShlAddOrSub(5, 3, /*isAdd*/ true);
48372 case 22:
48373 // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
48374 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
48375 combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
48376 case 19:
48377 // mul x, 19 => add ((shl (mul x, 9), 1), x)
48378 return combineMulShlAddOrSub(9, 1, /*isAdd*/ true);
48379 case 37:
48380 // mul x, 37 => add ((shl (mul x, 9), 2), x)
48381 return combineMulShlAddOrSub(9, 2, /*isAdd*/ true);
48382 case 73:
48383 // mul x, 73 => add ((shl (mul x, 9), 3), x)
48384 return combineMulShlAddOrSub(9, 3, /*isAdd*/ true);
48385 case 13:
48386 // mul x, 13 => add ((shl (mul x, 3), 2), x)
48387 return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
48388 case 23:
48389 // mul x, 23 => sub ((shl (mul x, 3), 3), x)
48390 return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
48391 case 26:
48392 // mul x, 26 => add ((mul (mul x, 5), 5), x)
48393 return combineMulMulAddOrSub(5, 5, /*isAdd*/ true);
48394 case 28:
48395 // mul x, 28 => add ((mul (mul x, 9), 3), x)
48396 return combineMulMulAddOrSub(9, 3, /*isAdd*/ true);
48397 case 29:
48398 // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
48399 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
48400 combineMulMulAddOrSub(9, 3, /*isAdd*/ true));
48401 }
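 // For example (illustrative): the "mul x, 11" case above becomes
 //   t = x * 5      ; lea t, [x + x*4]
 //   t = t << 1     ; t = x * 10
 //   t = t + x      ; t = x * 11
 // i.e. an LEA, a shift and an add instead of an imul.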
48402
48403 // Another trick. If this is a power of 2 + 2/4/8, we can use a shift
48404 // followed by a single LEA.
48405 // First check if this is a sum of two powers of 2 because that's easy. Then
48406 // count how many trailing zeros there are before the first set bit.
48407 // TODO: We can do this even without LEA at a cost of two shifts and an add.
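 // For example (illustrative): MulAmt == 20 (16 + 4) gives ScaleShift == 2
 // and ShiftAmt == 4, so x*20 is formed as (x << 4) + (x << 2), where the
 // second term can be folded into an LEA's scaled-index addressing.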
48408 if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
48409 unsigned ScaleShift = llvm::countr_zero(MulAmt);
48410 if (ScaleShift >= 1 && ScaleShift < 4) {
48411 unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
48412 SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
48413 DAG.getConstant(ShiftAmt, DL, MVT::i8));
48414 SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
48415 DAG.getConstant(ScaleShift, DL, MVT::i8));
48416 return DAG.getNode(ISD::ADD, DL, VT, Shift1, Shift2);
48417 }
48418 }
48419
48420 return SDValue();
48421}
48422
48423 // If the upper 17 bits of either operand are zero and the upper bits of the
48424 // other operand are all zero/sign bits, then we can use PMADDWD, which is
48425 // always at least as quick as PMULLD, except on KNL.
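// (Illustrative reasoning: PMADDWD treats each i32 lane as two i16s and
// computes lo0*lo1 + hi0*hi1. If one operand has its upper 17 bits clear,
// its hi half is zero (so the hi0*hi1 term vanishes) and its lo half is
// non-negative as an i16, so the signed 16x16->32 product lo0*lo1 equals the
// original i32 multiply whenever the other operand's significant bits also
// fit in its low i16.)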
48426static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
48427 const X86Subtarget &Subtarget) {
48428 if (!Subtarget.hasSSE2())
48429 return SDValue();
48430
48431 if (Subtarget.isPMADDWDSlow())
48432 return SDValue();
48433
48434 EVT VT = N->getValueType(0);
48435
48436 // Only support vXi32 vectors.
48437 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
48438 return SDValue();
48439
48440 // Make sure the type is legal or can split/widen to a legal type.
48441 // With AVX512 but without BWI, we would need to split v32i16.
48442 unsigned NumElts = VT.getVectorNumElements();
48443 if (NumElts == 1 || !isPowerOf2_32(NumElts))
48444 return SDValue();
48445
48446 // With AVX512 but without BWI, we would need to split v32i16.
48447 if (32 <= (2 * NumElts) && Subtarget.hasAVX512() && !Subtarget.hasBWI())
48448 return SDValue();
48449
48450 SDValue N0 = N->getOperand(0);
48451 SDValue N1 = N->getOperand(1);
48452
48453 // If we are zero/sign extending two steps without SSE4.1, it's better to
48454 // reduce the vmul width instead.
48455 if (!Subtarget.hasSSE41() &&
48456 (((N0.getOpcode() == ISD::ZERO_EXTEND &&
48457 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
48458 (N1.getOpcode() == ISD::ZERO_EXTEND &&
48459 N1.getOperand(0).getScalarValueSizeInBits() <= 8)) ||
48460 ((N0.getOpcode() == ISD::SIGN_EXTEND &&
48461 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
48462 (N1.getOpcode() == ISD::SIGN_EXTEND &&
48463 N1.getOperand(0).getScalarValueSizeInBits() <= 8))))
48464 return SDValue();
48465
48466 // If we are sign extending a wide vector without SSE4.1, it's better to reduce
48467 // the vmul width instead.
48468 if (!Subtarget.hasSSE41() &&
48469 (N0.getOpcode() == ISD::SIGN_EXTEND &&
48470 N0.getOperand(0).getValueSizeInBits() > 128) &&
48471 (N1.getOpcode() == ISD::SIGN_EXTEND &&
48472 N1.getOperand(0).getValueSizeInBits() > 128))
48473 return SDValue();
48474
48475 // Sign bits must extend down to the lowest i16.
48476 if (DAG.ComputeMaxSignificantBits(N1) > 16 ||
48477 DAG.ComputeMaxSignificantBits(N0) > 16)
48478 return SDValue();
48479
48480 // At least one of the elements must be zero in the upper 17 bits, or can be
48481 // safely made zero without altering the final result.
48482 auto GetZeroableOp = [&](SDValue Op) {
48483 APInt Mask17 = APInt::getHighBitsSet(32, 17);
48484 if (DAG.MaskedValueIsZero(Op, Mask17))
48485 return Op;
48486 // Mask off upper 16-bits of sign-extended constants.
48487 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode()))
48488 return DAG.getNode(ISD::AND, SDLoc(N), VT, Op,
48489 DAG.getConstant(0xFFFF, SDLoc(N), VT));
48490 if (Op.getOpcode() == ISD::SIGN_EXTEND && N->isOnlyUserOf(Op.getNode())) {
48491 SDValue Src = Op.getOperand(0);
48492 // Convert sext(vXi16) to zext(vXi16).
48493 if (Src.getScalarValueSizeInBits() == 16 && VT.getSizeInBits() <= 128)
48494 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, Src);
48495 // Convert sext(vXi8) to zext(vXi16 sext(vXi8)) on pre-SSE41 targets
48496 // which will expand the extension.
48497 if (Src.getScalarValueSizeInBits() < 16 && !Subtarget.hasSSE41()) {
48498 EVT ExtVT = VT.changeVectorElementType(MVT::i16);
48499 Src = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), ExtVT, Src);
48500 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, Src);
48501 }
48502 }
48503 // Convert SIGN_EXTEND_VECTOR_INREG to ZERO_EXTEND_VECTOR_INREG.
48504 if (Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
48505 N->isOnlyUserOf(Op.getNode())) {
48506 SDValue Src = Op.getOperand(0);
48507 if (Src.getScalarValueSizeInBits() == 16)
48508 return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(N), VT, Src);
48509 }
48510 // Convert VSRAI(Op, 16) to VSRLI(Op, 16).
48511 if (Op.getOpcode() == X86ISD::VSRAI && Op.getConstantOperandVal(1) == 16 &&
48512 N->isOnlyUserOf(Op.getNode())) {
48513 return DAG.getNode(X86ISD::VSRLI, SDLoc(N), VT, Op.getOperand(0),
48514 Op.getOperand(1));
48515 }
48516 return SDValue();
48517 };
48518 SDValue ZeroN0 = GetZeroableOp(N0);
48519 SDValue ZeroN1 = GetZeroableOp(N1);
48520 if (!ZeroN0 && !ZeroN1)
48521 return SDValue();
48522 N0 = ZeroN0 ? ZeroN0 : N0;
48523 N1 = ZeroN1 ? ZeroN1 : N1;
48524
48525 // Use SplitOpsAndApply to handle AVX splitting.
48526 auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
48527 ArrayRef<SDValue> Ops) {
48528 MVT ResVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
48529 MVT OpVT = MVT::getVectorVT(MVT::i16, Ops[0].getValueSizeInBits() / 16);
48530 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT,
48531 DAG.getBitcast(OpVT, Ops[0]),
48532 DAG.getBitcast(OpVT, Ops[1]));
48533 };
48534 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {N0, N1},
48535 PMADDWDBuilder);
48536}
48537
48538static SDValue combineMulToPMULDQ(SDNode *N, SelectionDAG &DAG,
48539 const X86Subtarget &Subtarget) {
48540 if (!Subtarget.hasSSE2())
48541 return SDValue();
48542
48543 EVT VT = N->getValueType(0);
48544
48545 // Only support vXi64 vectors.
48546 if (!VT.isVector() || VT.getVectorElementType() != MVT::i64 ||
48547 VT.getVectorNumElements() < 2 ||
48548 !isPowerOf2_32(VT.getVectorNumElements()))
48549 return SDValue();
48550
48551 SDValue N0 = N->getOperand(0);
48552 SDValue N1 = N->getOperand(1);
48553
48554 // PMULDQ returns the 64-bit result of the signed multiplication of the lower
48555 // 32-bits. We can lower with this if the sign bits stretch that far.
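 // (Illustrative reasoning: more than 32 sign bits means each i64 operand is
 // the sign-extension of its low 32 bits, so the full 64-bit product is
 // determined by those low halves, which is exactly the signed 32x32->64
 // multiply PMULDQ performs.)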
48556 if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 &&
48557 DAG.ComputeNumSignBits(N1) > 32) {
48558 auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
48559 ArrayRef<SDValue> Ops) {
48560 return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops);
48561 };
48562 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
48563 PMULDQBuilder, /*CheckBWI*/false);
48564 }
48565
48566 // If the upper bits are zero we can use a single pmuludq.
48567 APInt Mask = APInt::getHighBitsSet(64, 32);
48568 if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) {
48569 auto PMULUDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
48570 ArrayRef<SDValue> Ops) {
48571 return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops);
48572 };
48573 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
48574 PMULUDQBuilder, /*CheckBWI*/false);
48575 }
48576
48577 return SDValue();
48578}
48579
48580static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
48581 TargetLowering::DAGCombinerInfo &DCI,
48582 const X86Subtarget &Subtarget) {
48583 EVT VT = N->getValueType(0);
48584
48585 if (SDValue V = combineMulToPMADDWD(N, DAG, Subtarget))
48586 return V;
48587
48588 if (SDValue V = combineMulToPMULDQ(N, DAG, Subtarget))
48589 return V;
48590
48591 if (DCI.isBeforeLegalize() && VT.isVector())
48592 return reduceVMULWidth(N, DAG, Subtarget);
48593
48594 // Optimize a single multiply with constant into two operations in order to
48595 // implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
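  // For instance, following the decomposition below, a multiply by 45 can be
  // split as 9 * 5 (two LEAs), and a multiply by 40 as 5 * 8 (an LEA plus a
  // SHL by 3).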
48596 if (!MulConstantOptimization)
48597 return SDValue();
48598
48599 // An imul is usually smaller than the alternative sequence.
48600 if (DAG.getMachineFunction().getFunction().hasMinSize())
48601 return SDValue();
48602
48603 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
48604 return SDValue();
48605
48606 if (VT != MVT::i64 && VT != MVT::i32)
48607 return SDValue();
48608
48609 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
48610 if (!C)
48611 return SDValue();
48612 if (isPowerOf2_64(C->getZExtValue()))
48613 return SDValue();
48614
48615 int64_t SignMulAmt = C->getSExtValue();
48616 assert(SignMulAmt != INT64_MIN && "Int min should have been handled!");
48617 uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt;
48618
48619 SDLoc DL(N);
48620 if (AbsMulAmt == 3 || AbsMulAmt == 5 || AbsMulAmt == 9) {
48621 SDValue NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
48622 DAG.getConstant(AbsMulAmt, DL, VT));
48623 if (SignMulAmt < 0)
48624 NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
48625 NewMul);
48626
48627 return NewMul;
48628 }
48629
48630 uint64_t MulAmt1 = 0;
48631 uint64_t MulAmt2 = 0;
48632 if ((AbsMulAmt % 9) == 0) {
48633 MulAmt1 = 9;
48634 MulAmt2 = AbsMulAmt / 9;
48635 } else if ((AbsMulAmt % 5) == 0) {
48636 MulAmt1 = 5;
48637 MulAmt2 = AbsMulAmt / 5;
48638 } else if ((AbsMulAmt % 3) == 0) {
48639 MulAmt1 = 3;
48640 MulAmt2 = AbsMulAmt / 3;
48641 }
48642
48643 SDValue NewMul;
48644 // For negative multiply amounts, only allow MulAmt2 to be a power of 2.
48645 if (MulAmt2 &&
48646 (isPowerOf2_64(MulAmt2) ||
48647 (SignMulAmt >= 0 && (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)))) {
48648
48649 if (isPowerOf2_64(MulAmt2) &&
48650 !(SignMulAmt >= 0 && N->hasOneUse() &&
48651 N->use_begin()->getOpcode() == ISD::ADD))
48652 // If the second multiplier is pow2, issue it first. We want the multiply by
48653 // 3, 5, or 9 to be folded into the addressing mode unless the lone use
48654 // is an add. Only do this for positive multiply amounts since the
48655 // negate would prevent it from being used as an address mode anyway.
48656 std::swap(MulAmt1, MulAmt2);
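      // e.g. AbsMulAmt == 24 gives MulAmt1 = 3, MulAmt2 = 8; after the swap
      // the SHL by 3 is emitted first and the remaining multiply by 3 can
      // become a single LEA.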
48657
48658 if (isPowerOf2_64(MulAmt1))
48659 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
48660 DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
48661 else
48662 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
48663 DAG.getConstant(MulAmt1, DL, VT));
48664
48665 if (isPowerOf2_64(MulAmt2))
48666 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
48667 DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
48668 else
48669 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
48670 DAG.getConstant(MulAmt2, DL, VT));
48671
48672 // Negate the result.
48673 if (SignMulAmt < 0)
48674 NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
48675 NewMul);
48676 } else if (!Subtarget.slowLEA())
48677 NewMul = combineMulSpecial(C->getZExtValue(), N, DAG, VT, DL);
48678
48679 if (!NewMul) {
48680 assert(C->getZExtValue() != 0 &&
48681        C->getZExtValue() != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) &&
48682        "Both cases that could cause potential overflows should have "
48683        "already been handled.");
48684 if (isPowerOf2_64(AbsMulAmt - 1)) {
48685 // (mul x, 2^N + 1) => (add (shl x, N), x)
48686 NewMul = DAG.getNode(
48687 ISD::ADD, DL, VT, N->getOperand(0),
48688 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
48689 DAG.getConstant(Log2_64(AbsMulAmt - 1), DL,
48690 MVT::i8)));
48691 // To negate, subtract the number from zero
48692 if (SignMulAmt < 0)
48693 NewMul = DAG.getNode(ISD::SUB, DL, VT,
48694 DAG.getConstant(0, DL, VT), NewMul);
48695 } else if (isPowerOf2_64(AbsMulAmt + 1)) {
48696 // (mul x, 2^N - 1) => (sub (shl x, N), x)
48697 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
48698 DAG.getConstant(Log2_64(AbsMulAmt + 1),
48699 DL, MVT::i8));
48700 // To negate, reverse the operands of the subtract.
48701 if (SignMulAmt < 0)
48702 NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul);
48703 else
48704 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
48705 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2)) {
48706 // (mul x, 2^N + 2) => (add (shl x, N), (add x, x))
48707 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
48708 DAG.getConstant(Log2_64(AbsMulAmt - 2),
48709 DL, MVT::i8));
48710 NewMul = DAG.getNode(
48711 ISD::ADD, DL, VT, NewMul,
48712 DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
48713 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt + 2)) {
48714 // (mul x, 2^N - 2) => (sub (shl x, N), (add x, x))
48715 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
48716 DAG.getConstant(Log2_64(AbsMulAmt + 2),
48717 DL, MVT::i8));
48718 NewMul = DAG.getNode(
48719 ISD::SUB, DL, VT, NewMul,
48720 DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
48721 }
48722 }
48723
48724 return NewMul;
48725}
48726
48727// Try to form a MULHU or MULHS node by looking for
48728// (srl (mul ext, ext), 16)
48729// TODO: This is X86 specific because we want to be able to handle wide types
48730// before type legalization. But we can only do it if the vector will be
48731// legalized via widening/splitting. Type legalization can't handle promotion
48732// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
48733// combiner.
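// For example, (srl (mul (zext v8i16 X), (zext v8i16 Y)), 16) with a v8i32
// result becomes (zext (mulhu X, Y)), which can then select to PMULHUW.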
48734static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG,
48735 const X86Subtarget &Subtarget) {
48736 assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
48737        "SRL or SRA node is required here!");
48738 SDLoc DL(N);
48739
48740 if (!Subtarget.hasSSE2())
48741 return SDValue();
48742
48743 // The operation feeding into the shift must be a multiply.
48744 SDValue ShiftOperand = N->getOperand(0);
48745 if (ShiftOperand.getOpcode() != ISD::MUL || !ShiftOperand.hasOneUse())
48746 return SDValue();
48747
48748 // Input type should be at least vXi32.
48749 EVT VT = N->getValueType(0);
48750 if (!VT.isVector() || VT.getVectorElementType().getSizeInBits() < 32)
48751 return SDValue();
48752
48753 // Need a shift by 16.
48754 APInt ShiftAmt;
48755 if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), ShiftAmt) ||
48756 ShiftAmt != 16)
48757 return SDValue();
48758
48759 SDValue LHS = ShiftOperand.getOperand(0);
48760 SDValue RHS = ShiftOperand.getOperand(1);
48761
48762 unsigned ExtOpc = LHS.getOpcode();
48763 if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
48764 RHS.getOpcode() != ExtOpc)
48765 return SDValue();
48766
48767 // Peek through the extends.
48768 LHS = LHS.getOperand(0);
48769 RHS = RHS.getOperand(0);
48770
48771 // Ensure the input types match.
48772 EVT MulVT = LHS.getValueType();
48773 if (MulVT.getVectorElementType() != MVT::i16 || RHS.getValueType() != MulVT)
48774 return SDValue();
48775
48776 unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
48777 SDValue Mulh = DAG.getNode(Opc, DL, MulVT, LHS, RHS);
48778
48779 ExtOpc = N->getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
48780 return DAG.getNode(ExtOpc, DL, VT, Mulh);
48781}
48782
48783static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
48784 SDValue N0 = N->getOperand(0);
48785 SDValue N1 = N->getOperand(1);
48786 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
48787 EVT VT = N0.getValueType();
48788
48789 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
48790 // since the result of setcc_c is all zeros or all ones.
48791 if (VT.isInteger() && !VT.isVector() &&
48792 N1C && N0.getOpcode() == ISD::AND &&
48793 N0.getOperand(1).getOpcode() == ISD::Constant) {
48794 SDValue N00 = N0.getOperand(0);
48795 APInt Mask = N0.getConstantOperandAPInt(1);
48796 Mask <<= N1C->getAPIntValue();
48797 bool MaskOK = false;
48798 // We can handle cases concerning bit-widening nodes containing setcc_c if
48799 // we carefully interrogate the mask to make sure the transform is
48800 // semantics preserving.
48801 // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
48802 // of the underlying setcc_c operation when the setcc_c was zero extended.
48803 // Consider the following example:
48804 // zext(setcc_c) -> i32 0x0000FFFF
48805 // c1 -> i32 0x0000FFFF
48806 // c2 -> i32 0x00000001
48807 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
48808 // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
48809 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
48810 MaskOK = true;
48811 } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
48812 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
48813 MaskOK = true;
48814 } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
48815 N00.getOpcode() == ISD::ANY_EXTEND) &&
48816 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
48817 MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
48818 }
48819 if (MaskOK && Mask != 0) {
48820 SDLoc DL(N);
48821 return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
48822 }
48823 }
48824
48825 return SDValue();
48826}
48827
48828static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG,
48829 const X86Subtarget &Subtarget) {
48830 SDValue N0 = N->getOperand(0);
48831 SDValue N1 = N->getOperand(1);
48832 EVT VT = N0.getValueType();
48833 unsigned Size = VT.getSizeInBits();
48834
48835 if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
48836 return V;
48837
48838 // fold (ashr (shl, a, [56,48,32,24,16]), SarConst)
48839 // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or
48840 // into (sra, (sext (a), SarConst - [56,48,32,24,16]))
48841 // depending on sign of (SarConst - [56,48,32,24,16])
48842
48843 // sexts on X86 are MOVs. The MOVs have the same code size
48844 // as the SHIFTs above (only a SHIFT by 1 has smaller code size).
48845 // However the MOVs have 2 advantages over a SHIFT:
48846 // 1. MOVs can write to a register that differs from source
48847 // 2. MOVs accept memory operands
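// For example, with i64: (ashr (shl X, 56), 59) becomes
// (sra (sext_inreg X, i8), 3), i.e. typically a MOVSX of the low byte
// followed by a smaller arithmetic shift.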
48848
48849 if (VT.isVector() || N1.getOpcode() != ISD::Constant ||
48850 N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
48851 N0.getOperand(1).getOpcode() != ISD::Constant)
48852 return SDValue();
48853
48854 SDValue N00 = N0.getOperand(0);
48855 SDValue N01 = N0.getOperand(1);
48856 APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
48857 APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
48858 EVT CVT = N1.getValueType();
48859
48860 if (SarConst.isNegative())
48861 return SDValue();
48862
48863 for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) {
48864 unsigned ShiftSize = SVT.getSizeInBits();
48865 // Skip types without a corresponding sext/zext, and skip when ShlConst
48866 // is not one of [56,48,32,24,16].
48867 if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
48868 continue;
48869 SDLoc DL(N);
48870 SDValue NN =
48871 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
48872 SarConst = SarConst - (Size - ShiftSize);
48873 if (SarConst == 0)
48874 return NN;
48875 if (SarConst.isNegative())
48876 return DAG.getNode(ISD::SHL, DL, VT, NN,
48877 DAG.getConstant(-SarConst, DL, CVT));
48878 return DAG.getNode(ISD::SRA, DL, VT, NN,
48879 DAG.getConstant(SarConst, DL, CVT));
48880 }
48881 return SDValue();
48882}
48883
48884static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
48885 TargetLowering::DAGCombinerInfo &DCI,
48886 const X86Subtarget &Subtarget) {
48887 SDValue N0 = N->getOperand(0);
48888 SDValue N1 = N->getOperand(1);
48889 EVT VT = N0.getValueType();
48890
48891 if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
48892 return V;
48893
48894 // Only do this on the last DAG combine as it can interfere with other
48895 // combines.
48896 if (!DCI.isAfterLegalizeDAG())
48897 return SDValue();
48898
48899 // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
48900 // TODO: This is a generic DAG combine that became an x86-only combine to
48901 // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
48902 // and-not ('andn').
48903 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
48904 return SDValue();
48905
48906 auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
48907 auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
48908 if (!ShiftC || !AndC)
48909 return SDValue();
48910
48911 // If we can shrink the constant mask below 8-bits or 32-bits, then this
48912 // transform should reduce code size. It may also enable secondary transforms
48913 // from improved known-bits analysis or instruction selection.
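  // For example, srl (and X, 0x1F0), 4 --> and (srl X, 4), 0x1F: the new mask
  // fits in a sign-extended imm8, unlike the original 9-bit mask.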
48914 APInt MaskVal = AndC->getAPIntValue();
48915
48916 // If this can be matched by a zero extend, don't optimize.
48917 if (MaskVal.isMask()) {
48918 unsigned TO = MaskVal.countr_one();
48919 if (TO >= 8 && isPowerOf2_32(TO))
48920 return SDValue();
48921 }
48922
48923 APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
48924 unsigned OldMaskSize = MaskVal.getSignificantBits();
48925 unsigned NewMaskSize = NewMaskVal.getSignificantBits();
48926 if ((OldMaskSize > 8 && NewMaskSize <= 8) ||
48927 (OldMaskSize > 32 && NewMaskSize <= 32)) {
48928 // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
48929 SDLoc DL(N);
48930 SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
48931 SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
48932 return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
48933 }
48934 return SDValue();
48935}
48936
48937static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG,
48938 const X86Subtarget &Subtarget) {
48939 unsigned Opcode = N->getOpcode();
48940 assert(isHorizOp(Opcode) && "Unexpected hadd/hsub/pack opcode");
48941
48942 SDLoc DL(N);
48943 EVT VT = N->getValueType(0);
48944 SDValue N0 = N->getOperand(0);
48945 SDValue N1 = N->getOperand(1);
48946 EVT SrcVT = N0.getValueType();
48947
48948 SDValue BC0 =
48949 N->isOnlyUserOf(N0.getNode()) ? peekThroughOneUseBitcasts(N0) : N0;
48950 SDValue BC1 =
48951 N->isOnlyUserOf(N1.getNode()) ? peekThroughOneUseBitcasts(N1) : N1;
48952
48953 // Attempt to fold HOP(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X)))
48954 // to SHUFFLE(HOP(LOSUBVECTOR(X),HISUBVECTOR(X))), this is mainly for
48955 // truncation trees that help us avoid lane crossing shuffles.
48956 // TODO: There's a lot more we can do for PACK/HADD style shuffle combines.
48957 // TODO: We don't handle vXf64 shuffles yet.
48958 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
48959 if (SDValue BCSrc = getSplitVectorSrc(BC0, BC1, false)) {
48960 SmallVector<SDValue> ShuffleOps;
48961 SmallVector<int> ShuffleMask, ScaledMask;
48962 SDValue Vec = peekThroughBitcasts(BCSrc);
48963 if (getTargetShuffleInputs(Vec, ShuffleOps, ShuffleMask, DAG)) {
48964 resolveTargetShuffleInputsAndMask(ShuffleOps, ShuffleMask);
48965 // To keep the HOP LHS/RHS coherency, we must be able to scale the unary
48966 // shuffle to a v4X64 width - we can probably relax this in the future.
48967 if (!isAnyZero(ShuffleMask) && ShuffleOps.size() == 1 &&
48968 ShuffleOps[0].getValueType().is256BitVector() &&
48969 scaleShuffleElements(ShuffleMask, 4, ScaledMask)) {
48970 SDValue Lo, Hi;
48971 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
48972 std::tie(Lo, Hi) = DAG.SplitVector(ShuffleOps[0], DL);
48973 Lo = DAG.getBitcast(SrcVT, Lo);
48974 Hi = DAG.getBitcast(SrcVT, Hi);
48975 SDValue Res = DAG.getNode(Opcode, DL, VT, Lo, Hi);
48976 Res = DAG.getBitcast(ShufVT, Res);
48977 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ScaledMask);
48978 return DAG.getBitcast(VT, Res);
48979 }
48980 }
48981 }
48982 }
48983
48984 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(Z,W)) -> SHUFFLE(HOP()).
48985 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
48986 // If either/both ops are a shuffle that can scale to v2x64,
48987 // then see if we can perform this as a v4x32 post shuffle.
48988 SmallVector<SDValue> Ops0, Ops1;
48989 SmallVector<int> Mask0, Mask1, ScaledMask0, ScaledMask1;
48990 bool IsShuf0 =
48991 getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
48992 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
48993 all_of(Ops0, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
48994 bool IsShuf1 =
48995 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
48996 scaleShuffleElements(Mask1, 2, ScaledMask1) &&
48997 all_of(Ops1, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
48998 if (IsShuf0 || IsShuf1) {
48999 if (!IsShuf0) {
49000 Ops0.assign({BC0});
49001 ScaledMask0.assign({0, 1});
49002 }
49003 if (!IsShuf1) {
49004 Ops1.assign({BC1});
49005 ScaledMask1.assign({0, 1});
49006 }
49007
49008 SDValue LHS, RHS;
49009 int PostShuffle[4] = {-1, -1, -1, -1};
49010 auto FindShuffleOpAndIdx = [&](int M, int &Idx, ArrayRef<SDValue> Ops) {
49011 if (M < 0)
49012 return true;
49013 Idx = M % 2;
49014 SDValue Src = Ops[M / 2];
49015 if (!LHS || LHS == Src) {
49016 LHS = Src;
49017 return true;
49018 }
49019 if (!RHS || RHS == Src) {
49020 Idx += 2;
49021 RHS = Src;
49022 return true;
49023 }
49024 return false;
49025 };
49026 if (FindShuffleOpAndIdx(ScaledMask0[0], PostShuffle[0], Ops0) &&
49027 FindShuffleOpAndIdx(ScaledMask0[1], PostShuffle[1], Ops0) &&
49028 FindShuffleOpAndIdx(ScaledMask1[0], PostShuffle[2], Ops1) &&
49029 FindShuffleOpAndIdx(ScaledMask1[1], PostShuffle[3], Ops1)) {
49030 LHS = DAG.getBitcast(SrcVT, LHS);
49031 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
49032 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
49033 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
49034 Res = DAG.getBitcast(ShufVT, Res);
49035 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, PostShuffle);
49036 return DAG.getBitcast(VT, Res);
49037 }
49038 }
49039 }
49040
49041 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y)).
49042 if (VT.is256BitVector() && Subtarget.hasInt256()) {
49043 SmallVector<int> Mask0, Mask1;
49044 SmallVector<SDValue> Ops0, Ops1;
49045 SmallVector<int, 2> ScaledMask0, ScaledMask1;
49046 if (getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
49047 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
49048 !Ops0.empty() && !Ops1.empty() &&
49049 all_of(Ops0,
49050 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
49051 all_of(Ops1,
49052 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
49053 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
49054 scaleShuffleElements(Mask1, 2, ScaledMask1)) {
49055 SDValue Op00 = peekThroughBitcasts(Ops0.front());
49056 SDValue Op10 = peekThroughBitcasts(Ops1.front());
49057 SDValue Op01 = peekThroughBitcasts(Ops0.back());
49058 SDValue Op11 = peekThroughBitcasts(Ops1.back());
49059 if ((Op00 == Op11) && (Op01 == Op10)) {
49060 std::swap(Op10, Op11);
49061 ShuffleVectorSDNode::commuteMask(ScaledMask1);
49062 }
49063 if ((Op00 == Op10) && (Op01 == Op11)) {
49064 const int Map[4] = {0, 2, 1, 3};
49065 SmallVector<int, 4> ShuffleMask(
49066 {Map[ScaledMask0[0]], Map[ScaledMask1[0]], Map[ScaledMask0[1]],
49067 Map[ScaledMask1[1]]});
49068 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
49069 SDValue Res = DAG.getNode(Opcode, DL, VT, DAG.getBitcast(SrcVT, Op00),
49070 DAG.getBitcast(SrcVT, Op01));
49071 Res = DAG.getBitcast(ShufVT, Res);
49072 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ShuffleMask);
49073 return DAG.getBitcast(VT, Res);
49074 }
49075 }
49076 }
49077
49078 return SDValue();
49079}
49080
49081static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
49082 TargetLowering::DAGCombinerInfo &DCI,
49083 const X86Subtarget &Subtarget) {
49084 unsigned Opcode = N->getOpcode();
49085 assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
49086        "Unexpected pack opcode");
49087
49088 EVT VT = N->getValueType(0);
49089 SDValue N0 = N->getOperand(0);
49090 SDValue N1 = N->getOperand(1);
49091 unsigned NumDstElts = VT.getVectorNumElements();
49092 unsigned DstBitsPerElt = VT.getScalarSizeInBits();
49093 unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
49094 assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
49095        N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
49096        "Unexpected PACKSS/PACKUS input type");
49097
49098 bool IsSigned = (X86ISD::PACKSS == Opcode);
49099
49100 // Constant Folding.
49101 APInt UndefElts0, UndefElts1;
49102 SmallVector<APInt, 32> EltBits0, EltBits1;
49103 if ((N0.isUndef() || N->isOnlyUserOf(N0.getNode())) &&
49104 (N1.isUndef() || N->isOnlyUserOf(N1.getNode())) &&
49105 getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) &&
49106 getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) {
49107 unsigned NumLanes = VT.getSizeInBits() / 128;
49108 unsigned NumSrcElts = NumDstElts / 2;
49109 unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
49110 unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
49111
49112 APInt Undefs(NumDstElts, 0);
49113 SmallVector<APInt, 32> Bits(NumDstElts, APInt::getZero(DstBitsPerElt));
49114 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
49115 for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
49116 unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
49117 auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
49118 auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);
49119
49120 if (UndefElts[SrcIdx]) {
49121 Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
49122 continue;
49123 }
49124
49125 APInt &Val = EltBits[SrcIdx];
49126 if (IsSigned) {
49127 // PACKSS: Truncate signed value with signed saturation.
49128 // Source values less than dst minint are saturated to minint.
49129 // Source values greater than dst maxint are saturated to maxint.
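          // e.g. packing i16 -> i8: 0x0042 stays 0x42, 0x1234 saturates to
          // 0x7F, and 0x8000 (-32768) saturates to 0x80 (-128).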
49130 if (Val.isSignedIntN(DstBitsPerElt))
49131 Val = Val.trunc(DstBitsPerElt);
49132 else if (Val.isNegative())
49133 Val = APInt::getSignedMinValue(DstBitsPerElt);
49134 else
49135 Val = APInt::getSignedMaxValue(DstBitsPerElt);
49136 } else {
49137 // PACKUS: Truncate signed value with unsigned saturation.
49138 // Source values less than zero are saturated to zero.
49139 // Source values greater than dst maxuint are saturated to maxuint.
49140 if (Val.isIntN(DstBitsPerElt))
49141 Val = Val.trunc(DstBitsPerElt);
49142 else if (Val.isNegative())
49143 Val = APInt::getZero(DstBitsPerElt);
49144 else
49145 Val = APInt::getAllOnes(DstBitsPerElt);
49146 }
49147 Bits[Lane * NumDstEltsPerLane + Elt] = Val;
49148 }
49149 }
49150
49151 return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
49152 }
49153
49154 // Try to fold PACK(SHUFFLE(),SHUFFLE()) -> SHUFFLE(PACK()).
49155 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
49156 return V;
49157
49158 // Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular
49159 // truncate to create a larger truncate.
49160 if (Subtarget.hasAVX512() &&
49161 N0.getOpcode() == ISD::TRUNCATE && N1.isUndef() && VT == MVT::v16i8 &&
49162 N0.getOperand(0).getValueType() == MVT::v8i32) {
49163 if ((IsSigned && DAG.ComputeNumSignBits(N0) > 8) ||
49164 (!IsSigned &&
49165 DAG.MaskedValueIsZero(N0, APInt::getHighBitsSet(16, 8)))) {
49166 if (Subtarget.hasVLX())
49167 return DAG.getNode(X86ISD::VTRUNC, SDLoc(N), VT, N0.getOperand(0));
49168
49169 // Widen input to v16i32 so we can truncate that.
49170 SDLoc dl(N);
49171 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i32,
49172 N0.getOperand(0), DAG.getUNDEF(MVT::v8i32));
49173 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Concat);
49174 }
49175 }
49176
49177 // Try to fold PACK(EXTEND(X),EXTEND(Y)) -> CONCAT(X,Y) subvectors.
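  // For example, PACKUS(zext(v8i8 X to v8i16), zext(v8i8 Y to v8i16)) with a
  // v16i8 result is just CONCAT(X, Y): the zero-extended values are already
  // in range, so the unsigned saturation is a no-op.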
49178 if (VT.is128BitVector()) {
49179 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
49180 SDValue Src0, Src1;
49181 if (N0.getOpcode() == ExtOpc &&
49182 N0.getOperand(0).getValueType().is64BitVector() &&
49183 N0.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
49184 Src0 = N0.getOperand(0);
49185 }
49186 if (N1.getOpcode() == ExtOpc &&
49187 N1.getOperand(0).getValueType().is64BitVector() &&
49188 N1.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
49189 Src1 = N1.getOperand(0);
49190 }
49191 if ((Src0 || N0.isUndef()) && (Src1 || N1.isUndef())) {
49192 assert((Src0 || Src1) && "Found PACK(UNDEF,UNDEF)");
49193 Src0 = Src0 ? Src0 : DAG.getUNDEF(Src1.getValueType());
49194 Src1 = Src1 ? Src1 : DAG.getUNDEF(Src0.getValueType());
49195 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Src0, Src1);
49196 }
49197
49198 // Try again with pack(*_extend_vector_inreg, undef).
49199 unsigned VecInRegOpc = IsSigned ? ISD::SIGN_EXTEND_VECTOR_INREG
49200 : ISD::ZERO_EXTEND_VECTOR_INREG;
49201 if (N0.getOpcode() == VecInRegOpc && N1.isUndef() &&
49202 N0.getOperand(0).getScalarValueSizeInBits() < DstBitsPerElt)
49203 return getEXTEND_VECTOR_INREG(ExtOpc, SDLoc(N), VT, N0.getOperand(0),
49204 DAG);
49205 }
49206
49207 // Attempt to combine as shuffle.
49208 SDValue Op(N, 0);
49209 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
49210 return Res;
49211
49212 return SDValue();
49213}
49214
49215static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG,
49216 TargetLowering::DAGCombinerInfo &DCI,
49217 const X86Subtarget &Subtarget) {
49218 assert((X86ISD::HADD == N->getOpcode() || X86ISD::FHADD == N->getOpcode() ||
49219         X86ISD::HSUB == N->getOpcode() || X86ISD::FHSUB == N->getOpcode()) &&
49220        "Unexpected horizontal add/sub opcode");
49221
49222 if (!shouldUseHorizontalOp(true, DAG, Subtarget)) {
49223 MVT VT = N->getSimpleValueType(0);
49224 SDValue LHS = N->getOperand(0);
49225 SDValue RHS = N->getOperand(1);
49226
49227 // HOP(HOP'(X,X),HOP'(Y,Y)) -> HOP(PERMUTE(HOP'(X,Y)),PERMUTE(HOP'(X,Y))).
49228 if (LHS != RHS && LHS.getOpcode() == N->getOpcode() &&
49229 LHS.getOpcode() == RHS.getOpcode() &&
49230 LHS.getValueType() == RHS.getValueType() &&
49231 N->isOnlyUserOf(LHS.getNode()) && N->isOnlyUserOf(RHS.getNode())) {
49232 SDValue LHS0 = LHS.getOperand(0);
49233 SDValue LHS1 = LHS.getOperand(1);
49234 SDValue RHS0 = RHS.getOperand(0);
49235 SDValue RHS1 = RHS.getOperand(1);
49236 if ((LHS0 == LHS1 || LHS0.isUndef() || LHS1.isUndef()) &&
49237 (RHS0 == RHS1 || RHS0.isUndef() || RHS1.isUndef())) {
49238 SDLoc DL(N);
49239 SDValue Res = DAG.getNode(LHS.getOpcode(), DL, LHS.getValueType(),
49240 LHS0.isUndef() ? LHS1 : LHS0,
49241 RHS0.isUndef() ? RHS1 : RHS0);
49242 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
49243 Res = DAG.getBitcast(ShufVT, Res);
49244 SDValue NewLHS =
49245 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
49246 getV4X86ShuffleImm8ForMask({0, 1, 0, 1}, DL, DAG));
49247 SDValue NewRHS =
49248 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
49249 getV4X86ShuffleImm8ForMask({2, 3, 2, 3}, DL, DAG));
49250 return DAG.getNode(N->getOpcode(), DL, VT, DAG.getBitcast(VT, NewLHS),
49251 DAG.getBitcast(VT, NewRHS));
49252 }
49253 }
49254 }
49255
49256 // Try to fold HOP(SHUFFLE(),SHUFFLE()) -> SHUFFLE(HOP()).
49257 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
49258 return V;
49259
49260 return SDValue();
49261}
49262
49263static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG,
49264 TargetLowering::DAGCombinerInfo &DCI,
49265 const X86Subtarget &Subtarget) {
49266 assert((X86ISD::VSHL == N->getOpcode() || X86ISD::VSRA == N->getOpcode() ||
49267         X86ISD::VSRL == N->getOpcode()) &&
49268        "Unexpected shift opcode");
49269 EVT VT = N->getValueType(0);
49270 SDValue N0 = N->getOperand(0);
49271 SDValue N1 = N->getOperand(1);
49272
49273 // Shift zero -> zero.
49274 if (ISD::isBuildVectorAllZeros(N0.getNode()))
49275 return DAG.getConstant(0, SDLoc(N), VT);
49276
49277 // Detect constant shift amounts.
49278 APInt UndefElts;
49279 SmallVector<APInt, 32> EltBits;
49280 if (getTargetConstantBitsFromNode(N1, 64, UndefElts, EltBits, true, false)) {
49281 unsigned X86Opc = getTargetVShiftUniformOpcode(N->getOpcode(), false);
49282 return getTargetVShiftByConstNode(X86Opc, SDLoc(N), VT.getSimpleVT(), N0,
49283 EltBits[0].getZExtValue(), DAG);
49284 }
49285
49286 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49287 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
49288 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
49289 return SDValue(N, 0);
49290
49291 return SDValue();
49292}
49293
49294static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
49295 TargetLowering::DAGCombinerInfo &DCI,
49296 const X86Subtarget &Subtarget) {
49297 unsigned Opcode = N->getOpcode();
49298 assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
49299         X86ISD::VSRLI == Opcode) &&
49300        "Unexpected shift opcode");
49301 bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
49302 EVT VT = N->getValueType(0);
49303 SDValue N0 = N->getOperand(0);
49304 SDValue N1 = N->getOperand(1);
49305 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
49306 assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
49307        "Unexpected value type");
49308 assert(N1.getValueType() == MVT::i8 && "Unexpected shift amount type");
49309
49310 // (shift undef, X) -> 0
49311 if (N0.isUndef())
49312 return DAG.getConstant(0, SDLoc(N), VT);
49313
49314 // Out of range logical bit shifts are guaranteed to be zero.
49315 // Out of range arithmetic bit shifts splat the sign bit.
49316 unsigned ShiftVal = N->getConstantOperandVal(1);
49317 if (ShiftVal >= NumBitsPerElt) {
49318 if (LogicalShift)
49319 return DAG.getConstant(0, SDLoc(N), VT);
49320 ShiftVal = NumBitsPerElt - 1;
49321 }
49322
49323 // (shift X, 0) -> X
49324 if (!ShiftVal)
49325 return N0;
49326
49327 // (shift 0, C) -> 0
49328 if (ISD::isBuildVectorAllZeros(N0.getNode()))
49329 // N0 is all zeros or undef. We guarantee that the bits shifted into the
49330 // result are all zeros, not undef.
49331 return DAG.getConstant(0, SDLoc(N), VT);
49332
49333 // (VSRAI -1, C) -> -1
49334 if (!LogicalShift && ISD::isBuildVectorAllOnes(N0.getNode()))
49335 // N0 is all ones or undef. We guarantee that the bits shifted into the
49336 // result are all ones, not undef.
49337 return DAG.getConstant(-1, SDLoc(N), VT);
49338
49339 auto MergeShifts = [&](SDValue X, uint64_t Amt0, uint64_t Amt1) {
49340 unsigned NewShiftVal = Amt0 + Amt1;
49341 if (NewShiftVal >= NumBitsPerElt) {
49342 // Out of range logical bit shifts are guaranteed to be zero.
49343 // Out of range arithmetic bit shifts splat the sign bit.
49344 if (LogicalShift)
49345 return DAG.getConstant(0, SDLoc(N), VT);
49346 NewShiftVal = NumBitsPerElt - 1;
49347 }
49348 return DAG.getNode(Opcode, SDLoc(N), VT, N0.getOperand(0),
49349 DAG.getTargetConstant(NewShiftVal, SDLoc(N), MVT::i8));
49350 };
49351
49352 // (shift (shift X, C2), C1) -> (shift X, (C1 + C2))
49353 if (Opcode == N0.getOpcode())
49354 return MergeShifts(N0.getOperand(0), ShiftVal, N0.getConstantOperandVal(1));
49355
49356 // (shl (add X, X), C) -> (shl X, (C + 1))
49357 if (Opcode == X86ISD::VSHLI && N0.getOpcode() == ISD::ADD &&
49358 N0.getOperand(0) == N0.getOperand(1))
49359 return MergeShifts(N0.getOperand(0), ShiftVal, 1);
49360
49361 // We can decode 'whole byte' logical bit shifts as shuffles.
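  // e.g. a VSRLI by 8 on v2i64 moves each 64-bit element down by one byte and
  // zeroes the top byte, which the shuffle combiner can treat as a byte
  // shuffle with a zero input.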
49362 if (LogicalShift && (ShiftVal % 8) == 0) {
49363 SDValue Op(N, 0);
49364 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
49365 return Res;
49366 }
49367
49368 auto TryConstantFold = [&](SDValue V) {
49369 APInt UndefElts;
49370 SmallVector<APInt, 32> EltBits;
49371 if (!getTargetConstantBitsFromNode(V, NumBitsPerElt, UndefElts, EltBits))
49372 return SDValue();
49373 assert(EltBits.size() == VT.getVectorNumElements() &&
49374        "Unexpected shift value type");
49375 // Undef elements need to fold to 0. It's possible SimplifyDemandedBits
49376 // created an undef input due to no input bits being demanded, but user
49377 // still expects 0 in other bits.
49378 for (unsigned i = 0, e = EltBits.size(); i != e; ++i) {
49379 APInt &Elt = EltBits[i];
49380 if (UndefElts[i])
49381 Elt = 0;
49382 else if (X86ISD::VSHLI == Opcode)
49383 Elt <<= ShiftVal;
49384 else if (X86ISD::VSRAI == Opcode)
49385 Elt.ashrInPlace(ShiftVal);
49386 else
49387 Elt.lshrInPlace(ShiftVal);
49388 }
49389 // Reset undef elements since they were zeroed above.
49390 UndefElts = 0;
49391 return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
49392 };
49393
49394 // Constant Folding.
49395 if (N->isOnlyUserOf(N0.getNode())) {
49396 if (SDValue C = TryConstantFold(N0))
49397 return C;
49398
49399 // Fold (shift (logic X, C2), C1) -> (logic (shift X, C1), (shift C2, C1))
49400 // Don't break NOT patterns.
49401 SDValue BC = peekThroughOneUseBitcasts(N0);
49402 if (ISD::isBitwiseLogicOp(BC.getOpcode()) &&
49403 BC->isOnlyUserOf(BC.getOperand(1).getNode()) &&
49404 !ISD::isBuildVectorAllOnes(BC.getOperand(1).getNode())) {
49405 if (SDValue RHS = TryConstantFold(BC.getOperand(1))) {
49406 SDLoc DL(N);
49407 SDValue LHS = DAG.getNode(Opcode, DL, VT,
49408 DAG.getBitcast(VT, BC.getOperand(0)), N1);
49409 return DAG.getNode(BC.getOpcode(), DL, VT, LHS, RHS);
49410 }
49411 }
49412 }
49413
49414 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49415 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBitsPerElt),
49416 DCI))
49417 return SDValue(N, 0);
49418
49419 return SDValue();
49420}
49421
49422static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
49423 TargetLowering::DAGCombinerInfo &DCI,
49424 const X86Subtarget &Subtarget) {
49425 EVT VT = N->getValueType(0);
49426 unsigned Opcode = N->getOpcode();
49427 assert(((Opcode == X86ISD::PINSRB && VT == MVT::v16i8) ||
49428         (Opcode == X86ISD::PINSRW && VT == MVT::v8i16) ||
49429         Opcode == ISD::INSERT_VECTOR_ELT) &&
49430        "Unexpected vector insertion");
49431
49432 SDValue Vec = N->getOperand(0);
49433 SDValue Scl = N->getOperand(1);
49434 SDValue Idx = N->getOperand(2);
49435
49436 // Fold insert_vector_elt(undef, elt, 0) --> scalar_to_vector(elt).
49437 if (Opcode == ISD::INSERT_VECTOR_ELT && Vec.isUndef() && isNullConstant(Idx))
49438 return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Scl);
49439
49440 if (Opcode == X86ISD::PINSRB || Opcode == X86ISD::PINSRW) {
49441 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
49442 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49443 if (TLI.SimplifyDemandedBits(SDValue(N, 0),
49444 APInt::getAllOnes(NumBitsPerElt), DCI))
49445 return SDValue(N, 0);
49446 }
49447
49448 // Attempt to combine insertion patterns to a shuffle.
49449 if (VT.isSimple() && DCI.isAfterLegalizeDAG()) {
49450 SDValue Op(N, 0);
49451 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
49452 return Res;
49453 }
49454
49455 return SDValue();
49456}
49457
49458/// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
49459/// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
49460/// OR -> CMPNEQSS.
49461static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
49462 TargetLowering::DAGCombinerInfo &DCI,
49463 const X86Subtarget &Subtarget) {
49464 unsigned opcode;
49465
49466 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
49467 // we're requiring SSE2 for both.
49468 if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
49469 SDValue N0 = N->getOperand(0);
49470 SDValue N1 = N->getOperand(1);
49471 SDValue CMP0 = N0.getOperand(1);
49472 SDValue CMP1 = N1.getOperand(1);
49473 SDLoc DL(N);
49474
49475 // The SETCCs should both refer to the same CMP.
49476 if (CMP0.getOpcode() != X86ISD::FCMP || CMP0 != CMP1)
49477 return SDValue();
49478
49479 SDValue CMP00 = CMP0->getOperand(0);
49480 SDValue CMP01 = CMP0->getOperand(1);
49481 EVT VT = CMP00.getValueType();
49482
49483 if (VT == MVT::f32 || VT == MVT::f64 ||
49484 (VT == MVT::f16 && Subtarget.hasFP16())) {
49485 bool ExpectingFlags = false;
49486 // Check for any users that want flags:
49487 for (const SDNode *U : N->uses()) {
49488 if (ExpectingFlags)
49489 break;
49490
49491 switch (U->getOpcode()) {
49492 default:
49493 case ISD::BR_CC:
49494 case ISD::BRCOND:
49495 case ISD::SELECT:
49496 ExpectingFlags = true;
49497 break;
49498 case ISD::CopyToReg:
49499 case ISD::SIGN_EXTEND:
49500 case ISD::ZERO_EXTEND:
49501 case ISD::ANY_EXTEND:
49502 break;
49503 }
49504 }
49505
49506 if (!ExpectingFlags) {
49507 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
49508 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
49509
49510 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
49511 X86::CondCode tmp = cc0;
49512 cc0 = cc1;
49513 cc1 = tmp;
49514 }
49515
49516 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
49517 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
49518 // FIXME: need symbolic constants for these magic numbers.
49519 // See X86ATTInstPrinter.cpp:printSSECC().
49520 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
49521 if (Subtarget.hasAVX512()) {
49522 SDValue FSetCC =
49523 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
49524 DAG.getTargetConstant(x86cc, DL, MVT::i8));
49525 // Need to fill with zeros to ensure the bitcast will produce zeroes
49526 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
49527 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
49528 DAG.getConstant(0, DL, MVT::v16i1),
49529 FSetCC, DAG.getIntPtrConstant(0, DL));
49530 return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL,
49531 N->getSimpleValueType(0));
49532 }
49533 SDValue OnesOrZeroesF =
49534 DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00,
49535 CMP01, DAG.getTargetConstant(x86cc, DL, MVT::i8));
49536
49537 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
49538 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
49539
49540 if (is64BitFP && !Subtarget.is64Bit()) {
49541 // On a 32-bit target, we cannot bitcast the 64-bit float to a
49542 // 64-bit integer, since that's not a legal type. Since
49543 // OnesOrZeroesF is all ones or all zeroes, we don't need all the
49544 // bits, but can do this little dance to extract the lowest 32 bits
49545 // and work with those going forward.
49546 SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
49547 OnesOrZeroesF);
49548 SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
49549 OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
49550 Vector32, DAG.getIntPtrConstant(0, DL));
49551 IntVT = MVT::i32;
49552 }
49553
49554 SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
49555 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
49556 DAG.getConstant(1, DL, IntVT));
49557 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
49558 ANDed);
49559 return OneBitOfTruth;
49560 }
49561 }
49562 }
49563 }
49564 return SDValue();
49565}
49566
49567/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
49568static SDValue combineAndNotIntoANDNP(SDNode *N, SelectionDAG &DAG) {
49569 assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
49570
49571 MVT VT = N->getSimpleValueType(0);
49572 if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
49573 return SDValue();
49574
49575 SDValue X, Y;
49576 SDValue N0 = N->getOperand(0);
49577 SDValue N1 = N->getOperand(1);
49578
49579 if (SDValue Not = IsNOT(N0, DAG)) {
49580 X = Not;
49581 Y = N1;
49582 } else if (SDValue Not = IsNOT(N1, DAG)) {
49583 X = Not;
49584 Y = N0;
49585 } else
49586 return SDValue();
49587
49588 X = DAG.getBitcast(VT, X);
49589 Y = DAG.getBitcast(VT, Y);
49590 return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, Y);
49591}
49592
49593/// Try to fold:
49594/// and (vector_shuffle<Z,...,Z>
49595/// (insert_vector_elt undef, (xor X, -1), Z), undef), Y
49596/// ->
49597/// andnp (vector_shuffle<Z,...,Z>
49598/// (insert_vector_elt undef, X, Z), undef), Y
49599static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG,
49600 const X86Subtarget &Subtarget) {
49601 assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
49602
49603 EVT VT = N->getValueType(0);
49604 // Do not split 256 and 512 bit vectors with SSE2 as that would overwrite the
49605 // original value and require extra moves.
49606 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
49607 ((VT.is256BitVector() || VT.is512BitVector()) && Subtarget.hasAVX())))
49608 return SDValue();
49609
49610 auto GetNot = [&DAG](SDValue V) {
49611 auto *SVN = dyn_cast<ShuffleVectorSDNode>(peekThroughOneUseBitcasts(V));
49612 // TODO: SVN->hasOneUse() is a strong condition. It can be relaxed if all
49613 // end-users are ISD::AND including cases
49614 // (and(extract_vector_element(SVN), Y)).
49615 if (!SVN || !SVN->hasOneUse() || !SVN->isSplat() ||
49616 !SVN->getOperand(1).isUndef()) {
49617 return SDValue();
49618 }
49619 SDValue IVEN = SVN->getOperand(0);
49620 if (IVEN.getOpcode() != ISD::INSERT_VECTOR_ELT ||
49621 !IVEN.getOperand(0).isUndef() || !IVEN.hasOneUse())
49622 return SDValue();
49623 if (!isa<ConstantSDNode>(IVEN.getOperand(2)) ||
49624 IVEN.getConstantOperandAPInt(2) != SVN->getSplatIndex())
49625 return SDValue();
49626 SDValue Src = IVEN.getOperand(1);
49627 if (SDValue Not = IsNOT(Src, DAG)) {
49628 SDValue NotSrc = DAG.getBitcast(Src.getValueType(), Not);
49629 SDValue NotIVEN =
49630 DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(IVEN), IVEN.getValueType(),
49631 IVEN.getOperand(0), NotSrc, IVEN.getOperand(2));
49632 return DAG.getVectorShuffle(SVN->getValueType(0), SDLoc(SVN), NotIVEN,
49633 SVN->getOperand(1), SVN->getMask());
49634 }
49635 return SDValue();
49636 };
49637
49638 SDValue X, Y;
49639 SDValue N0 = N->getOperand(0);
49640 SDValue N1 = N->getOperand(1);
49641
49642 if (SDValue Not = GetNot(N0)) {
49643 X = Not;
49644 Y = N1;
49645 } else if (SDValue Not = GetNot(N1)) {
49646 X = Not;
49647 Y = N0;
49648 } else
49649 return SDValue();
49650
49651 X = DAG.getBitcast(VT, X);
49652 Y = DAG.getBitcast(VT, Y);
49653 SDLoc DL(N);
49654 // We do not split for SSE at all, but we need to split vectors for AVX1 and
49655 // AVX2.
49656 if (!Subtarget.useAVX512Regs() && VT.is512BitVector()) {
49657 SDValue LoX, HiX;
49658 std::tie(LoX, HiX) = splitVector(X, DAG, DL);
49659 SDValue LoY, HiY;
49660 std::tie(LoY, HiY) = splitVector(Y, DAG, DL);
49661 EVT SplitVT = LoX.getValueType();
49662 SDValue LoV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {LoX, LoY});
49663 SDValue HiV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {HiX, HiY});
49664 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, {LoV, HiV});
49665 }
49666 return DAG.getNode(X86ISD::ANDNP, DL, VT, {X, Y});
49667}
49668
49669// Try to widen AND, OR and XOR nodes to VT in order to remove casts around
49670// logical operations, like in the example below.
49671// or (and (truncate x, truncate y)),
49672// (xor (truncate z, build_vector (constants)))
49673// Given a target type \p VT, we generate
49674// or (and x, y), (xor z, zext(build_vector (constants)))
49675 // given that x, y and z are of type \p VT. We can do so if each operand is
49676 // either a truncate from VT, a vector of constants, or can itself be
49677 // recursively promoted.
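// Editorial sketch (illustrative only, types chosen arbitrarily): with
// VT = v8i32 and v8i16 truncations, a pattern such as
//   or (and (trunc x), (trunc y)), (xor (trunc z), build_vector(constants))
// is rebuilt in the wide type as
//   or (and x, y), (xor z, zext(build_vector(constants)))
// so the logic is performed directly on the wide v8i32 values instead of on
// the truncated ones.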
49678static SDValue PromoteMaskArithmetic(SDNode *N, EVT VT, SelectionDAG &DAG,
49679 unsigned Depth) {
49680 // Limit recursion to avoid excessive compile times.
49681 if (Depth >= SelectionDAG::MaxRecursionDepth)
49682 return SDValue();
49683
49684 if (N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND &&
49685 N->getOpcode() != ISD::OR)
49686 return SDValue();
49687
49688 SDValue N0 = N->getOperand(0);
49689 SDValue N1 = N->getOperand(1);
49690 SDLoc DL(N);
49691
49692 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49693 if (!TLI.isOperationLegalOrPromote(N->getOpcode(), VT))
49694 return SDValue();
49695
49696 if (SDValue NN0 = PromoteMaskArithmetic(N0.getNode(), VT, DAG, Depth + 1))
49697 N0 = NN0;
49698 else {
49699 // The left side has to be a 'trunc'.
49700 if (N0.getOpcode() != ISD::TRUNCATE)
49701 return SDValue();
49702
49703 // The type of the truncated inputs.
49704 if (N0.getOperand(0).getValueType() != VT)
49705 return SDValue();
49706
49707 N0 = N0.getOperand(0);
49708 }
49709
49710 if (SDValue NN1 = PromoteMaskArithmetic(N1.getNode(), VT, DAG, Depth + 1))
49711 N1 = NN1;
49712 else {
49713 // The right side has to be a 'trunc' or a constant vector.
49714 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
49715 N1.getOperand(0).getValueType() == VT;
49716 if (!RHSTrunc && !ISD::isBuildVectorOfConstantSDNodes(N1.getNode()))
49717 return SDValue();
49718
49719 if (RHSTrunc)
49720 N1 = N1.getOperand(0);
49721 else
49722 N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1);
49723 }
49724
49725 return DAG.getNode(N->getOpcode(), DL, VT, N0, N1);
49726}
49727
49728 // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM-sized
49729// register. In most cases we actually compare or select YMM-sized registers
49730// and mixing the two types creates horrible code. This method optimizes
49731// some of the transition sequences.
49732// Even with AVX-512 this is still useful for removing casts around logical
49733// operations on vXi1 mask types.
49734static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG,
49735 const X86Subtarget &Subtarget) {
49736 EVT VT = N->getValueType(0);
49737 assert(VT.isVector() && "Expected vector type");
49738
49739 SDLoc DL(N);
49740 assert((N->getOpcode() == ISD::ANY_EXTEND ||
49741         N->getOpcode() == ISD::ZERO_EXTEND ||
49742         N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
49743
49744 SDValue Narrow = N->getOperand(0);
49745 EVT NarrowVT = Narrow.getValueType();
49746
49747 // Generate the wide operation.
49748 SDValue Op = PromoteMaskArithmetic(Narrow.getNode(), VT, DAG, 0);
49749 if (!Op)
49750 return SDValue();
49751 switch (N->getOpcode()) {
49752 default: llvm_unreachable("Unexpected opcode");
49753 case ISD::ANY_EXTEND:
49754 return Op;
49755 case ISD::ZERO_EXTEND:
49756 return DAG.getZeroExtendInReg(Op, DL, NarrowVT);
49757 case ISD::SIGN_EXTEND:
49758 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
49759 Op, DAG.getValueType(NarrowVT));
49760 }
49761}
49762
49763static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode) {
49764 unsigned FPOpcode;
49765 switch (Opcode) {
49766 default: llvm_unreachable("Unexpected input node for FP logic conversion");
49767 case ISD::AND: FPOpcode = X86ISD::FAND; break;
49768 case ISD::OR: FPOpcode = X86ISD::FOR; break;
49769 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
49770 }
49771 return FPOpcode;
49772}
49773
49774/// If both input operands of a logic op are being cast from floating-point
49775/// types or FP compares, try to convert this into a floating-point logic node
49776/// to avoid unnecessary moves from SSE to integer registers.
49777static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
49778 TargetLowering::DAGCombinerInfo &DCI,
49779 const X86Subtarget &Subtarget) {
49780 EVT VT = N->getValueType(0);
49781 SDValue N0 = N->getOperand(0);
49782 SDValue N1 = N->getOperand(1);
49783 SDLoc DL(N);
49784
49785 if (!((N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) ||
49786 (N0.getOpcode() == ISD::SETCC && N1.getOpcode() == ISD::SETCC)))
49787 return SDValue();
49788
49789 SDValue N00 = N0.getOperand(0);
49790 SDValue N10 = N1.getOperand(0);
49791 EVT N00Type = N00.getValueType();
49792 EVT N10Type = N10.getValueType();
49793
49794 // Ensure that both types are the same and are legal scalar fp types.
49795 if (N00Type != N10Type || !((Subtarget.hasSSE1() && N00Type == MVT::f32) ||
49796 (Subtarget.hasSSE2() && N00Type == MVT::f64) ||
49797 (Subtarget.hasFP16() && N00Type == MVT::f16)))
49798 return SDValue();
49799
49800 if (N0.getOpcode() == ISD::BITCAST && !DCI.isBeforeLegalizeOps()) {
49801 unsigned FPOpcode = convertIntLogicToFPLogicOpcode(N->getOpcode());
49802 SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
49803 return DAG.getBitcast(VT, FPLogic);
49804 }
49805
49806 if (VT != MVT::i1 || N0.getOpcode() != ISD::SETCC || !N0.hasOneUse() ||
49807 !N1.hasOneUse())
49808 return SDValue();
49809
49810 ISD::CondCode CC0 = cast<CondCodeSDNode>(N0.getOperand(2))->get();
49811 ISD::CondCode CC1 = cast<CondCodeSDNode>(N1.getOperand(2))->get();
49812
49813 // The vector ISA for FP predicates is incomplete before AVX, so converting
49814 // COMIS* to CMPS* may not be a win before AVX.
49815 if (!Subtarget.hasAVX() &&
49816 !(cheapX86FSETCC_SSE(CC0) && cheapX86FSETCC_SSE(CC1)))
49817 return SDValue();
49818
49819 // Convert scalar FP compares and logic to vector compares (COMIS* to CMPS*)
49820 // and vector logic:
49821 // logic (setcc N00, N01), (setcc N10, N11) -->
49822 // extelt (logic (setcc (s2v N00), (s2v N01)), (setcc (s2v N10), (s2v N11))), 0
49823 unsigned NumElts = 128 / N00Type.getSizeInBits();
49824 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), N00Type, NumElts);
49825 EVT BoolVecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
49826 SDValue ZeroIndex = DAG.getVectorIdxConstant(0, DL);
49827 SDValue N01 = N0.getOperand(1);
49828 SDValue N11 = N1.getOperand(1);
49829 SDValue Vec00 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N00);
49830 SDValue Vec01 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N01);
49831 SDValue Vec10 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N10);
49832 SDValue Vec11 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N11);
49833 SDValue Setcc0 = DAG.getSetCC(DL, BoolVecVT, Vec00, Vec01, CC0);
49834 SDValue Setcc1 = DAG.getSetCC(DL, BoolVecVT, Vec10, Vec11, CC1);
49835 SDValue Logic = DAG.getNode(N->getOpcode(), DL, BoolVecVT, Setcc0, Setcc1);
49836 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Logic, ZeroIndex);
49837}
49838
49839// Attempt to fold BITOP(MOVMSK(X),MOVMSK(Y)) -> MOVMSK(BITOP(X,Y))
49840// to reduce XMM->GPR traffic.
49841static SDValue combineBitOpWithMOVMSK(SDNode *N, SelectionDAG &DAG) {
49842 unsigned Opc = N->getOpcode();
49843 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
49844        "Unexpected bit opcode");
49845
49846 SDValue N0 = N->getOperand(0);
49847 SDValue N1 = N->getOperand(1);
49848
49849 // Both operands must be single use MOVMSK.
49850 if (N0.getOpcode() != X86ISD::MOVMSK || !N0.hasOneUse() ||
49851 N1.getOpcode() != X86ISD::MOVMSK || !N1.hasOneUse())
49852 return SDValue();
49853
49854 SDValue Vec0 = N0.getOperand(0);
49855 SDValue Vec1 = N1.getOperand(0);
49856 EVT VecVT0 = Vec0.getValueType();
49857 EVT VecVT1 = Vec1.getValueType();
49858
49859 // Both MOVMSK operands must be from vectors of the same size and same element
49860 // size, but it's OK for them to differ in fp/int type.
49861 if (VecVT0.getSizeInBits() != VecVT1.getSizeInBits() ||
49862 VecVT0.getScalarSizeInBits() != VecVT1.getScalarSizeInBits())
49863 return SDValue();
49864
49865 SDLoc DL(N);
49866 unsigned VecOpc =
49867 VecVT0.isFloatingPoint() ? convertIntLogicToFPLogicOpcode(Opc) : Opc;
49868 SDValue Result =
49869 DAG.getNode(VecOpc, DL, VecVT0, Vec0, DAG.getBitcast(VecVT0, Vec1));
49870 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
49871}
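// Editorial illustration (not from the original source): at the intrinsic
// level this is the rewrite
//   _mm_movemask_ps(a) & _mm_movemask_ps(b)
//     -> _mm_movemask_ps(_mm_and_ps(a, b))
// which needs a single MOVMSKPS (one XMM->GPR transfer) instead of two.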
49872
49873// Attempt to fold BITOP(SHIFT(X,Z),SHIFT(Y,Z)) -> SHIFT(BITOP(X,Y),Z).
49874// NOTE: This is a very limited case of what SimplifyUsingDistributiveLaws
49875// handles in InstCombine.
49876static SDValue combineBitOpWithShift(SDNode *N, SelectionDAG &DAG) {
49877 unsigned Opc = N->getOpcode();
49878 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
49879        "Unexpected bit opcode");
49880
49881 SDValue N0 = N->getOperand(0);
49882 SDValue N1 = N->getOperand(1);
49883 EVT VT = N->getValueType(0);
49884
49885 // Both operands must be single use.
49886 if (!N0.hasOneUse() || !N1.hasOneUse())
49887 return SDValue();
49888
49889 // Search for matching shifts.
49890 SDValue BC0 = peekThroughOneUseBitcasts(N0);
49891 SDValue BC1 = peekThroughOneUseBitcasts(N1);
49892
49893 unsigned BCOpc = BC0.getOpcode();
49894 EVT BCVT = BC0.getValueType();
49895 if (BCOpc != BC1->getOpcode() || BCVT != BC1.getValueType())
49896 return SDValue();
49897
49898 switch (BCOpc) {
49899 case X86ISD::VSHLI:
49900 case X86ISD::VSRLI:
49901 case X86ISD::VSRAI: {
49902 if (BC0.getOperand(1) != BC1.getOperand(1))
49903 return SDValue();
49904
49905 SDLoc DL(N);
49906 SDValue BitOp =
49907 DAG.getNode(Opc, DL, BCVT, BC0.getOperand(0), BC1.getOperand(0));
49908 SDValue Shift = DAG.getNode(BCOpc, DL, BCVT, BitOp, BC0.getOperand(1));
49909 return DAG.getBitcast(VT, Shift);
49910 }
49911 }
49912
49913 return SDValue();
49914}
49915
49916 /// If this is a zero/all-bits result that is bitwise-anded with a low-bits
49917 /// mask (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
49918 /// with a shift-right to eliminate loading the vector constant mask value.
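/// Editorial sketch (illustrative only): if every element of %c is known to be
/// all-ones or zero (e.g. a vXi32 comparison result), then
///   and %c, <i32 1, i32 1, i32 1, i32 1>    ; requires loading a constant mask
/// can instead be emitted as
///   vsrli %c, 31                            ; logical shift-right, no constant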
49919static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
49920 const X86Subtarget &Subtarget) {
49921 SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
49922 SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
49923 EVT VT = Op0.getValueType();
49924 if (VT != Op1.getValueType() || !VT.isSimple() || !VT.isInteger())
49925 return SDValue();
49926
49927 // Try to convert an "is positive" signbit masking operation into arithmetic
49928 // shift and "andn". This saves a materialization of a -1 vector constant.
49929 // The "is negative" variant should be handled more generally because it only
49930 // requires "and" rather than "andn":
49931 // and (pcmpgt X, -1), Y --> pandn (vsrai X, BitWidth - 1), Y
49932 //
49933 // This is limited to the original type to avoid producing even more bitcasts.
49934 // If the bitcasts can't be eliminated, then it is unlikely that this fold
49935 // will be profitable.
49936 if (N->getValueType(0) == VT &&
49937 supportedVectorShiftWithImm(VT, Subtarget, ISD::SRA)) {
49938 SDValue X, Y;
49939 if (Op1.getOpcode() == X86ISD::PCMPGT &&
49940 isAllOnesOrAllOnesSplat(Op1.getOperand(1)) && Op1.hasOneUse()) {
49941 X = Op1.getOperand(0);
49942 Y = Op0;
49943 } else if (Op0.getOpcode() == X86ISD::PCMPGT &&
49944 isAllOnesOrAllOnesSplat(Op0.getOperand(1)) && Op0.hasOneUse()) {
49945 X = Op0.getOperand(0);
49946 Y = Op1;
49947 }
49948 if (X && Y) {
49949 SDLoc DL(N);
49950 SDValue Sra =
49951 getTargetVShiftByConstNode(X86ISD::VSRAI, DL, VT.getSimpleVT(), X,
49952 VT.getScalarSizeInBits() - 1, DAG);
49953 return DAG.getNode(X86ISD::ANDNP, DL, VT, Sra, Y);
49954 }
49955 }
49956
49957 APInt SplatVal;
49958 if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) ||
49959 !SplatVal.isMask())
49960 return SDValue();
49961
49962 // Don't prevent creation of ANDN.
49963 if (isBitwiseNot(Op0))
49964 return SDValue();
49965
49966 if (!supportedVectorShiftWithImm(VT, Subtarget, ISD::SRL))
49967 return SDValue();
49968
49969 unsigned EltBitWidth = VT.getScalarSizeInBits();
49970 if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
49971 return SDValue();
49972
49973 SDLoc DL(N);
49974 unsigned ShiftVal = SplatVal.countr_one();
49975 SDValue ShAmt = DAG.getTargetConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
49976 SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT, Op0, ShAmt);
49977 return DAG.getBitcast(N->getValueType(0), Shift);
49978}
49979
49980// Get the index node from the lowered DAG of a GEP IR instruction with one
49981// indexing dimension.
49982static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
49983 if (Ld->isIndexed())
49984 return SDValue();
49985
49986 SDValue Base = Ld->getBasePtr();
49987
49988 if (Base.getOpcode() != ISD::ADD)
49989 return SDValue();
49990
49991 SDValue ShiftedIndex = Base.getOperand(0);
49992
49993 if (ShiftedIndex.getOpcode() != ISD::SHL)
49994 return SDValue();
49995
49996 return ShiftedIndex.getOperand(0);
49997
49998}
49999
50000static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
50001 if (Subtarget.hasBMI2() && VT.isScalarInteger()) {
50002 switch (VT.getSizeInBits()) {
50003 default: return false;
50004 case 64: return Subtarget.is64Bit();
50005 case 32: return true;
50006 }
50007 }
50008 return false;
50009}
50010
50011 // This function recognizes cases where the X86 BZHI instruction can replace an
50012 // 'and-load' sequence.
50013 // When an integer value is loaded from an array of constants defined
50014 // as follows:
50015 //
50016 // int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
50017 //
50018 // and the result is then bitwise-ANDed with another input, this is
50019 // equivalent to performing BZHI (zero high bits) on that input, with the
50020 // same index as the load.
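// Editorial sketch in plain C (function and table names are hypothetical,
// not from the original source); with -mbmi2 both forms reduce to one BZHI:
//
//   static const unsigned MaskTable[32] = {0x0, 0x1, 0x3, 0x7 /* , ... */};
//   unsigned zeroHighBits(unsigned Val, unsigned Idx) {
//     return Val & MaskTable[Idx];          // the 'and-load' form matched here
//     // equivalent: _bzhi_u32(Val, Idx)    // <immintrin.h>, single BZHI
//   }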
50021static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
50022 const X86Subtarget &Subtarget) {
50023 MVT VT = Node->getSimpleValueType(0);
50024 SDLoc dl(Node);
50025
50026 // Check if subtarget has BZHI instruction for the node's type
50027 if (!hasBZHI(Subtarget, VT))
50028 return SDValue();
50029
50030 // Try matching the pattern for both operands.
50031 for (unsigned i = 0; i < 2; i++) {
50032 SDValue N = Node->getOperand(i);
50033 LoadSDNode *Ld = dyn_cast<LoadSDNode>(N.getNode());
50034
50035 // Bail out if the operand is not a load instruction.
50036 if (!Ld)
50037 return SDValue();
50038
50039 const Value *MemOp = Ld->getMemOperand()->getValue();
50040
50041 if (!MemOp)
50042 return SDValue();
50043
50044 if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
50045 if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
50046 if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
50047
50048 Constant *Init = GV->getInitializer();
50049 Type *Ty = Init->getType();
50050 if (!isa<ConstantDataArray>(Init) ||
50051 !Ty->getArrayElementType()->isIntegerTy() ||
50052 Ty->getArrayElementType()->getScalarSizeInBits() !=
50053 VT.getSizeInBits() ||
50054 Ty->getArrayNumElements() >
50055 Ty->getArrayElementType()->getScalarSizeInBits())
50056 continue;
50057
50058 // Check if the array's constant elements are suitable for our case.
50059 uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
50060 bool ConstantsMatch = true;
50061 for (uint64_t j = 0; j < ArrayElementCount; j++) {
50062 auto *Elem = cast<ConstantInt>(Init->getAggregateElement(j));
50063 if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
50064 ConstantsMatch = false;
50065 break;
50066 }
50067 }
50068 if (!ConstantsMatch)
50069 continue;
50070
50071 // Do the transformation (for a 32-bit type):
50072 //    (and (load arr[idx]), inp)
50073 // -> (and inp, (srl 0xFFFFFFFF, (sub 32, idx)))
50074 // which will then be selected as a single BZHI instruction.
50075 SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0);
50076 SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, MVT::i32);
50077
50078 // Get the Node which indexes into the array.
50079 SDValue Index = getIndexFromUnindexedLoad(Ld);
50080 if (!Index)
50081 return SDValue();
50082 Index = DAG.getZExtOrTrunc(Index, dl, MVT::i32);
50083
50084 SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, SizeC, Index);
50085 Sub = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Sub);
50086
50087 SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
50088 SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);
50089
50090 return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
50091 }
50092 }
50093 }
50094 }
50095 return SDValue();
50096}
50097
50098 // Look for (and (bitcast (vXi1 (concat_vectors (vYi1 setcc), undef))), C)
50099 // where C is a mask containing the same number of bits as the setcc and
50100 // where the setcc will freely zero the upper bits of the k-register. We can
50101 // replace the undef in the concat with 0s and remove the AND. This mainly
50102 // helps with v2i1/v4i1 setcc being cast to scalar.
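// Editorial sketch (illustrative only): with AVX-512VL, comparing two v4i32
// values yields a v4i1 mask. If that mask is widened as
//   (and (bitcast (concat_vectors (v4i1 setcc), undef) to i8), 0xF)
// then only the setcc bits survive the AND, so the undef half of the concat
// can be replaced with zeroes and the AND removed entirely.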
50103static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG,
50104 const X86Subtarget &Subtarget) {
50105 assert(N->getOpcode() == ISD::AND && "Unexpected opcode!");
50106
50107 EVT VT = N->getValueType(0);
50108
50109 // Make sure this is an AND with a constant. We will check the value of the
50110 // constant later.
50111 auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(1));
50112 if (!C1)
50113 return SDValue();
50114
50115 // This is implied by the ConstantSDNode.
50116 assert(!VT.isVector() && "Expected scalar VT!");
50117
50118 SDValue Src = N->getOperand(0);
50119 if (!Src.hasOneUse())
50120 return SDValue();
50121
50122 // (Optionally) peek through any_extend().
50123 if (Src.getOpcode() == ISD::ANY_EXTEND) {
50124 if (!Src.getOperand(0).hasOneUse())
50125 return SDValue();
50126 Src = Src.getOperand(0);
50127 }
50128
50129 if (Src.getOpcode() != ISD::BITCAST || !Src.getOperand(0).hasOneUse())
50130 return SDValue();
50131
50132 Src = Src.getOperand(0);
50133 EVT SrcVT = Src.getValueType();
50134
50135 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50136 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::i1 ||
50137 !TLI.isTypeLegal(SrcVT))
50138 return SDValue();
50139
50140 if (Src.getOpcode() != ISD::CONCAT_VECTORS)
50141 return SDValue();
50142
50143 // We only care about the first subvector of the concat; we expect the
50144 // other subvectors to be ignored due to the AND if we make the change.
50145 SDValue SubVec = Src.getOperand(0);
50146 EVT SubVecVT = SubVec.getValueType();
50147
50148 // The RHS of the AND should be a mask with as many bits as SubVec.
50149 if (!TLI.isTypeLegal(SubVecVT) ||
50150 !C1->getAPIntValue().isMask(SubVecVT.getVectorNumElements()))
50151 return SDValue();
50152
50153 // The first subvector should be a setcc with a legal result type or an
50154 // AND containing at least one setcc with a legal result type.
50155 auto IsLegalSetCC = [&](SDValue V) {
50156 if (V.getOpcode() != ISD::SETCC)
50157 return false;
50158 EVT SetccVT = V.getOperand(0).getValueType();
50159 if (!TLI.isTypeLegal(SetccVT) ||
50160 !(Subtarget.hasVLX() || SetccVT.is512BitVector()))
50161 return false;
50162 if (!(Subtarget.hasBWI() || SetccVT.getScalarSizeInBits() >= 32))
50163 return false;
50164 return true;
50165 };
50166 if (!(IsLegalSetCC(SubVec) || (SubVec.getOpcode() == ISD::AND &&
50167 (IsLegalSetCC(SubVec.getOperand(0)) ||
50168 IsLegalSetCC(SubVec.getOperand(1))))))
50169 return SDValue();
50170
50171 // We passed all the checks. Rebuild the concat_vectors with zeroes
50172 // and cast it back to VT.
50173 SDLoc dl(N);
50174 SmallVector<SDValue, 4> Ops(Src.getNumOperands(),
50175 DAG.getConstant(0, dl, SubVecVT));
50176 Ops[0] = SubVec;
50177 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT,
50178 Ops);
50179 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), SrcVT.getSizeInBits());
50180 return DAG.getZExtOrTrunc(DAG.getBitcast(IntVT, Concat), dl, VT);
50181}
50182
50183static SDValue getBMIMatchingOp(unsigned Opc, SelectionDAG &DAG,
50184 SDValue OpMustEq, SDValue Op, unsigned Depth) {
50185 // We don't want to go crazy with the recursion here. This isn't a super
50186 // important optimization.
50187 static constexpr unsigned kMaxDepth = 2;
50188
50189 // Only do this re-ordering if op has one use.
50190 if (!Op.hasOneUse())
50191 return SDValue();
50192
50193 SDLoc DL(Op);
50194 // If we hit another associative op, recurse further.
50195 if (Op.getOpcode() == Opc) {
50196 // Done recursing.
50197 if (Depth++ >= kMaxDepth)
50198 return SDValue();
50199
50200 for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx)
50201 if (SDValue R =
50202 getBMIMatchingOp(Opc, DAG, OpMustEq, Op.getOperand(OpIdx), Depth))
50203 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(), R,
50204 Op.getOperand(1 - OpIdx));
50205
50206 } else if (Op.getOpcode() == ISD::SUB) {
50207 if (Opc == ISD::AND) {
50208 // BLSI: (and x, (sub 0, x))
50209 if (isNullConstant(Op.getOperand(0)) && Op.getOperand(1) == OpMustEq)
50210 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
50211 }
50212 // Opc must be ISD::AND or ISD::XOR
50213 // BLSR: (and x, (sub x, 1))
50214 // BLSMSK: (xor x, (sub x, 1))
50215 if (isOneConstant(Op.getOperand(1)) && Op.getOperand(0) == OpMustEq)
50216 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
50217
50218 } else if (Op.getOpcode() == ISD::ADD) {
50219 // Opc must be ISD::AND or ISD::XOR
50220 // BLSR: (and x, (add x, -1))
50221 // BLSMSK: (xor x, (add x, -1))
50222 if (isAllOnesConstant(Op.getOperand(1)) && Op.getOperand(0) == OpMustEq)
50223 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
50224 }
50225 return SDValue();
50226}
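// Editorial illustration of the BMI idioms matched above (plain C, names are
// hypothetical): with -mbmi each function lowers to a single instruction.
//
//   unsigned blsr(unsigned x)   { return x & (x - 1); }  // clear lowest set bit
//   unsigned blsmsk(unsigned x) { return x ^ (x - 1); }  // mask through lowest set bit
//   unsigned blsi(unsigned x)   { return x & (0u - x); } // isolate lowest set bit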
50227
50228static SDValue combineBMILogicOp(SDNode *N, SelectionDAG &DAG,
50229 const X86Subtarget &Subtarget) {
50230 EVT VT = N->getValueType(0);
50231 // Make sure this node is a candidate for BMI instructions.
50232 if (!Subtarget.hasBMI() || !VT.isScalarInteger() ||
50233 (VT != MVT::i32 && VT != MVT::i64))
50234 return SDValue();
50235
50236 assert(N->getOpcode() == ISD::AND || N->getOpcode() == ISD::XOR);
50237
50238 // Try to match LHS and RHS.
50239 for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx)
50240 if (SDValue OpMatch =
50241 getBMIMatchingOp(N->getOpcode(), DAG, N->getOperand(OpIdx),
50242 N->getOperand(1 - OpIdx), 0))
50243 return OpMatch;
50244 return SDValue();
50245}
50246
50247static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
50248 TargetLowering::DAGCombinerInfo &DCI,
50249 const X86Subtarget &Subtarget) {
50250 SDValue N0 = N->getOperand(0);
50251 SDValue N1 = N->getOperand(1);
50252 EVT VT = N->getValueType(0);
50253 SDLoc dl(N);
50254 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50255
50256 // If this is SSE1-only, convert to FAND to avoid scalarization.
50257 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
50258 return DAG.getBitcast(MVT::v4i32,
50259 DAG.getNode(X86ISD::FAND, dl, MVT::v4f32,
50260 DAG.getBitcast(MVT::v4f32, N0),
50261 DAG.getBitcast(MVT::v4f32, N1)));
50262 }
50263
50264 // Use a 32-bit and+zext if upper bits known zero.
50265 if (VT == MVT::i64 && Subtarget.is64Bit() && !isa<ConstantSDNode>(N1)) {
50266 APInt HiMask = APInt::getHighBitsSet(64, 32);
50267 if (DAG.MaskedValueIsZero(N1, HiMask) ||
50268 DAG.MaskedValueIsZero(N0, HiMask)) {
50269 SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N0);
50270 SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N1);
50271 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64,
50272 DAG.getNode(ISD::AND, dl, MVT::i32, LHS, RHS));
50273 }
50274 }
50275
50276 // Match all-of bool scalar reductions into a bitcast/movmsk + cmp.
50277 // TODO: Support multiple SrcOps.
50278 if (VT == MVT::i1) {
50279 SmallVector<SDValue, 2> SrcOps;
50280 SmallVector<APInt, 2> SrcPartials;
50281 if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps, &SrcPartials) &&
50282 SrcOps.size() == 1) {
50283 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
50284 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
50285 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
50286 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
50287 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
50288 if (Mask) {
50289 assert(SrcPartials[0].getBitWidth() == NumElts &&
50290        "Unexpected partial reduction mask");
50291 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
50292 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
50293 return DAG.getSetCC(dl, MVT::i1, Mask, PartialBits, ISD::SETEQ);
50294 }
50295 }
50296 }
50297
50298 if (SDValue V = combineScalarAndWithMaskSetcc(N, DAG, Subtarget))
50299 return V;
50300
50301 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
50302 return R;
50303
50304 if (SDValue R = combineBitOpWithShift(N, DAG))
50305 return R;
50306
50307 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
50308 return FPLogic;
50309
50310 if (SDValue R = combineAndShuffleNot(N, DAG, Subtarget))
50311 return R;
50312
50313 if (DCI.isBeforeLegalizeOps())
50314 return SDValue();
50315
50316 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
50317 return R;
50318
50319 if (SDValue R = combineAndNotIntoANDNP(N, DAG))
50320 return R;
50321
50322 if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
50323 return ShiftRight;
50324
50325 if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
50326 return R;
50327
50328 // fold (and (mul x, c1), c2) -> (mul x, (and c1, c2))
50329 // iff c2 is an all-bits/no-bits mask, i.e. a select-with-zero mask.
50330 // TODO: Handle PMULDQ/PMULUDQ/VPMADDWD/VPMADDUBSW?
50331 if (VT.isVector() && getTargetConstantFromNode(N1)) {
50332 unsigned Opc0 = N0.getOpcode();
50333 if ((Opc0 == ISD::MUL || Opc0 == ISD::MULHU || Opc0 == ISD::MULHS) &&
50334 getTargetConstantFromNode(N0.getOperand(1)) &&
50335 DAG.ComputeNumSignBits(N1) == VT.getScalarSizeInBits() &&
50336 N0->hasOneUse() && N0.getOperand(1)->hasOneUse()) {
50337 SDValue MaskMul = DAG.getNode(ISD::AND, dl, VT, N0.getOperand(1), N1);
50338 return DAG.getNode(Opc0, dl, VT, N0.getOperand(0), MaskMul);
50339 }
50340 }
50341
50342 // Fold AND(SRL(X,Y),1) -> SETCC(BT(X,Y), COND_B) iff Y is not a constant.
50343 // This avoids a slow variable shift (moving the shift amount to ECX etc.).
50344 if (isOneConstant(N1) && N0->hasOneUse()) {
50345 SDValue Src = N0;
50346 while ((Src.getOpcode() == ISD::ZERO_EXTEND ||
50347 Src.getOpcode() == ISD::TRUNCATE) &&
50348 Src.getOperand(0)->hasOneUse())
50349 Src = Src.getOperand(0);
50350 bool ContainsNOT = false;
50351 X86::CondCode X86CC = X86::COND_B;
50352 // Peek through AND(NOT(SRL(X,Y)),1).
50353 if (isBitwiseNot(Src)) {
50354 Src = Src.getOperand(0);
50355 X86CC = X86::COND_AE;
50356 ContainsNOT = true;
50357 }
50358 if (Src.getOpcode() == ISD::SRL &&
50359 !isa<ConstantSDNode>(Src.getOperand(1))) {
50360 SDValue BitNo = Src.getOperand(1);
50361 Src = Src.getOperand(0);
50362 // Peek through AND(SRL(NOT(X),Y),1).
50363 if (isBitwiseNot(Src)) {
50364 Src = Src.getOperand(0);
50365 X86CC = X86CC == X86::COND_AE ? X86::COND_B : X86::COND_AE;
50366 ContainsNOT = true;
50367 }
50368 // If we have BMI2 then SHRX should be faster for i32/i64 cases.
50369 if (!(Subtarget.hasBMI2() && !ContainsNOT && VT.getSizeInBits() >= 32))
50370 if (SDValue BT = getBT(Src, BitNo, dl, DAG))
50371 return DAG.getZExtOrTrunc(getSETCC(X86CC, BT, dl, DAG), dl, VT);
50372 }
50373 }
50374
50375 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
50376 // Attempt to recursively combine a bitmask AND with shuffles.
50377 SDValue Op(N, 0);
50378 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
50379 return Res;
50380
50381 // If either operand is a constant mask, then only the elements that aren't
50382 // zero are actually demanded by the other operand.
50383 auto GetDemandedMasks = [&](SDValue Op) {
50384 APInt UndefElts;
50385 SmallVector<APInt> EltBits;
50386 int NumElts = VT.getVectorNumElements();
50387 int EltSizeInBits = VT.getScalarSizeInBits();
50388 APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
50389 APInt DemandedElts = APInt::getAllOnes(NumElts);
50390 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
50391 EltBits)) {
50392 DemandedBits.clearAllBits();
50393 DemandedElts.clearAllBits();
50394 for (int I = 0; I != NumElts; ++I) {
50395 if (UndefElts[I]) {
50396 // We can't assume an undef src element gives an undef dst - the
50397 // other src might be zero.
50398 DemandedBits.setAllBits();
50399 DemandedElts.setBit(I);
50400 } else if (!EltBits[I].isZero()) {
50401 DemandedBits |= EltBits[I];
50402 DemandedElts.setBit(I);
50403 }
50404 }
50405 }
50406 return std::make_pair(DemandedBits, DemandedElts);
50407 };
50408 APInt Bits0, Elts0;
50409 APInt Bits1, Elts1;
50410 std::tie(Bits0, Elts0) = GetDemandedMasks(N1);
50411 std::tie(Bits1, Elts1) = GetDemandedMasks(N0);
50412
50413 if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) ||
50414 TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) ||
50415 TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) ||
50416 TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) {
50417 if (N->getOpcode() != ISD::DELETED_NODE)
50418 DCI.AddToWorklist(N);
50419 return SDValue(N, 0);
50420 }
50421
50422 SDValue NewN0 = TLI.SimplifyMultipleUseDemandedBits(N0, Bits0, Elts0, DAG);
50423 SDValue NewN1 = TLI.SimplifyMultipleUseDemandedBits(N1, Bits1, Elts1, DAG);
50424 if (NewN0 || NewN1)
50425 return DAG.getNode(ISD::AND, dl, VT, NewN0 ? NewN0 : N0,
50426 NewN1 ? NewN1 : N1);
50427 }
50428
50429 // Attempt to combine a scalar bitmask AND with an extracted shuffle.
50430 if ((VT.getScalarSizeInBits() % 8) == 0 &&
50431 N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
50432 isa<ConstantSDNode>(N0.getOperand(1))) {
50433 SDValue BitMask = N1;
50434 SDValue SrcVec = N0.getOperand(0);
50435 EVT SrcVecVT = SrcVec.getValueType();
50436
50437 // Check that the constant bitmask masks whole bytes.
50438 APInt UndefElts;
50439 SmallVector<APInt, 64> EltBits;
50440 if (VT == SrcVecVT.getScalarType() && N0->isOnlyUserOf(SrcVec.getNode()) &&
50441 getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
50442 llvm::all_of(EltBits, [](const APInt &M) {
50443 return M.isZero() || M.isAllOnes();
50444 })) {
50445 unsigned NumElts = SrcVecVT.getVectorNumElements();
50446 unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
50447 unsigned Idx = N0.getConstantOperandVal(1);
50448
50449 // Create a root shuffle mask from the byte mask and the extracted index.
50450 SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
50451 for (unsigned i = 0; i != Scale; ++i) {
50452 if (UndefElts[i])
50453 continue;
50454 int VecIdx = Scale * Idx + i;
50455 ShuffleMask[VecIdx] = EltBits[i].isZero() ? SM_SentinelZero : VecIdx;
50456 }
50457
50458 if (SDValue Shuffle = combineX86ShufflesRecursively(
50459 {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 1,
50460 X86::MaxShuffleCombineDepth,
50461 /*HasVarMask*/ false, /*AllowVarCrossLaneMask*/ true,
50462 /*AllowVarPerLaneMask*/ true, DAG, Subtarget))
50463 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Shuffle,
50464 N0.getOperand(1));
50465 }
50466 }
50467
50468 if (SDValue R = combineBMILogicOp(N, DAG, Subtarget))
50469 return R;
50470
50471 return SDValue();
50472}
50473
50474// Canonicalize OR(AND(X,C),AND(Y,~C)) -> OR(AND(X,C),ANDNP(C,Y))
50475static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG,
50476 const X86Subtarget &Subtarget) {
50477 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
50478
50479 MVT VT = N->getSimpleValueType(0);
50480 unsigned EltSizeInBits = VT.getScalarSizeInBits();
50481 if (!VT.isVector() || (EltSizeInBits % 8) != 0)
50482 return SDValue();
50483
50484 SDValue N0 = peekThroughBitcasts(N->getOperand(0));
50485 SDValue N1 = peekThroughBitcasts(N->getOperand(1));
50486 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND)
50487 return SDValue();
50488
50489 // On XOP we'll lower to PCMOV so accept one use. With AVX512, we can use
50490 // VPTERNLOG. Otherwise only do this if either mask has multiple uses already.
50491 if (!(Subtarget.hasXOP() || useVPTERNLOG(Subtarget, VT) ||
50492 !N0.getOperand(1).hasOneUse() || !N1.getOperand(1).hasOneUse()))
50493 return SDValue();
50494
50495 // Attempt to extract constant byte masks.
50496 APInt UndefElts0, UndefElts1;
50497 SmallVector<APInt, 32> EltBits0, EltBits1;
50498 if (!getTargetConstantBitsFromNode(N0.getOperand(1), 8, UndefElts0, EltBits0,
50499 false, false))
50500 return SDValue();
50501 if (!getTargetConstantBitsFromNode(N1.getOperand(1), 8, UndefElts1, EltBits1,
50502 false, false))
50503 return SDValue();
50504
50505 for (unsigned i = 0, e = EltBits0.size(); i != e; ++i) {
50506 // TODO - add UNDEF elts support.
50507 if (UndefElts0[i] || UndefElts1[i])
50508 return SDValue();
50509 if (EltBits0[i] != ~EltBits1[i])
50510 return SDValue();
50511 }
50512
50513 SDLoc DL(N);
50514
50515 if (useVPTERNLOG(Subtarget, VT)) {
50516 // Emit a VPTERNLOG node directly - 0xCA is the imm code for A?B:C.
50518 // VPTERNLOG is only available for vXi32/vXi64 types.
50518 MVT OpSVT = EltSizeInBits == 32 ? MVT::i32 : MVT::i64;
50519 MVT OpVT =
50520 MVT::getVectorVT(OpSVT, VT.getSizeInBits() / OpSVT.getSizeInBits());
50521 SDValue A = DAG.getBitcast(OpVT, N0.getOperand(1));
50522 SDValue B = DAG.getBitcast(OpVT, N0.getOperand(0));
50523 SDValue C = DAG.getBitcast(OpVT, N1.getOperand(0));
50524 SDValue Imm = DAG.getTargetConstant(0xCA, DL, MVT::i8);
50525 SDValue Res = getAVX512Node(X86ISD::VPTERNLOG, DL, OpVT, {A, B, C, Imm},
50526 DAG, Subtarget);
50527 return DAG.getBitcast(VT, Res);
50528 }
50529
50530 SDValue X = N->getOperand(0);
50531 SDValue Y =
50532 DAG.getNode(X86ISD::ANDNP, DL, VT, DAG.getBitcast(VT, N0.getOperand(1)),
50533 DAG.getBitcast(VT, N1.getOperand(0)));
50534 return DAG.getNode(ISD::OR, DL, VT, X, Y);
50535}
50536
50537// Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern.
50538static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
50539 if (N->getOpcode() != ISD::OR)
50540 return false;
50541
50542 SDValue N0 = N->getOperand(0);
50543 SDValue N1 = N->getOperand(1);
50544
50545 // Canonicalize AND to LHS.
50546 if (N1.getOpcode() == ISD::AND)
50547 std::swap(N0, N1);
50548
50549 // Attempt to match OR(AND(M,Y),ANDNP(M,X)).
50550 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
50551 return false;
50552
50553 Mask = N1.getOperand(0);
50554 X = N1.getOperand(1);
50555
50556 // Check to see if the mask appeared in both the AND and ANDNP.
50557 if (N0.getOperand(0) == Mask)
50558 Y = N0.getOperand(1);
50559 else if (N0.getOperand(1) == Mask)
50560 Y = N0.getOperand(0);
50561 else
50562 return false;
50563
50564 // TODO: Attempt to match against AND(XOR(-1,M),Y) as well; waiting for the
50565 // ANDNP combine allows other combines to happen that prevent matching.
50566 return true;
50567}
50568
50569// Try to fold:
50570// (or (and (m, y), (pandn m, x)))
50571// into:
50572 // (vselect m, y, x)
50573// As a special case, try to fold:
50574// (or (and (m, (sub 0, x)), (pandn m, x)))
50575// into:
50576// (sub (xor X, M), M)
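// Editorial sketch (illustrative only): if every byte of m is 0x00 or 0xFF,
//   (and m, y) | (pandn m, x)       ; picks y where m is set, x elsewhere
// is exactly a byte-wise blend, so on SSE4.1 it can be emitted as a single
//   pblendvb                        ; i.e. vselect m, y, x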
50577static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
50578 const X86Subtarget &Subtarget) {
50579 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
50580
50581 EVT VT = N->getValueType(0);
50582 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
50583 (VT.is256BitVector() && Subtarget.hasInt256())))
50584 return SDValue();
50585
50586 SDValue X, Y, Mask;
50587 if (!matchLogicBlend(N, X, Y, Mask))
50588 return SDValue();
50589
50590 // Validate that X, Y, and Mask are bitcasts, and see through them.
50591 Mask = peekThroughBitcasts(Mask);
50592 X = peekThroughBitcasts(X);
50593 Y = peekThroughBitcasts(Y);
50594
50595 EVT MaskVT = Mask.getValueType();
50596 unsigned EltBits = MaskVT.getScalarSizeInBits();
50597
50598 // TODO: Attempt to handle floating point cases as well?
50599 if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
50600 return SDValue();
50601
50602 SDLoc DL(N);
50603
50604 // Attempt to combine to conditional negate: (sub (xor X, M), M)
50605 if (SDValue Res = combineLogicBlendIntoConditionalNegate(VT, Mask, X, Y, DL,
50606 DAG, Subtarget))
50607 return Res;
50608
50609 // PBLENDVB is only available on SSE 4.1.
50610 if (!Subtarget.hasSSE41())
50611 return SDValue();
50612
50613 // If we have VPTERNLOG we should prefer that since PBLENDVB is multiple uops.
50614 if (Subtarget.hasVLX())
50615 return SDValue();
50616
50617 MVT BlendVT = VT.is256BitVector() ? MVT::v32i8 : MVT::v16i8;
50618
50619 X = DAG.getBitcast(BlendVT, X);
50620 Y = DAG.getBitcast(BlendVT, Y);
50621 Mask = DAG.getBitcast(BlendVT, Mask);
50622 Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
50623 return DAG.getBitcast(VT, Mask);
50624}
50625
50626// Helper function for combineOrCmpEqZeroToCtlzSrl
50627// Transforms:
50628// seteq(cmp x, 0)
50629// into:
50630// srl(ctlz x), log2(bitsize(x))
50631 // The input pattern is checked by the caller.
50632static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG) {
50633 SDValue Cmp = Op.getOperand(1);
50634 EVT VT = Cmp.getOperand(0).getValueType();
50635 unsigned Log2b = Log2_32(VT.getSizeInBits());
50636 SDLoc dl(Op);
50637 SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
50638 // The result of the shift is 0 or 1, and on X86 the 32-bit
50639 // encodings of shr and lzcnt are more desirable.
50640 SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
50641 SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
50642 DAG.getConstant(Log2b, dl, MVT::i8));
50643 return Scc;
50644}
50645
50646// Try to transform:
50647// zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
50648// into:
50649 // srl(or(ctlz(x), ctlz(y)), log2(bitsize(x)))
50650 // Will also attempt to match more generic cases, e.g.:
50651// zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
50652// Only applies if the target supports the FastLZCNT feature.
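// Editorial sketch in plain C (illustrative only; requires LZCNT so that
// lzcnt(0) == 32 for a 32-bit input):
//   int anyZero(unsigned x, unsigned y) { return (x == 0) | (y == 0); }
// can be lowered as
//   (lzcnt(x) | lzcnt(y)) >> 5
// because lzcnt produces 32 (bit 5 set) exactly when its input is zero.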
50653static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
50654 TargetLowering::DAGCombinerInfo &DCI,
50655 const X86Subtarget &Subtarget) {
50656 if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
50657 return SDValue();
50658
50659 auto isORCandidate = [](SDValue N) {
50660 return (N->getOpcode() == ISD::OR && N->hasOneUse());
50661 };
50662
50663 // Check that the zero extend is extending to 32 bits or more. The code
50664 // generated by srl(ctlz) for 16-bit or smaller variants of the pattern would
50665 // require extra instructions to clear the upper bits.
50666 if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
50667 !isORCandidate(N->getOperand(0)))
50668 return SDValue();
50669
50670 // Check the node matches: setcc(eq, cmp 0)
50671 auto isSetCCCandidate = [](SDValue N) {
50672 return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
50673 X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
50674 N->getOperand(1).getOpcode() == X86ISD::CMP &&
50675 isNullConstant(N->getOperand(1).getOperand(1)) &&
50676 N->getOperand(1).getValueType().bitsGE(MVT::i32);
50677 };
50678
50679 SDNode *OR = N->getOperand(0).getNode();
50680 SDValue LHS = OR->getOperand(0);
50681 SDValue RHS = OR->getOperand(1);
50682
50683 // Save nodes matching or(or, setcc(eq, cmp 0)).
50684 SmallVector<SDNode *, 2> ORNodes;
50685 while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
50686 (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
50687 ORNodes.push_back(OR);
50688 OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
50689 LHS = OR->getOperand(0);
50690 RHS = OR->getOperand(1);
50691 }
50692
50693 // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
50694 if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
50695 !isORCandidate(SDValue(OR, 0)))
50696 return SDValue();
50697
50698 // We have an or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern; try to lower it
50699 // to
50700 // or(srl(ctlz),srl(ctlz)).
50701 // The dag combiner can then fold it into:
50702 // srl(or(ctlz, ctlz)).
50703 SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, DAG);
50704 SDValue Ret, NewRHS;
50705 if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG)))
50706 Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, NewLHS, NewRHS);
50707
50708 if (!Ret)
50709 return SDValue();
50710
50711 // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
50712 while (!ORNodes.empty()) {
50713 OR = ORNodes.pop_back_val();
50714 LHS = OR->getOperand(0);
50715 RHS = OR->getOperand(1);
50716 // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
50717 if (RHS->getOpcode() == ISD::OR)
50718 std::swap(LHS, RHS);
50719 NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG);
50720 if (!NewRHS)
50721 return SDValue();
50722 Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, Ret, NewRHS);
50723 }
50724
50725 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
50726}
50727
50728static SDValue foldMaskedMergeImpl(SDValue And0_L, SDValue And0_R,
50729 SDValue And1_L, SDValue And1_R,
50730 const SDLoc &DL, SelectionDAG &DAG) {
50731 if (!isBitwiseNot(And0_L, true) || !And0_L->hasOneUse())
50732 return SDValue();
50733 SDValue NotOp = And0_L->getOperand(0);
50734 if (NotOp == And1_R)
50735 std::swap(And1_R, And1_L);
50736 if (NotOp != And1_L)
50737 return SDValue();
50738
50739 // (~(NotOp) & And0_R) | (NotOp & And1_R)
50740 // --> ((And0_R ^ And1_R) & NotOp) ^ And1_R
50741 EVT VT = And1_L->getValueType(0);
50742 SDValue Freeze_And0_R = DAG.getNode(ISD::FREEZE, SDLoc(), VT, And0_R);
50743 SDValue Xor0 = DAG.getNode(ISD::XOR, DL, VT, And1_R, Freeze_And0_R);
50744 SDValue And = DAG.getNode(ISD::AND, DL, VT, Xor0, NotOp);
50745 SDValue Xor1 = DAG.getNode(ISD::XOR, DL, VT, And, Freeze_And0_R);
50746 return Xor1;
50747}
50748
50749 /// Fold "masked merge" expressions like `(m & x) | (~m & y)` into the
50750 /// equivalent `((x ^ y) & m) ^ y` pattern.
50751/// This is typically a better representation for targets without a fused
50752/// "and-not" operation. This function is intended to be called from a
50753/// `TargetLowering::PerformDAGCombine` callback on `ISD::OR` nodes.
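/// Editorial sketch in plain C (illustrative only):
///   unsigned merge(unsigned m, unsigned x, unsigned y) {
///     return (m & x) | (~m & y);     // masked merge, needs an and-not
///   }
/// is rewritten here as
///     return ((x ^ y) & m) ^ y;      // same result, no and-not required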
50754static SDValue foldMaskedMerge(SDNode *Node, SelectionDAG &DAG) {
50755 // Note that masked-merge variants using XOR or ADD expressions are
50756 // normalized to OR by InstCombine so we only check for OR.
50757 assert(Node->getOpcode() == ISD::OR && "Must be called with ISD::OR node");
50758 SDValue N0 = Node->getOperand(0);
50759 if (N0->getOpcode() != ISD::AND || !N0->hasOneUse())
50760 return SDValue();
50761 SDValue N1 = Node->getOperand(1);
50762 if (N1->getOpcode() != ISD::AND || !N1->hasOneUse())
50763 return SDValue();
50764
50765 SDLoc DL(Node);
50766 SDValue N00 = N0->getOperand(0);
50767 SDValue N01 = N0->getOperand(1);
50768 SDValue N10 = N1->getOperand(0);
50769 SDValue N11 = N1->getOperand(1);
50770 if (SDValue Result = foldMaskedMergeImpl(N00, N01, N10, N11, DL, DAG))
50771 return Result;
50772 if (SDValue Result = foldMaskedMergeImpl(N01, N00, N10, N11, DL, DAG))
50773 return Result;
50774 if (SDValue Result = foldMaskedMergeImpl(N10, N11, N00, N01, DL, DAG))
50775 return Result;
50776 if (SDValue Result = foldMaskedMergeImpl(N11, N10, N00, N01, DL, DAG))
50777 return Result;
50778 return SDValue();
50779}
50780
50781/// If this is an add or subtract where one operand is produced by a cmp+setcc,
50782/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
50783/// with CMP+{ADC, SBB}.
50784 /// Also try the (ADD/SUB)+(AND(SRL,1)) bit-extraction pattern with BT+{ADC, SBB}.
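/// Editorial sketch (illustrative only): in C,
///   unsigned long long f(unsigned long long x, unsigned a, unsigned b) {
///     return x + (a < b);            // carry-in add
///   }
/// can be compiled to "cmp a, b; adc x, 0" instead of setb + movzx + add.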
50785static SDValue combineAddOrSubToADCOrSBB(bool IsSub, const SDLoc &DL, EVT VT,
50786 SDValue X, SDValue Y,
50787 SelectionDAG &DAG,
50788 bool ZeroSecondOpOnly = false) {
50789 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
50790 return SDValue();
50791
50792 // Look through a one-use zext.
50793 if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse())
50794 Y = Y.getOperand(0);
50795
50796 X86::CondCode CC;
50797 SDValue EFLAGS;
50798 if (Y.getOpcode() == X86ISD::SETCC && Y.hasOneUse()) {
50799 CC = (X86::CondCode)Y.getConstantOperandVal(0);
50800 EFLAGS = Y.getOperand(1);
50801 } else if (Y.getOpcode() == ISD::AND && isOneConstant(Y.getOperand(1)) &&
50802 Y.hasOneUse()) {
50803 EFLAGS = LowerAndToBT(Y, ISD::SETNE, DL, DAG, CC);
50804 }
50805
50806 if (!EFLAGS)
50807 return SDValue();
50808
50809 // If X is -1 or 0, then we have an opportunity to avoid constants required in
50810 // the general case below.
50811 auto *ConstantX = dyn_cast<ConstantSDNode>(X);
50812 if (ConstantX && !ZeroSecondOpOnly) {
50813 if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnes()) ||
50814 (IsSub && CC == X86::COND_B && ConstantX->isZero())) {
50815 // This is a complicated way to get -1 or 0 from the carry flag:
50816 // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
50817 // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
50818 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
50819 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
50820 EFLAGS);
50821 }
50822
50823 if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnes()) ||
50824 (IsSub && CC == X86::COND_A && ConstantX->isZero())) {
50825 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
50826 EFLAGS.getValueType().isInteger() &&
50827 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
50828 // Swap the operands of a SUB, and we have the same pattern as above.
50829 // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
50830 // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
50831 SDValue NewSub = DAG.getNode(
50832 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
50833 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
50834 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
50835 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
50836 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
50837 NewEFLAGS);
50838 }
50839 }
50840 }
50841
50842 if (CC == X86::COND_B) {
50843 // X + SETB Z --> adc X, 0
50844 // X - SETB Z --> sbb X, 0
50845 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
50846 DAG.getVTList(VT, MVT::i32), X,
50847 DAG.getConstant(0, DL, VT), EFLAGS);
50848 }
50849
50850 if (ZeroSecondOpOnly)
50851 return SDValue();
50852
50853 if (CC == X86::COND_A) {
50854 // Try to convert COND_A into COND_B in an attempt to facilitate
50855 // materializing "setb reg".
50856 //
50857 // Do not flip "e > c", where "c" is a constant, because the Cmp instruction
50858 // cannot take an immediate as its first operand.
50859 //
50860 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
50861 EFLAGS.getValueType().isInteger() &&
50862 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
50863 SDValue NewSub =
50864 DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
50865 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
50866 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
50867 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
50868 DAG.getVTList(VT, MVT::i32), X,
50869 DAG.getConstant(0, DL, VT), NewEFLAGS);
50870 }
50871 }
50872
50873 if (CC == X86::COND_AE) {
50874 // X + SETAE --> sbb X, -1
50875 // X - SETAE --> adc X, -1
50876 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
50877 DAG.getVTList(VT, MVT::i32), X,
50878 DAG.getConstant(-1, DL, VT), EFLAGS);
50879 }
50880
50881 if (CC == X86::COND_BE) {
50882 // X + SETBE --> sbb X, -1
50883 // X - SETBE --> adc X, -1
50884 // Try to convert COND_BE into COND_AE in an attempt to facilitate
50885 // materializing "setae reg".
50886 //
50887 // Do not flip "e <= c", where "c" is a constant, because the Cmp instruction
50888 // cannot take an immediate as its first operand.
50889 //
50890 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
50891 EFLAGS.getValueType().isInteger() &&
50892 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
50893 SDValue NewSub =
50894 DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
50895 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
50896 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
50897 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
50898 DAG.getVTList(VT, MVT::i32), X,
50899 DAG.getConstant(-1, DL, VT), NewEFLAGS);
50900 }
50901 }
50902
50903 if (CC != X86::COND_E && CC != X86::COND_NE)
50904 return SDValue();
50905
50906 if (EFLAGS.getOpcode() != X86ISD::CMP || !EFLAGS.hasOneUse() ||
50907 !X86::isZeroNode(EFLAGS.getOperand(1)) ||
50908 !EFLAGS.getOperand(0).getValueType().isInteger())
50909 return SDValue();
50910
50911 SDValue Z = EFLAGS.getOperand(0);
50912 EVT ZVT = Z.getValueType();
50913
50914 // If X is -1 or 0, then we have an opportunity to avoid constants required in
50915 // the general case below.
50916 if (ConstantX) {
50917 // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
50918 // fake operands:
50919 // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
50920 // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
50921 if ((IsSub && CC == X86::COND_NE && ConstantX->isZero()) ||
50922 (!IsSub && CC == X86::COND_E && ConstantX->isAllOnes())) {
50923 SDValue Zero = DAG.getConstant(0, DL, ZVT);
50924 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
50925 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
50926 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
50927 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
50928 SDValue(Neg.getNode(), 1));
50929 }
50930
50931 // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
50932 // with fake operands:
50933 // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
50934 // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
50935 if ((IsSub && CC == X86::COND_E && ConstantX->isZero()) ||
50936 (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnes())) {
50937 SDValue One = DAG.getConstant(1, DL, ZVT);
50938 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
50939 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
50940 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
50941 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
50942 Cmp1.getValue(1));
50943 }
50944 }
50945
50946 // (cmp Z, 1) sets the carry flag if Z is 0.
50947 SDValue One = DAG.getConstant(1, DL, ZVT);
50948 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
50949 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
50950
50951 // Add the flags type for ADC/SBB nodes.
50952 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
50953
50954 // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
50955 // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
50956 if (CC == X86::COND_NE)
50957 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
50958 DAG.getConstant(-1ULL, DL, VT), Cmp1.getValue(1));
50959
50960 // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
50961 // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
50962 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
50963 DAG.getConstant(0, DL, VT), Cmp1.getValue(1));
50964}
50965
50966/// If this is an add or subtract where one operand is produced by a cmp+setcc,
50967/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
50968/// with CMP+{ADC, SBB}.
50969static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
50970 bool IsSub = N->getOpcode() == ISD::SUB;
50971 SDValue X = N->getOperand(0);
50972 SDValue Y = N->getOperand(1);
50973 EVT VT = N->getValueType(0);
50974 SDLoc DL(N);
50975
50976 if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, X, Y, DAG))
50977 return ADCOrSBB;
50978
50979 // Commute and try again (negate the result for subtracts).
50980 if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, Y, X, DAG)) {
50981 if (IsSub)
50982 ADCOrSBB =
50983 DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), ADCOrSBB);
50984 return ADCOrSBB;
50985 }
50986
50987 return SDValue();
50988}
50989
50990static SDValue combineOrXorWithSETCC(SDNode *N, SDValue N0, SDValue N1,
50991 SelectionDAG &DAG) {
50992 assert((N->getOpcode() == ISD::XOR || N->getOpcode() == ISD::OR) &&
50993 "Unexpected opcode");
50994
50995 // Delegate to combineAddOrSubToADCOrSBB if we have:
50996 //
50997 // (xor/or (zero_extend (setcc)) imm)
50998 //
50999 // where imm is odd if and only if we have xor, in which case the XOR/OR are
51000 // equivalent to a SUB/ADD, respectively.
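// For example, since zext(setcc) is 0 or 1: with odd imm C = 5 and setcc = 1,
// C xor 1 == 4 == C - 1 (the SUB case); with even imm C = 6 and setcc = 1,
// C or 1 == 7 == C + 1 (the ADD case). With setcc = 0 both sides are just C.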
51001 if (N0.getOpcode() == ISD::ZERO_EXTEND &&
51002 N0.getOperand(0).getOpcode() == X86ISD::SETCC && N0.hasOneUse()) {
51003 if (auto *N1C = dyn_cast<ConstantSDNode>(N1)) {
51004 bool IsSub = N->getOpcode() == ISD::XOR;
51005 bool N1COdd = N1C->getZExtValue() & 1;
51006 if (IsSub ? N1COdd : !N1COdd) {
51007 SDLoc DL(N);
51008 EVT VT = N->getValueType(0);
51009 if (SDValue R = combineAddOrSubToADCOrSBB(IsSub, DL, VT, N1, N0, DAG))
51010 return R;
51011 }
51012 }
51013 }
51014
51015 return SDValue();
51016}
51017
51018static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
51019 TargetLowering::DAGCombinerInfo &DCI,
51020 const X86Subtarget &Subtarget) {
51021 SDValue N0 = N->getOperand(0);
51022 SDValue N1 = N->getOperand(1);
51023 EVT VT = N->getValueType(0);
51024 SDLoc dl(N);
51025 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51026
51027 // If this is SSE1 only, convert to FOR to avoid scalarization.
51028 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
51029 return DAG.getBitcast(MVT::v4i32,
51030 DAG.getNode(X86ISD::FOR, dl, MVT::v4f32,
51031 DAG.getBitcast(MVT::v4f32, N0),
51032 DAG.getBitcast(MVT::v4f32, N1)));
51033 }
51034
51035 // Match any-of bool scalar reductions into a bitcast/movmsk + cmp.
51036 // TODO: Support multiple SrcOps.
51037 if (VT == MVT::i1) {
51038 SmallVector<SDValue, 2> SrcOps;
51039 SmallVector<APInt, 2> SrcPartials;
51040 if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps, &SrcPartials) &&
51041 SrcOps.size() == 1) {
51042 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
51043 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
51044 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
51045 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
51046 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
51047 if (Mask) {
51048 assert(SrcPartials[0].getBitWidth() == NumElts &&
51049 "Unexpected partial reduction mask");
51050 SDValue ZeroBits = DAG.getConstant(0, dl, MaskVT);
51051 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
51052 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
51053 return DAG.getSetCC(dl, MVT::i1, Mask, ZeroBits, ISD::SETNE);
51054 }
51055 }
51056 }
51057
51058 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
51059 return R;
51060
51061 if (SDValue R = combineBitOpWithShift(N, DAG))
51062 return R;
51063
51064 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
51065 return FPLogic;
51066
51067 if (DCI.isBeforeLegalizeOps())
51068 return SDValue();
51069
51070 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
51071 return R;
51072
51073 if (SDValue R = canonicalizeBitSelect(N, DAG, Subtarget))
51074 return R;
51075
51076 if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
51077 return R;
51078
51079 // (0 - SetCC) | C -> (zext (not SetCC)) * (C + 1) - 1 if we can get a LEA out of it.
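// For example, if SetCC == 1 the left side is (0 - 1) | C == -1 and the right
// side is (zext 0) * (C + 1) - 1 == -1; if SetCC == 0 both sides are C. With
// Val + 1 in {2,3,4,5,8,9} the multiply and the -1 can fold into an LEA.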
51080 if ((VT == MVT::i32 || VT == MVT::i64) &&
51081 N0.getOpcode() == ISD::SUB && N0.hasOneUse() &&
51082 isNullConstant(N0.getOperand(0))) {
51083 SDValue Cond = N0.getOperand(1);
51084 if (Cond.getOpcode() == ISD::ZERO_EXTEND && Cond.hasOneUse())
51085 Cond = Cond.getOperand(0);
51086
51087 if (Cond.getOpcode() == X86ISD::SETCC && Cond.hasOneUse()) {
51088 if (auto *CN = dyn_cast<ConstantSDNode>(N1)) {
51089 uint64_t Val = CN->getZExtValue();
51090 if (Val == 1 || Val == 2 || Val == 3 || Val == 4 || Val == 7 || Val == 8) {
51091 X86::CondCode CCode = (X86::CondCode)Cond.getConstantOperandVal(0);
51092 CCode = X86::GetOppositeBranchCondition(CCode);
51093 SDValue NotCond = getSETCC(CCode, Cond.getOperand(1), SDLoc(Cond), DAG);
51094
51095 SDValue R = DAG.getZExtOrTrunc(NotCond, dl, VT);
51096 R = DAG.getNode(ISD::MUL, dl, VT, R, DAG.getConstant(Val + 1, dl, VT));
51097 R = DAG.getNode(ISD::SUB, dl, VT, R, DAG.getConstant(1, dl, VT));
51098 return R;
51099 }
51100 }
51101 }
51102 }
51103
51104 // Combine OR(X,KSHIFTL(Y,Elts/2)) -> CONCAT_VECTORS(X,Y) == KUNPCK(X,Y).
51105 // Combine OR(KSHIFTL(X,Elts/2),Y) -> CONCAT_VECTORS(Y,X) == KUNPCK(Y,X).
51106 // iff the upper elements of the non-shifted arg are zero.
51107 // KUNPCK requires 16+ bool vector elements.
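// For example, for v16i1: OR(X, KSHIFTL(Y, 8)) with the upper 8 lanes of X
// known zero is the same as CONCAT_VECTORS of the low v8i1 halves of X and Y,
// which maps onto KUNPCKBW.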
51108 if (N0.getOpcode() == X86ISD::KSHIFTL || N1.getOpcode() == X86ISD::KSHIFTL) {
51109 unsigned NumElts = VT.getVectorNumElements();
51110 unsigned HalfElts = NumElts / 2;
51111 APInt UpperElts = APInt::getHighBitsSet(NumElts, HalfElts);
51112 if (NumElts >= 16 && N1.getOpcode() == X86ISD::KSHIFTL &&
51113 N1.getConstantOperandAPInt(1) == HalfElts &&
51114 DAG.MaskedVectorIsZero(N0, UpperElts)) {
51115 return DAG.getNode(
51116 ISD::CONCAT_VECTORS, dl, VT,
51117 extractSubVector(N0, 0, DAG, dl, HalfElts),
51118 extractSubVector(N1.getOperand(0), 0, DAG, dl, HalfElts));
51119 }
51120 if (NumElts >= 16 && N0.getOpcode() == X86ISD::KSHIFTL &&
51121 N0.getConstantOperandAPInt(1) == HalfElts &&
51122 DAG.MaskedVectorIsZero(N1, UpperElts)) {
51123 return DAG.getNode(
51124 ISD::CONCAT_VECTORS, dl, VT,
51125 extractSubVector(N1, 0, DAG, dl, HalfElts),
51126 extractSubVector(N0.getOperand(0), 0, DAG, dl, HalfElts));
51127 }
51128 }
51129
51130 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
51131 // Attempt to recursively combine an OR of shuffles.
51132 SDValue Op(N, 0);
51133 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
51134 return Res;
51135
51136 // If either operand is a constant mask, then only the elements that aren't
51137 // allones are actually demanded by the other operand.
51138 auto SimplifyUndemandedElts = [&](SDValue Op, SDValue OtherOp) {
51139 APInt UndefElts;
51140 SmallVector<APInt> EltBits;
51141 int NumElts = VT.getVectorNumElements();
51142 int EltSizeInBits = VT.getScalarSizeInBits();
51143 if (!getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts, EltBits))
51144 return false;
51145
51146 APInt DemandedElts = APInt::getZero(NumElts);
51147 for (int I = 0; I != NumElts; ++I)
51148 if (!EltBits[I].isAllOnes())
51149 DemandedElts.setBit(I);
51150
51151 return TLI.SimplifyDemandedVectorElts(OtherOp, DemandedElts, DCI);
51152 };
51153 if (SimplifyUndemandedElts(N0, N1) || SimplifyUndemandedElts(N1, N0)) {
51154 if (N->getOpcode() != ISD::DELETED_NODE)
51155 DCI.AddToWorklist(N);
51156 return SDValue(N, 0);
51157 }
51158 }
51159
51160 // We should fold "masked merge" patterns when `andn` is not available.
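// A "masked merge" is (X & M) | (Y & ~M); without BMI's ANDN the ~M costs an
// extra instruction, so a form such as ((X ^ Y) & M) ^ Y (presumably what
// foldMaskedMerge emits here) needs one fewer operation. E.g. X = 0b1100,
// Y = 0b0011, M = 0b1010 gives 0b1001 either way.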
51161 if (!Subtarget.hasBMI() && VT.isScalarInteger() && VT != MVT::i1)
51162 if (SDValue R = foldMaskedMerge(N, DAG))
51163 return R;
51164
51165 if (SDValue R = combineOrXorWithSETCC(N, N0, N1, DAG))
51166 return R;
51167
51168 return SDValue();
51169}
51170
51171/// Try to turn tests against the signbit in the form of:
51172/// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
51173/// into:
51174/// SETGT(X, -1)
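// For example, for an i32 X, SRL(X, 31) isolates the sign bit, so the
// truncated value is 1 exactly when X < 0; xoring with 1 inverts that, giving
// 1 exactly when X >= 0, which is the same predicate as SETGT(X, -1).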
51175static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
51176 // This is only worth doing if the output type is i8 or i1.
51177 EVT ResultType = N->getValueType(0);
51178 if (ResultType != MVT::i8 && ResultType != MVT::i1)
51179 return SDValue();
51180
51181 SDValue N0 = N->getOperand(0);
51182 SDValue N1 = N->getOperand(1);
51183
51184 // We should be performing an xor against a truncated shift.
51185 if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
51186 return SDValue();
51187
51188 // Make sure we are performing an xor against one.
51189 if (!isOneConstant(N1))
51190 return SDValue();
51191
51192 // SetCC on x86 zero extends so only act on this if it's a logical shift.
51193 SDValue Shift = N0.getOperand(0);
51194 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
51195 return SDValue();
51196
51197 // Make sure we are truncating from one of i16, i32 or i64.
51198 EVT ShiftTy = Shift.getValueType();
51199 if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
51200 return SDValue();
51201
51202 // Make sure the shift amount extracts the sign bit.
51203 if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
51204 Shift.getConstantOperandAPInt(1) != (ShiftTy.getSizeInBits() - 1))
51205 return SDValue();
51206
51207 // Create a greater-than comparison against -1.
51208 // N.B. Using SETGE against 0 works but we want a canonical-looking
51209 // comparison; using SETGT matches up with what TranslateX86CC expects.
51210 SDLoc DL(N);
51211 SDValue ShiftOp = Shift.getOperand(0);
51212 EVT ShiftOpTy = ShiftOp.getValueType();
51213 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51214 EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
51215 *DAG.getContext(), ResultType);
51216 SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
51217 DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
51218 if (SetCCResultType != ResultType)
51219 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
51220 return Cond;
51221}
51222
51223/// Turn vector tests of the signbit in the form of:
51224/// xor (sra X, elt_size(X)-1), -1
51225/// into:
51226/// pcmpgt X, -1
51227///
51228/// This should be called before type legalization because the pattern may not
51229/// persist after that.
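// For example, per i32 lane, (sra X, 31) is all-ones for negative elements
// and zero otherwise; xoring with -1 flips that, so a lane is all-ones exactly
// when its element is >= 0, which is what "pcmpgt X, -1" computes directly.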
51230static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
51231 const X86Subtarget &Subtarget) {
51232 EVT VT = N->getValueType(0);
51233 if (!VT.isSimple())
51234 return SDValue();
51235
51236 switch (VT.getSimpleVT().SimpleTy) {
51237 default: return SDValue();
51238 case MVT::v16i8:
51239 case MVT::v8i16:
51240 case MVT::v4i32:
51241 case MVT::v2i64: if (!Subtarget.hasSSE2()) return SDValue(); break;
51242 case MVT::v32i8:
51243 case MVT::v16i16:
51244 case MVT::v8i32:
51245 case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
51246 }
51247
51248 // There must be a shift right algebraic before the xor, and the xor must be a
51249 // 'not' operation.
51250 SDValue Shift = N->getOperand(0);
51251 SDValue Ones = N->getOperand(1);
51252 if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
51253 !ISD::isBuildVectorAllOnes(Ones.getNode()))
51254 return SDValue();
51255
51256 // The shift should be smearing the sign bit across each vector element.
51257 auto *ShiftAmt =
51258 isConstOrConstSplat(Shift.getOperand(1), /*AllowUndefs*/ true);
51259 if (!ShiftAmt ||
51260 ShiftAmt->getAPIntValue() != (Shift.getScalarValueSizeInBits() - 1))
51261 return SDValue();
51262
51263 // Create a greater-than comparison against -1. We don't use the more obvious
51264 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
51265 return DAG.getSetCC(SDLoc(N), VT, Shift.getOperand(0), Ones, ISD::SETGT);
51266}
51267
51268/// Detect patterns of truncation with unsigned saturation:
51269///
51270/// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
51271/// Return the source value x to be truncated or SDValue() if the pattern was
51272/// not matched.
51273///
51274/// 2. (truncate (smin (smax (x, C1), C2)) to dest_type),
51275/// where C1 >= 0 and C2 is unsigned max of destination type.
51276///
51277/// (truncate (smax (smin (x, C2), C1)) to dest_type)
51278/// where C1 >= 0, C2 is unsigned max of destination type and C1 <= C2.
51279///
51280/// These two patterns are equivalent to:
51281/// (truncate (umin (smax(x, C1), unsigned_max_of_dest_type)) to dest_type)
51282/// So return the smax(x, C1) value to be truncated or SDValue() if the
51283/// pattern was not matched.
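// For example, for an i32 -> i8 truncation, form 1 is trunc(umin(x, 255)) and
// form 2 is e.g. trunc(smin(smax(x, 0), 255)); since smax(x, 0) is
// non-negative, the signed min against 255 behaves like an unsigned min, so
// returning smax(x, 0) as the value to truncate preserves the saturation.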
51284static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
51285 const SDLoc &DL) {
51286 EVT InVT = In.getValueType();
51287
51288 // Saturation with truncation. We truncate from InVT to VT.
51289 assert(InVT.getScalarSizeInBits() > VT.getScalarSizeInBits() &&
51290 "Unexpected types for truncate operation");
51291
51292 // Match min/max and return limit value as a parameter.
51293 auto MatchMinMax = [](SDValue V, unsigned Opcode, APInt &Limit) -> SDValue {
51294 if (V.getOpcode() == Opcode &&
51295 ISD::isConstantSplatVector(V.getOperand(1).getNode(), Limit))
51296 return V.getOperand(0);
51297 return SDValue();
51298 };
51299
51300 APInt C1, C2;
51301 if (SDValue UMin = MatchMinMax(In, ISD::UMIN, C2))
51302 // C2 should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to
51303 // the element size of the destination type.
51304 if (C2.isMask(VT.getScalarSizeInBits()))
51305 return UMin;
51306
51307 if (SDValue SMin = MatchMinMax(In, ISD::SMIN, C2))
51308 if (MatchMinMax(SMin, ISD::SMAX, C1))
51309 if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()))
51310 return SMin;
51311
51312 if (SDValue SMax = MatchMinMax(In, ISD::SMAX, C1))
51313 if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, C2))
51314 if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()) &&
51315 C2.uge(C1)) {
51316 return DAG.getNode(ISD::SMAX, DL, InVT, SMin, In.getOperand(1));
51317 }
51318
51319 return SDValue();
51320}
51321
51322/// Detect patterns of truncation with signed saturation:
51323/// (truncate (smin ((smax (x, signed_min_of_dest_type)),
51324/// signed_max_of_dest_type)) to dest_type)
51325/// or:
51326/// (truncate (smax ((smin (x, signed_max_of_dest_type)),
51327/// signed_min_of_dest_type)) to dest_type).
51328/// With MatchPackUS, the smax/smin range is [0, unsigned_max_of_dest_type].
51329/// Return the source value to be truncated or SDValue() if the pattern was not
51330/// matched.
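// For example, for an i32 -> i8 truncation the matched clamp is
// smin(smax(x, -128), 127) (or with the smax/smin order swapped); with
// MatchPackUS the bounds become 0 and 255, which is the range the PACKUS
// instructions saturate to.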
51331static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {
51332 unsigned NumDstBits = VT.getScalarSizeInBits();
51333 unsigned NumSrcBits = In.getScalarValueSizeInBits();
51334 assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation");
51335
51336 auto MatchMinMax = [](SDValue V, unsigned Opcode,
51337 const APInt &Limit) -> SDValue {
51338 APInt C;
51339 if (V.getOpcode() == Opcode &&
51340 ISD::isConstantSplatVector(V.getOperand(1).getNode(), C) && C == Limit)
51341 return V.getOperand(0);
51342 return SDValue();
51343 };
51344
51345 APInt SignedMax, SignedMin;
51346 if (MatchPackUS) {
51347 SignedMax = APInt::getAllOnes(NumDstBits).zext(NumSrcBits);
51348 SignedMin = APInt(NumSrcBits, 0);
51349 } else {
51350 SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);
51351 SignedMin = APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits);
51352 }
51353
51354 if (SDValue SMin = MatchMinMax(In, ISD::SMIN, SignedMax))
51355 if (SDValue SMax = MatchMinMax(SMin, ISD::SMAX, SignedMin))
51356 return SMax;
51357
51358 if (SDValue SMax = MatchMinMax(In, ISD::SMAX, SignedMin))
51359 if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, SignedMax))
51360 return SMin;
51361
51362 return SDValue();
51363}
51364
51365static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
51366 SelectionDAG &DAG,
51367 const X86Subtarget &Subtarget) {
51368 if (!Subtarget.hasSSE2() || !VT.isVector())
51369 return SDValue();
51370
51371 EVT SVT = VT.getVectorElementType();
51372 EVT InVT = In.getValueType();
51373 EVT InSVT = InVT.getVectorElementType();
51374
51375 // If we're clamping a signed 32-bit vector to 0-255 and the 32-bit vector is
51376 // split across two registers, we can use a packusdw+perm to clamp to 0-65535
51377 // and concatenate at the same time. Then we can use a final vpmovuswb to
51378 // clip to 0-255.
51379 if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
51380 InVT == MVT::v16i32 && VT == MVT::v16i8) {
51381 if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
51382 // Emit a VPACKUSDW+VPERMQ followed by a VPMOVUSWB.
51383 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKUS, MVT::v16i16, USatVal,
51384 DL, DAG, Subtarget);
51385 assert(Mid && "Failed to pack!");
51386 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Mid);
51387 }
51388 }
51389
51390 // vXi32 truncate instructions are available with AVX512F.
51391 // vXi16 truncate instructions are only available with AVX512BW.
51392 // For 256-bit or smaller vectors, we require VLX.
51393 // FIXME: We could widen truncates to 512 to remove the VLX restriction.
51395 // If the result type is 256 bits or larger and we have disabled 512-bit
51396 // registers, we should go ahead and use the pack instructions if possible.
51396 bool PreferAVX512 = ((Subtarget.hasAVX512() && InSVT == MVT::i32) ||
51397 (Subtarget.hasBWI() && InSVT == MVT::i16)) &&
51398 (InVT.getSizeInBits() > 128) &&
51399 (Subtarget.hasVLX() || InVT.getSizeInBits() > 256) &&
51400 !(!Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256);
51401
51402 if (isPowerOf2_32(VT.getVectorNumElements()) && !PreferAVX512 &&
51403 VT.getSizeInBits() >= 64 &&
51404 (SVT == MVT::i8 || SVT == MVT::i16) &&
51405 (InSVT == MVT::i16 || InSVT == MVT::i32)) {
51406 if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
51407 // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
51408 // Only do this when the result is at least 64 bits or we'll be leaving
51409 // dangling PACKSSDW nodes.
51410 if (SVT == MVT::i8 && InSVT == MVT::i32) {
51411 EVT MidVT = VT.changeVectorElementType(MVT::i16);
51412 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
51413 DAG, Subtarget);
51414 assert(Mid && "Failed to pack!");
51415 SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG,
51416 Subtarget);
51417 assert(V && "Failed to pack!");
51418 return V;
51419 } else if (SVT == MVT::i8 || Subtarget.hasSSE41())
51420 return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
51421 Subtarget);
51422 }
51423 if (SDValue SSatVal = detectSSatPattern(In, VT))
51424 return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
51425 Subtarget);
51426 }
51427
51428 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51429 if (TLI.isTypeLegal(InVT) && InVT.isVector() && SVT != MVT::i1 &&
51430 Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI()) &&
51431 (SVT == MVT::i32 || SVT == MVT::i16 || SVT == MVT::i8)) {
51432 unsigned TruncOpc = 0;
51433 SDValue SatVal;
51434 if (SDValue SSatVal = detectSSatPattern(In, VT)) {
51435 SatVal = SSatVal;
51436 TruncOpc = X86ISD::VTRUNCS;
51437 } else if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL)) {
51438 SatVal = USatVal;
51439 TruncOpc = X86ISD::VTRUNCUS;
51440 }
51441 if (SatVal) {
51442 unsigned ResElts = VT.getVectorNumElements();
51443 // If the input type is less than 512 bits and we don't have VLX, we need
51444 // to widen to 512 bits.
51445 if (!Subtarget.hasVLX() && !InVT.is512BitVector()) {
51446 unsigned NumConcats = 512 / InVT.getSizeInBits();
51447 ResElts *= NumConcats;
51448 SmallVector<SDValue, 4> ConcatOps(NumConcats, DAG.getUNDEF(InVT));
51449 ConcatOps[0] = SatVal;
51450 InVT = EVT::getVectorVT(*DAG.getContext(), InSVT,
51451 NumConcats * InVT.getVectorNumElements());
51452 SatVal = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, ConcatOps);
51453 }
51454 // Widen the result if it's narrower than 128 bits.
51455 if (ResElts * SVT.getSizeInBits() < 128)
51456 ResElts = 128 / SVT.getSizeInBits();
51457 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), SVT, ResElts);
51458 SDValue Res = DAG.getNode(TruncOpc, DL, TruncVT, SatVal);
51459 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
51460 DAG.getIntPtrConstant(0, DL));
51461 }
51462 }
51463
51464 return SDValue();
51465}
51466
51467/// This function detects the AVG pattern between vectors of unsigned i8/i16,
51468/// which is c = (a + b + 1) / 2, and replaces this operation with the efficient
51469/// ISD::AVGCEILU (AVG) instruction.
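// For example, with unsigned i8 inputs a = 200 and b = 101, (a + b + 1) / 2 =
// 302 / 2 = 151; the sum overflows i8 but not the widened type, and PAVGB
// (ISD::AVGCEILU) produces the same rounded-up average without the explicit
// zext/add/lshr/trunc sequence.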
51470static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
51471 const X86Subtarget &Subtarget,
51472 const SDLoc &DL) {
51473 if (!VT.isVector())
51474 return SDValue();
51475 EVT InVT = In.getValueType();
51476 unsigned NumElems = VT.getVectorNumElements();
51477
51478 EVT ScalarVT = VT.getVectorElementType();
51479 if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) && NumElems >= 2))
51480 return SDValue();
51481
51482 // InScalarVT is the intermediate type in the AVG pattern and it should be
51483 // wider than the original input type (i8/i16).
51484 EVT InScalarVT = InVT.getVectorElementType();
51485 if (InScalarVT.getFixedSizeInBits() <= ScalarVT.getFixedSizeInBits())
51486 return SDValue();
51487
51488 if (!Subtarget.hasSSE2())
51489 return SDValue();
51490
51491 // Detect the following pattern:
51492 //
51493 // %1 = zext <N x i8> %a to <N x i32>
51494 // %2 = zext <N x i8> %b to <N x i32>
51495 // %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
51496 // %4 = add nuw nsw <N x i32> %3, %2
51497 // %5 = lshr <N x i32> %4, <i32 1 x N>
51498 // %6 = trunc <N x i32> %5 to <N x i8>
51499 //
51500 // In AVX512, the last instruction can also be a trunc store.
51501 if (In.getOpcode() != ISD::SRL)
51502 return SDValue();
51503
51504 // A lambda checking the given SDValue is a constant vector and each element
51505 // is in the range [Min, Max].
51506 auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
51507 return ISD::matchUnaryPredicate(V, [Min, Max](ConstantSDNode *C) {
51508 return !(C->getAPIntValue().ult(Min) || C->getAPIntValue().ugt(Max));
51509 });
51510 };
51511
51512 auto IsZExtLike = [DAG = &DAG, ScalarVT](SDValue V) {
51513 unsigned MaxActiveBits = DAG->computeKnownBits(V).countMaxActiveBits();
51514 return MaxActiveBits <= ScalarVT.getSizeInBits();
51515 };
51516
51517 // Check if each element of the vector is right-shifted by one.
51518 SDValue LHS = In.getOperand(0);
51519 SDValue RHS = In.getOperand(1);
51520 if (!IsConstVectorInRange(RHS, 1, 1))
51521 return SDValue();
51522 if (LHS.getOpcode() != ISD::ADD)
51523 return SDValue();
51524
51525 // Detect a pattern of a + b + 1 where the order doesn't matter.
51526 SDValue Operands[3];
51527 Operands[0] = LHS.getOperand(0);
51528 Operands[1] = LHS.getOperand(1);
51529
51530 auto AVGBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
51531 ArrayRef<SDValue> Ops) {
51532 return DAG.getNode(ISD::AVGCEILU, DL, Ops[0].getValueType(), Ops);
51533 };
51534
51535 auto AVGSplitter = [&](std::array<SDValue, 2> Ops) {
51536 for (SDValue &Op : Ops)
51537 if (Op.getValueType() != VT)
51538 Op = DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
51539 // Pad to a power-of-2 vector, split+apply and extract the original vector.
51540 unsigned NumElemsPow2 = PowerOf2Ceil(NumElems);
51541 EVT Pow2VT = EVT::getVectorVT(*DAG.getContext(), ScalarVT, NumElemsPow2);
51542 if (NumElemsPow2 != NumElems) {
51543 for (SDValue &Op : Ops) {
51544 SmallVector<SDValue, 32> EltsOfOp(NumElemsPow2, DAG.getUNDEF(ScalarVT));
51545 for (unsigned i = 0; i != NumElems; ++i) {
51546 SDValue Idx = DAG.getIntPtrConstant(i, DL);
51547 EltsOfOp[i] =
51548 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Op, Idx);
51549 }
51550 Op = DAG.getBuildVector(Pow2VT, DL, EltsOfOp);
51551 }
51552 }
51553 SDValue Res = SplitOpsAndApply(DAG, Subtarget, DL, Pow2VT, Ops, AVGBuilder);
51554 if (NumElemsPow2 == NumElems)
51555 return Res;
51556 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
51557 DAG.getIntPtrConstant(0, DL));
51558 };
51559
51560 // Take care of the case when one of the operands is a constant vector whose
51561 // element is in the range [1, 256].
51562 if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
51563 IsZExtLike(Operands[0])) {
51564 // The pattern is detected. Subtract one from the constant vector, then
51565 // demote it and emit the AVG instruction.
51566 SDValue VecOnes = DAG.getConstant(1, DL, InVT);
51567 Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
51568 return AVGSplitter({Operands[0], Operands[1]});
51569 }
51570
51571 // Matches 'add like' patterns: add(Op0,Op1) + zext(or(Op0,Op1)).
51572 // Match the or case only if it's 'add-like' - can be replaced by an add.
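// For example, Op0 = 0b0100 and Op1 = 0b0011 share no set bits, so
// (Op0 | Op1) and (Op0 + Op1) are both 0b0111; haveNoCommonBitsSet is what
// licenses treating the zero-extended or as an add here.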
51573 auto FindAddLike = [&](SDValue V, SDValue &Op0, SDValue &Op1) {
51574 if (ISD::ADD == V.getOpcode()) {
51575 Op0 = V.getOperand(0);
51576 Op1 = V.getOperand(1);
51577 return true;
51578 }
51579 if (ISD::ZERO_EXTEND != V.getOpcode())
51580 return false;
51581 V = V.getOperand(0);
51582 if (V.getValueType() != VT || ISD::OR != V.getOpcode() ||
51583 !DAG.haveNoCommonBitsSet(V.getOperand(0), V.getOperand(1)))
51584 return false;
51585 Op0 = V.getOperand(0);
51586 Op1 = V.getOperand(1);
51587 return true;
51588 };
51589
51590 SDValue Op0, Op1;
51591 if (FindAddLike(Operands[0], Op0, Op1))
51592 std::swap(Operands[0], Operands[1]);
51593 else if (!FindAddLike(Operands[1], Op0, Op1))
51594 return SDValue();
51595 Operands[2] = Op0;
51596 Operands[1] = Op1;
51597
51598 // Now we have three operands of two additions. Check that one of them is a
51599 // constant vector with ones, and the other two can be promoted from i8/i16.
51600 for (SDValue &Op : Operands) {
51601 if (!IsConstVectorInRange(Op, 1, 1))
51602 continue;
51603 std::swap(Op, Operands[2]);
51604
51605 // Check if Operands[0] and Operands[1] are results of type promotion.
51606 for (int j = 0; j < 2; ++j)
51607 if (Operands[j].getValueType() != VT)
51608 if (!IsZExtLike(Operands[j]))
51609 return SDValue();
51610
51611 // The pattern is detected; emit the AVG instruction(s).
51612 return AVGSplitter({Operands[0], Operands[1]});
51613 }
51614
51615 return SDValue();
51616}
51617
51618static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
51619 TargetLowering::DAGCombinerInfo &DCI,
51620 const X86Subtarget &Subtarget) {
51621 LoadSDNode *Ld = cast<LoadSDNode>(N);
51622 EVT RegVT = Ld->getValueType(0);
51623 EVT MemVT = Ld->getMemoryVT();
51624 SDLoc dl(Ld);
51625 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51626
51627 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
51628 // into two 16-byte operations. Also split non-temporal aligned loads on
51629 // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
51630 ISD::LoadExtType Ext = Ld->getExtensionType();
51631 unsigned Fast;
51632 if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
51633 Ext == ISD::NON_EXTLOAD &&
51634 ((Ld->isNonTemporal() && !Subtarget.hasInt256() &&
51635 Ld->getAlign() >= Align(16)) ||
51636 (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
51637 *Ld->getMemOperand(), &Fast) &&
51638 !Fast))) {
51639 unsigned NumElems = RegVT.getVectorNumElements();
51640 if (NumElems < 2)
51641 return SDValue();
51642
51643 unsigned HalfOffset = 16;
51644 SDValue Ptr1 = Ld->getBasePtr();
51645 SDValue Ptr2 =
51646 DAG.getMemBasePlusOffset(Ptr1, TypeSize::Fixed(HalfOffset), dl);
51647 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
51648 NumElems / 2);
51649 SDValue Load1 =
51650 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(),
51651 Ld->getOriginalAlign(),
51652 Ld->getMemOperand()->getFlags());
51653 SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2,
51654 Ld->getPointerInfo().getWithOffset(HalfOffset),
51655 Ld->getOriginalAlign(),
51656 Ld->getMemOperand()->getFlags());
51657 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
51658 Load1.getValue(1), Load2.getValue(1));
51659
51660 SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
51661 return DCI.CombineTo(N, NewVec, TF, true);
51662 }
51663
51664 // Bool vector load - attempt to cast to an integer, as we have good
51665 // (vXiY *ext(vXi1 bitcast(iX))) handling.
51666 if (Ext == ISD::NON_EXTLOAD && !Subtarget.hasAVX512() && RegVT.isVector() &&
51667 RegVT.getScalarType() == MVT::i1 && DCI.isBeforeLegalize()) {
51668 unsigned NumElts = RegVT.getVectorNumElements();
51669 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
51670 if (TLI.isTypeLegal(IntVT)) {
51671 SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(),
51672 Ld->getPointerInfo(),
51673 Ld->getOriginalAlign(),
51674 Ld->getMemOperand()->getFlags());
51675 SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad);
51676 return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true);
51677 }
51678 }
51679
51680 // If we also broadcast this as a subvector to a wider type, then just extract
51681 // the lowest subvector.
51682 if (Ext == ISD::NON_EXTLOAD && Subtarget.hasAVX() && Ld->isSimple() &&
51683 (RegVT.is128BitVector() || RegVT.is256BitVector())) {
51684 SDValue Ptr = Ld->getBasePtr();
51685 SDValue Chain = Ld->getChain();
51686 for (SDNode *User : Ptr->uses()) {
51687 if (User != N && User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
51688 cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
51689 cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
51690 cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
51691 MemVT.getSizeInBits() &&
51692 !User->hasAnyUseOfValue(1) &&
51693 User->getValueSizeInBits(0).getFixedValue() >
51694 RegVT.getFixedSizeInBits()) {
51695 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
51696 RegVT.getSizeInBits());
51697 Extract = DAG.getBitcast(RegVT, Extract);
51698 return DCI.CombineTo(N, Extract, SDValue(User, 1));
51699 }
51700 }
51701 }
51702
51703 // Cast ptr32 and ptr64 pointers to the default address space before a load.
51704 unsigned AddrSpace = Ld->getAddressSpace();
51705 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
51706 AddrSpace == X86AS::PTR32_UPTR) {
51707 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
51708 if (PtrVT != Ld->getBasePtr().getSimpleValueType()) {
51709 SDValue Cast =
51710 DAG.getAddrSpaceCast(dl, PtrVT, Ld->getBasePtr(), AddrSpace, 0);
51711 return DAG.getLoad(RegVT, dl, Ld->getChain(), Cast, Ld->getPointerInfo(),
51712 Ld->getOriginalAlign(),
51713 Ld->getMemOperand()->getFlags());
51714 }
51715 }
51716
51717 return SDValue();
51718}
51719
51720/// If V is a build vector of boolean constants and exactly one of those
51721/// constants is true, return the operand index of that true element.
51722/// Otherwise, return -1.
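// For example, a v4i1 build vector <0, 0, 1, 0> yields index 2, while
// <1, 0, 1, 0> yields -1 because two elements are true; undef elements are
// skipped rather than counted.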
51723static int getOneTrueElt(SDValue V) {
51724 // This needs to be a build vector of booleans.
51725 // TODO: Checking for the i1 type matches the IR definition for the mask,
51726 // but the mask check could be loosened to i8 or other types. That might
51727 // also require checking more than 'allOnesValue'; eg, the x86 HW
51728 // instructions only require that the MSB is set for each mask element.
51729 // The ISD::MSTORE comments/definition do not specify how the mask operand
51730 // is formatted.
51731 auto *BV = dyn_cast<BuildVectorSDNode>(V);
51732 if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
51733 return -1;
51734
51735 int TrueIndex = -1;
51736 unsigned NumElts = BV->getValueType(0).getVectorNumElements();
51737 for (unsigned i = 0; i < NumElts; ++i) {
51738 const SDValue &Op = BV->getOperand(i);
51739 if (Op.isUndef())
51740 continue;
51741 auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
51742 if (!ConstNode)
51743 return -1;
51744 if (ConstNode->getAPIntValue().countr_one() >= 1) {
51745 // If we already found a one, this is too many.
51746 if (TrueIndex >= 0)
51747 return -1;
51748 TrueIndex = i;
51749 }
51750 }
51751 return TrueIndex;
51752}
51753
51754/// Given a masked memory load/store operation, return true if it has one mask
51755/// bit set. If it has one mask bit set, then also return the memory address of
51756/// the scalar element to load/store, the vector index to insert/extract that
51757/// scalar element, and the alignment for the scalar memory access.
51758static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
51759 SelectionDAG &DAG, SDValue &Addr,
51760 SDValue &Index, Align &Alignment,
51761 unsigned &Offset) {
51762 int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
51763 if (TrueMaskElt < 0)
51764 return false;
51765
51766 // Get the address of the one scalar element that is specified by the mask
51767 // using the appropriate offset from the base pointer.
51768 EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
51769 Offset = 0;
51770 Addr = MaskedOp->getBasePtr();
51771 if (TrueMaskElt != 0) {
51772 Offset = TrueMaskElt * EltVT.getStoreSize();
51773 Addr = DAG.getMemBasePlusOffset(Addr, TypeSize::Fixed(Offset),
51774 SDLoc(MaskedOp));
51775 }
51776
51777 Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
51778 Alignment = commonAlignment(MaskedOp->getOriginalAlign(),
51779 EltVT.getStoreSize());
51780 return true;
51781}
51782
51783/// If exactly one element of the mask is set for a non-extending masked load,
51784/// it is a scalar load and vector insert.
51785/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
51786/// mask have already been optimized in IR, so we don't bother with those here.
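// For example, a v4i32 masked load with mask <0, 0, 1, 0> becomes a plain i32
// load from base + 8 followed by an INSERT_VECTOR_ELT of that scalar into the
// pass-through vector at index 2.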
51787static SDValue
51788reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
51789 TargetLowering::DAGCombinerInfo &DCI,
51790 const X86Subtarget &Subtarget) {
51791 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
51792 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
51793 // However, some target hooks may need to be added to know when the transform
51794 // is profitable. Endianness would also have to be considered.
51795
51796 SDValue Addr, VecIndex;
51797 Align Alignment;
51798 unsigned Offset;
51799 if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment, Offset))
51800 return SDValue();
51801
51802 // Load the one scalar element that is specified by the mask using the
51803 // appropriate offset from the base pointer.
51804 SDLoc DL(ML);
51805 EVT VT = ML->getValueType(0);
51806 EVT EltVT = VT.getVectorElementType();
51807
51808 EVT CastVT = VT;
51809 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
51810 EltVT = MVT::f64;
51811 CastVT = VT.changeVectorElementType(EltVT);
51812 }
51813
51814 SDValue Load =
51815 DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
51816 ML->getPointerInfo().getWithOffset(Offset),
51817 Alignment, ML->getMemOperand()->getFlags());
51818
51819 SDValue PassThru = DAG.getBitcast(CastVT, ML->getPassThru());
51820
51821 // Insert the loaded element into the appropriate place in the vector.
51822 SDValue Insert =
51823 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, CastVT, PassThru, Load, VecIndex);
51824 Insert = DAG.getBitcast(VT, Insert);
51825 return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
51826}
51827
51828static SDValue
51829combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
51830 TargetLowering::DAGCombinerInfo &DCI) {
51831 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
51832 if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
51833 return SDValue();
51834
51835 SDLoc DL(ML);
51836 EVT VT = ML->getValueType(0);
51837
51838 // If we are loading the first and last elements of a vector, it is safe and
51839 // always faster to load the whole vector. Replace the masked load with a
51840 // vector load and select.
51841 unsigned NumElts = VT.getVectorNumElements();
51842 BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
51843 bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
51844 bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
51845 if (LoadFirstElt && LoadLastElt) {
51846 SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
51847 ML->getMemOperand());
51848 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd,
51849 ML->getPassThru());
51850 return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
51851 }
51852
51853 // Convert a masked load with a constant mask into a masked load and a select.
51854 // This allows the select operation to use a faster kind of select instruction
51855 // (for example, vblendvps -> vblendps).
51856
51857 // Don't try this if the pass-through operand is already undefined. That would
51858 // cause an infinite loop because that's what we're about to create.
51859 if (ML->getPassThru().isUndef())
51860 return SDValue();
51861
51862 if (ISD::isBuildVectorAllZeros(ML->getPassThru().getNode()))
51863 return SDValue();
51864
51865 // The new masked load has an undef pass-through operand. The select uses the
51866 // original pass-through operand.
51867 SDValue NewML = DAG.getMaskedLoad(
51868 VT, DL, ML->getChain(), ML->getBasePtr(), ML->getOffset(), ML->getMask(),
51869 DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(),
51870 ML->getAddressingMode(), ML->getExtensionType());
51871 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML,
51872 ML->getPassThru());
51873
51874 return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
51875}
51876
51877static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
51878 TargetLowering::DAGCombinerInfo &DCI,
51879 const X86Subtarget &Subtarget) {
51880 auto *Mld = cast<MaskedLoadSDNode>(N);
51881
51882 // TODO: Expanding load with constant mask may be optimized as well.
51883 if (Mld->isExpandingLoad())
51884 return SDValue();
51885
51886 if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
51887 if (SDValue ScalarLoad =
51888 reduceMaskedLoadToScalarLoad(Mld, DAG, DCI, Subtarget))
51889 return ScalarLoad;
51890
51891 // TODO: Do some AVX512 subsets benefit from this transform?
51892 if (!Subtarget.hasAVX512())
51893 if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
51894 return Blend;
51895 }
51896
51897 // If the mask value has been legalized to a non-boolean vector, try to
51898 // simplify ops leading up to it. We only demand the MSB of each lane.
51899 SDValue Mask = Mld->getMask();
51900 if (Mask.getScalarValueSizeInBits() != 1) {
51901 EVT VT = Mld->getValueType(0);
51902 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51903 APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
51904 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
51905 if (N->getOpcode() != ISD::DELETED_NODE)
51906 DCI.AddToWorklist(N);
51907 return SDValue(N, 0);
51908 }
51909 if (SDValue NewMask =
51910 TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
51911 return DAG.getMaskedLoad(
51912 VT, SDLoc(N), Mld->getChain(), Mld->getBasePtr(), Mld->getOffset(),
51913 NewMask, Mld->getPassThru(), Mld->getMemoryVT(), Mld->getMemOperand(),
51914 Mld->getAddressingMode(), Mld->getExtensionType());
51915 }
51916
51917 return SDValue();
51918}
51919
51920/// If exactly one element of the mask is set for a non-truncating masked store,
51921/// it is a vector extract and scalar store.
51922/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
51923/// mask have already been optimized in IR, so we don't bother with those here.
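// For example, a v4f32 masked store with mask <0, 1, 0, 0> becomes an
// EXTRACT_VECTOR_ELT of lane 1 followed by a plain f32 store to base + 4,
// using the alignment that scalar access can guarantee.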
51924static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
51925 SelectionDAG &DAG,
51926 const X86Subtarget &Subtarget) {
51927 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
51928 // However, some target hooks may need to be added to know when the transform
51929 // is profitable. Endianness would also have to be considered.
51930
51931 SDValue Addr, VecIndex;
51932 Align Alignment;
51933 unsigned Offset;
51934 if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment, Offset))
51935 return SDValue();
51936
51937 // Extract the one scalar element that is actually being stored.
51938 SDLoc DL(MS);
51939 SDValue Value = MS->getValue();
51940 EVT VT = Value.getValueType();
51941 EVT EltVT = VT.getVectorElementType();
51942 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
51943 EltVT = MVT::f64;
51944 EVT CastVT = VT.changeVectorElementType(EltVT);
51945 Value = DAG.getBitcast(CastVT, Value);
51946 }
51947 SDValue Extract =
51948 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Value, VecIndex);
51949
51950 // Store that element at the appropriate offset from the base pointer.
51951 return DAG.getStore(MS->getChain(), DL, Extract, Addr,
51952 MS->getPointerInfo().getWithOffset(Offset),
51953 Alignment, MS->getMemOperand()->getFlags());
51954}
51955
51956static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
51957 TargetLowering::DAGCombinerInfo &DCI,
51958 const X86Subtarget &Subtarget) {
51959 MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
51960 if (Mst->isCompressingStore())
51961 return SDValue();
51962
51963 EVT VT = Mst->getValue().getValueType();
51964 SDLoc dl(Mst);
51965 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51966
51967 if (Mst->isTruncatingStore())
51968 return SDValue();
51969
51970 if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG, Subtarget))
51971 return ScalarStore;
51972
51973 // If the mask value has been legalized to a non-boolean vector, try to
51974 // simplify ops leading up to it. We only demand the MSB of each lane.
51975 SDValue Mask = Mst->getMask();
51976 if (Mask.getScalarValueSizeInBits() != 1) {
51977 APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
51978 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
51979 if (N->getOpcode() != ISD::DELETED_NODE)
51980 DCI.AddToWorklist(N);
51981 return SDValue(N, 0);
51982 }
51983 if (SDValue NewMask =
51984 TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
51985 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Mst->getValue(),
51986 Mst->getBasePtr(), Mst->getOffset(), NewMask,
51987 Mst->getMemoryVT(), Mst->getMemOperand(),
51988 Mst->getAddressingMode());
51989 }
51990
51991 SDValue Value = Mst->getValue();
51992 if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
51993 TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
51994 Mst->getMemoryVT())) {
51995 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
51996 Mst->getBasePtr(), Mst->getOffset(), Mask,
51997 Mst->getMemoryVT(), Mst->getMemOperand(),
51998 Mst->getAddressingMode(), true);
51999 }
52000
52001 return SDValue();
52002}
52003
52004static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
52005 TargetLowering::DAGCombinerInfo &DCI,
52006 const X86Subtarget &Subtarget) {
52007 StoreSDNode *St = cast<StoreSDNode>(N);
52008 EVT StVT = St->getMemoryVT();
52009 SDLoc dl(St);
52010 SDValue StoredVal = St->getValue();
52011 EVT VT = StoredVal.getValueType();
52012 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52013
52014 // Convert a store of vXi1 into a store of iX and a bitcast.
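// For example, a store of v8i1 becomes a bitcast to i8 followed by an
// ordinary i8 store, so the boolean vector never needs vector legalization.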
52015 if (!Subtarget.hasAVX512() && VT == StVT && VT.isVector() &&
52016 VT.getVectorElementType() == MVT::i1) {
52017
52018 EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
52019 StoredVal = DAG.getBitcast(NewVT, StoredVal);
52020
52021 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
52022 St->getPointerInfo(), St->getOriginalAlign(),
52023 St->getMemOperand()->getFlags());
52024 }
52025
52026 // If this is a store of a scalar_to_vector to v1i1, just use a scalar store.
52027 // This will avoid a copy to k-register.
52028 if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
52029 StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
52030 StoredVal.getOperand(0).getValueType() == MVT::i8) {
52031 SDValue Val = StoredVal.getOperand(0);
52032 // We must store zeros to the unused bits.
52033 Val = DAG.getZeroExtendInReg(Val, dl, MVT::i1);
52034 return DAG.getStore(St->getChain(), dl, Val,
52035 St->getBasePtr(), St->getPointerInfo(),
52036 St->getOriginalAlign(),
52037 St->getMemOperand()->getFlags());
52038 }
52039
52040 // Widen v2i1/v4i1 stores to v8i1.
52041 if ((VT == MVT::v1i1 || VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
52042 Subtarget.hasAVX512()) {
52043 unsigned NumConcats = 8 / VT.getVectorNumElements();
52044 // We must store zeros to the unused bits.
52045 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, VT));
52046 Ops[0] = StoredVal;
52047 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
52048 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
52049 St->getPointerInfo(), St->getOriginalAlign(),
52050 St->getMemOperand()->getFlags());
52051 }
52052
52053 // Turn vXi1 stores of constants into a scalar store.
52054 if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 ||
52055 VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
52056 ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) {
52057 // If it's a v64i1 store without 64-bit support, we need two stores.
52058 if (!DCI.isBeforeLegalize() && VT == MVT::v64i1 && !Subtarget.is64Bit()) {
52059 SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
52060 StoredVal->ops().slice(0, 32));
52061 Lo = combinevXi1ConstantToInteger(Lo, DAG);
52062 SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl,
52063 StoredVal->ops().slice(32, 32));
52064 Hi = combinevXi1ConstantToInteger(Hi, DAG);
52065
52066 SDValue Ptr0 = St->getBasePtr();
52067 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, TypeSize::Fixed(4), dl);
52068
52069 SDValue Ch0 =
52070 DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
52071 St->getOriginalAlign(),
52072 St->getMemOperand()->getFlags());
52073 SDValue Ch1 =
52074 DAG.getStore(St->getChain(), dl, Hi, Ptr1,
52075 St->getPointerInfo().getWithOffset(4),
52076 St->getOriginalAlign(),
52077 St->getMemOperand()->getFlags());
52078 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
52079 }
52080
52081 StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG);
52082 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
52083 St->getPointerInfo(), St->getOriginalAlign(),
52084 St->getMemOperand()->getFlags());
52085 }
52086
52087 // If we are saving a 32-byte vector and 32-byte stores are slow, such as on
52088 // Sandy Bridge, perform two 16-byte stores.
52089 unsigned Fast;
52090 if (VT.is256BitVector() && StVT == VT &&
52091 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
52092 *St->getMemOperand(), &Fast) &&
52093 !Fast) {
52094 unsigned NumElems = VT.getVectorNumElements();
52095 if (NumElems < 2)
52096 return SDValue();
52097
52098 return splitVectorStore(St, DAG);
52099 }
52100
52101 // Split under-aligned vector non-temporal stores.
52102 if (St->isNonTemporal() && StVT == VT &&
52103 St->getAlign().value() < VT.getStoreSize()) {
52104 // ZMM/YMM nt-stores - either it can be stored as a series of shorter
52105 // vectors or the legalizer can scalarize it to use MOVNTI.
52106 if (VT.is256BitVector() || VT.is512BitVector()) {
52107 unsigned NumElems = VT.getVectorNumElements();
52108 if (NumElems < 2)
52109 return SDValue();
52110 return splitVectorStore(St, DAG);
52111 }
52112
52113 // XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64
52114 // to use MOVNTI.
52115 if (VT.is128BitVector() && Subtarget.hasSSE2()) {
52116 MVT NTVT = Subtarget.hasSSE4A()
52117 ? MVT::v2f64
52118 : (TLI.isTypeLegal(MVT::i64) ? MVT::v2i64 : MVT::v4i32);
52119 return scalarizeVectorStore(St, NTVT, DAG);
52120 }
52121 }
52122
52123 // Try to optimize v16i16->v16i8 truncating stores when BWI is not
52124 // supported, but AVX512F is, by extending to v16i32 and truncating.
52125 if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() &&
52126 St->getValue().getOpcode() == ISD::TRUNCATE &&
52127 St->getValue().getOperand(0).getValueType() == MVT::v16i16 &&
52128 TLI.isTruncStoreLegal(MVT::v16i32, MVT::v16i8) &&
52129 St->getValue().hasOneUse() && !DCI.isBeforeLegalizeOps()) {
52130 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32,
52131 St->getValue().getOperand(0));
52132 return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
52133 MVT::v16i8, St->getMemOperand());
52134 }
52135
52136 // Try to fold a VTRUNCUS or VTRUNCS into a truncating store.
52137 if (!St->isTruncatingStore() &&
52138 (StoredVal.getOpcode() == X86ISD::VTRUNCUS ||
52139 StoredVal.getOpcode() == X86ISD::VTRUNCS) &&
52140 StoredVal.hasOneUse() &&
52141 TLI.isTruncStoreLegal(StoredVal.getOperand(0).getValueType(), VT)) {
52142 bool IsSigned = StoredVal.getOpcode() == X86ISD::VTRUNCS;
52143 return EmitTruncSStore(IsSigned, St->getChain(),
52144 dl, StoredVal.getOperand(0), St->getBasePtr(),
52145 VT, St->getMemOperand(), DAG);
52146 }
52147
52148 // Try to fold an extract_element(VTRUNC) pattern into a truncating store.
52149 if (!St->isTruncatingStore()) {
52150 auto IsExtractedElement = [](SDValue V) {
52151 if (V.getOpcode() == ISD::TRUNCATE && V.hasOneUse())
52152 V = V.getOperand(0);
52153 unsigned Opc = V.getOpcode();
52154 if ((Opc == ISD::EXTRACT_VECTOR_ELT || Opc == X86ISD::PEXTRW) &&
52155 isNullConstant(V.getOperand(1)) && V.hasOneUse() &&
52156 V.getOperand(0).hasOneUse())
52157 return V.getOperand(0);
52158 return SDValue();
52159 };
52160 if (SDValue Extract = IsExtractedElement(StoredVal)) {
52161 SDValue Trunc = peekThroughOneUseBitcasts(Extract);
52162 if (Trunc.getOpcode() == X86ISD::VTRUNC) {
52163 SDValue Src = Trunc.getOperand(0);
52164 MVT DstVT = Trunc.getSimpleValueType();
52165 MVT SrcVT = Src.getSimpleValueType();
52166 unsigned NumSrcElts = SrcVT.getVectorNumElements();
52167 unsigned NumTruncBits = DstVT.getScalarSizeInBits() * NumSrcElts;
52168 MVT TruncVT = MVT::getVectorVT(DstVT.getScalarType(), NumSrcElts);
52169 if (NumTruncBits == VT.getSizeInBits() &&
52170 TLI.isTruncStoreLegal(SrcVT, TruncVT)) {
52171 return DAG.getTruncStore(St->getChain(), dl, Src, St->getBasePtr(),
52172 TruncVT, St->getMemOperand());
52173 }
52174 }
52175 }
52176 }
52177
52178 // Optimize trunc store (of multiple scalars) to shuffle and store.
52179 // First, pack all of the elements in one place. Next, store to memory
52180 // in fewer chunks.
52181 if (St->isTruncatingStore() && VT.isVector()) {
52182 // Check if we can detect an AVG pattern from the truncation. If yes,
52183 // replace the trunc store by a normal store with the result of the AVG
52184 // instruction.
52185 if (DCI.isBeforeLegalize() || TLI.isTypeLegal(St->getMemoryVT()))
52186 if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
52187 Subtarget, dl))
52188 return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
52189 St->getPointerInfo(), St->getOriginalAlign(),
52190 St->getMemOperand()->getFlags());
52191
52192 if (TLI.isTruncStoreLegal(VT, StVT)) {
52193 if (SDValue Val = detectSSatPattern(St->getValue(), St->getMemoryVT()))
52194 return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
52195 dl, Val, St->getBasePtr(),
52196 St->getMemoryVT(), St->getMemOperand(), DAG);
52197 if (SDValue Val = detectUSatPattern(St->getValue(), St->getMemoryVT(),
52198 DAG, dl))
52199 return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
52200 dl, Val, St->getBasePtr(),
52201 St->getMemoryVT(), St->getMemOperand(), DAG);
52202 }
52203
52204 return SDValue();
52205 }
52206
52207 // Cast ptr32 and ptr64 pointers to the default address space before a store.
52208 unsigned AddrSpace = St->getAddressSpace();
52209 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
52210 AddrSpace == X86AS::PTR32_UPTR) {
52211 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
52212 if (PtrVT != St->getBasePtr().getSimpleValueType()) {
52213 SDValue Cast =
52214 DAG.getAddrSpaceCast(dl, PtrVT, St->getBasePtr(), AddrSpace, 0);
52215 return DAG.getStore(St->getChain(), dl, StoredVal, Cast,
52216 St->getPointerInfo(), St->getOriginalAlign(),
52217 St->getMemOperand()->getFlags(), St->getAAInfo());
52218 }
52219 }
52220
52221 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
52222 // the FP state in cases where an emms may be missing.
52223 // A preferable solution to the general problem is to figure out the right
52224 // places to insert EMMS. This qualifies as a quick hack.
52225
52226 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
52227 if (VT.getSizeInBits() != 64)
52228 return SDValue();
52229
52230 const Function &F = DAG.getMachineFunction().getFunction();
52231 bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
52232 bool F64IsLegal =
52233 !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
52234 if ((VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit()) &&
52235 isa<LoadSDNode>(St->getValue()) &&
52236 cast<LoadSDNode>(St->getValue())->isSimple() &&
52237 St->getChain().hasOneUse() && St->isSimple()) {
52238 LoadSDNode *Ld = cast<LoadSDNode>(St->getValue().getNode());
52239
52240 if (!ISD::isNormalLoad(Ld))
52241 return SDValue();
52242
52243 // Avoid the transformation if there are multiple uses of the loaded value.
52244 if (!Ld->hasNUsesOfValue(1, 0))
52245 return SDValue();
52246
52247 SDLoc LdDL(Ld);
52248 SDLoc StDL(N);
52249 // Lower to a single movq load/store pair.
52250 SDValue NewLd = DAG.getLoad(MVT::f64, LdDL, Ld->getChain(),
52251 Ld->getBasePtr(), Ld->getMemOperand());
52252
52253 // Make sure new load is placed in same chain order.
52254 DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
52255 return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
52256 St->getMemOperand());
52257 }
52258
52259 // This is similar to the above case, but here we handle a scalar 64-bit
52260 // integer store that is extracted from a vector on a 32-bit target.
52261 // If we have SSE2, then we can treat it like a floating-point double
52262 // to get past legalization. The execution dependencies fixup pass will
52263 // choose the optimal machine instruction for the store if this really is
52264 // an integer or v2f32 rather than an f64.
52265 if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
52266 St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
52267 SDValue OldExtract = St->getOperand(1);
52268 SDValue ExtOp0 = OldExtract.getOperand(0);
52269 unsigned VecSize = ExtOp0.getValueSizeInBits();
52270 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
52271 SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
52272 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
52273 BitCast, OldExtract.getOperand(1));
52274 return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
52275 St->getPointerInfo(), St->getOriginalAlign(),
52276 St->getMemOperand()->getFlags());
52277 }
52278
52279 return SDValue();
52280}
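// Illustrative sketch of the 32-bit i64 load/store rewrite above (concrete
// operands chosen for clarity, not taken from a real DAG):
//   t1: i64,ch = load t0, %p
//   st: ch    = store t1, %q
// becomes
//   t1': f64,ch = load t0, %p
//   st': ch     = store t1', %q
// so a single MOVQ moves the 64 bits instead of a pair of 32-bit GPR ops.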
52281
52282static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG,
52283 TargetLowering::DAGCombinerInfo &DCI,
52284 const X86Subtarget &Subtarget) {
52285 auto *St = cast<MemIntrinsicSDNode>(N);
52286
52287 SDValue StoredVal = N->getOperand(1);
52288 MVT VT = StoredVal.getSimpleValueType();
52289 EVT MemVT = St->getMemoryVT();
52290
52291 // Figure out which elements we demand.
52292 unsigned StElts = MemVT.getSizeInBits() / VT.getScalarSizeInBits();
52293 APInt DemandedElts = APInt::getLowBitsSet(VT.getVectorNumElements(), StElts);
52294
52295 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52296 if (TLI.SimplifyDemandedVectorElts(StoredVal, DemandedElts, DCI)) {
52297 if (N->getOpcode() != ISD::DELETED_NODE)
52298 DCI.AddToWorklist(N);
52299 return SDValue(N, 0);
52300 }
52301
52302 return SDValue();
52303}
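// Illustrative example (sketch): for an extract-store that writes 64 bits of
// a v4i32 value, MemVT is i64, so StElts == 64 / 32 == 2 and DemandedElts
// covers only the two low elements; the upper elements of StoredVal can be
// simplified away.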
52304
52305/// Return 'true' if this vector operation is "horizontal"
52306/// and return the operands for the horizontal operation in LHS and RHS. A
52307/// horizontal operation performs the binary operation on successive elements
52308/// of its first operand, then on successive elements of its second operand,
52309/// returning the resulting values in a vector. For example, if
52310/// A = < float a0, float a1, float a2, float a3 >
52311/// and
52312/// B = < float b0, float b1, float b2, float b3 >
52313/// then the result of doing a horizontal operation on A and B is
52314/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
52315/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
52316/// A horizontal-op B, for some already available A and B, and if so then LHS is
52317/// set to A, RHS to B, and the routine returns 'true'.
52318static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS,
52319 SelectionDAG &DAG, const X86Subtarget &Subtarget,
52320 bool IsCommutative,
52321 SmallVectorImpl<int> &PostShuffleMask) {
52322 // If either operand is undef, bail out. The binop should be simplified.
52323 if (LHS.isUndef() || RHS.isUndef())
52324 return false;
52325
52326 // Look for the following pattern:
52327 // A = < float a0, float a1, float a2, float a3 >
52328 // B = < float b0, float b1, float b2, float b3 >
52329 // and
52330 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
52331 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
52332 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
52333 // which is A horizontal-op B.
52334
52335 MVT VT = LHS.getSimpleValueType();
52336 assert((VT.is128BitVector() || VT.is256BitVector()) &&
52337 "Unsupported vector type for horizontal add/sub");
52338 unsigned NumElts = VT.getVectorNumElements();
52339
52340 auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1,
52341 SmallVectorImpl<int> &ShuffleMask) {
52342 bool UseSubVector = false;
52343 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
52344 Op.getOperand(0).getValueType().is256BitVector() &&
52345 llvm::isNullConstant(Op.getOperand(1))) {
52346 Op = Op.getOperand(0);
52347 UseSubVector = true;
52348 }
52349 SmallVector<SDValue, 2> SrcOps;
52350 SmallVector<int, 16> SrcMask, ScaledMask;
52351 SDValue BC = peekThroughBitcasts(Op);
52352 if (getTargetShuffleInputs(BC, SrcOps, SrcMask, DAG) &&
52353 !isAnyZero(SrcMask) && all_of(SrcOps, [BC](SDValue Op) {
52354 return Op.getValueSizeInBits() == BC.getValueSizeInBits();
52355 })) {
52356 resolveTargetShuffleInputsAndMask(SrcOps, SrcMask);
52357 if (!UseSubVector && SrcOps.size() <= 2 &&
52358 scaleShuffleElements(SrcMask, NumElts, ScaledMask)) {
52359 N0 = !SrcOps.empty() ? SrcOps[0] : SDValue();
52360 N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue();
52361 ShuffleMask.assign(ScaledMask.begin(), ScaledMask.end());
52362 }
52363 if (UseSubVector && SrcOps.size() == 1 &&
52364 scaleShuffleElements(SrcMask, 2 * NumElts, ScaledMask)) {
52365 std::tie(N0, N1) = DAG.SplitVector(SrcOps[0], SDLoc(Op));
52366 ArrayRef<int> Mask = ArrayRef<int>(ScaledMask).slice(0, NumElts);
52367 ShuffleMask.assign(Mask.begin(), Mask.end());
52368 }
52369 }
52370 };
52371
52372 // View LHS in the form
52373 // LHS = VECTOR_SHUFFLE A, B, LMask
52374 // If LHS is not a shuffle, then pretend it is the identity shuffle:
52375 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
52376 // NOTE: A default initialized SDValue represents an UNDEF of type VT.
52377 SDValue A, B;
52378 SmallVector<int, 16> LMask;
52379 GetShuffle(LHS, A, B, LMask);
52380
52381 // Likewise, view RHS in the form
52382 // RHS = VECTOR_SHUFFLE C, D, RMask
52383 SDValue C, D;
52384 SmallVector<int, 16> RMask;
52385 GetShuffle(RHS, C, D, RMask);
52386
52387 // At least one of the operands should be a vector shuffle.
52388 unsigned NumShuffles = (LMask.empty() ? 0 : 1) + (RMask.empty() ? 0 : 1);
52389 if (NumShuffles == 0)
52390 return false;
52391
52392 if (LMask.empty()) {
52393 A = LHS;
52394 for (unsigned i = 0; i != NumElts; ++i)
52395 LMask.push_back(i);
52396 }
52397
52398 if (RMask.empty()) {
52399 C = RHS;
52400 for (unsigned i = 0; i != NumElts; ++i)
52401 RMask.push_back(i);
52402 }
52403
52404 // If we have a unary mask, ensure the other op is set to null.
52405 if (isUndefOrInRange(LMask, 0, NumElts))
52406 B = SDValue();
52407 else if (isUndefOrInRange(LMask, NumElts, NumElts * 2))
52408 A = SDValue();
52409
52410 if (isUndefOrInRange(RMask, 0, NumElts))
52411 D = SDValue();
52412 else if (isUndefOrInRange(RMask, NumElts, NumElts * 2))
52413 C = SDValue();
52414
52415 // If A and B occur in reverse order in RHS, then canonicalize by commuting
52416 // RHS operands and shuffle mask.
52417 if (A != C) {
52418 std::swap(C, D);
52419 ShuffleVectorSDNode::commuteMask(RMask);
52420 }
52421 // Check that the shuffles are both shuffling the same vectors.
52422 if (!(A == C && B == D))
52423 return false;
52424
52425 PostShuffleMask.clear();
52426 PostShuffleMask.append(NumElts, SM_SentinelUndef);
52427
52428 // LHS and RHS are now:
52429 // LHS = shuffle A, B, LMask
52430 // RHS = shuffle A, B, RMask
52431 // Check that the masks correspond to performing a horizontal operation.
52432 // AVX defines horizontal add/sub to operate independently on 128-bit lanes,
52433 // so we just repeat the inner loop if this is a 256-bit op.
52434 unsigned Num128BitChunks = VT.getSizeInBits() / 128;
52435 unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks;
52436 unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
52437 assert((NumEltsPer128BitChunk % 2 == 0) &&
52438 "Vector type should have an even number of elements in each lane");
52439 for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {
52440 for (unsigned i = 0; i != NumEltsPer128BitChunk; ++i) {
52441 // Ignore undefined components.
52442 int LIdx = LMask[i + j], RIdx = RMask[i + j];
52443 if (LIdx < 0 || RIdx < 0 ||
52444 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
52445 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
52446 continue;
52447
52448 // Check that successive odd/even elements are being operated on. If not,
52449 // this is not a horizontal operation.
52450 if (!((RIdx & 1) == 1 && (LIdx + 1) == RIdx) &&
52451 !((LIdx & 1) == 1 && (RIdx + 1) == LIdx && IsCommutative))
52452 return false;
52453
52454 // Compute the post-shuffle mask index based on where the element
52455 // is stored in the HOP result, and where it needs to be moved to.
52456 int Base = LIdx & ~1u;
52457 int Index = ((Base % NumEltsPer128BitChunk) / 2) +
52458 ((Base % NumElts) & ~(NumEltsPer128BitChunk - 1));
52459
52460 // The low half of the 128-bit result must choose from A.
52461 // The high half of the 128-bit result must choose from B,
52462 // unless B is undef. In that case, we are always choosing from A.
52463 if ((B && Base >= (int)NumElts) || (!B && i >= NumEltsPer64BitChunk))
52464 Index += NumEltsPer64BitChunk;
52465 PostShuffleMask[i + j] = Index;
52466 }
52467 }
52468
52469 SDValue NewLHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
52470 SDValue NewRHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
52471
52472 bool IsIdentityPostShuffle =
52473 isSequentialOrUndefInRange(PostShuffleMask, 0, NumElts, 0);
52474 if (IsIdentityPostShuffle)
52475 PostShuffleMask.clear();
52476
52477 // Avoid 128-bit multi lane shuffles if pre-AVX2 and FP (integer will split).
52478 if (!IsIdentityPostShuffle && !Subtarget.hasAVX2() && VT.isFloatingPoint() &&
52479 isMultiLaneShuffleMask(128, VT.getScalarSizeInBits(), PostShuffleMask))
52480 return false;
52481
52482 // If the source nodes are already used in HorizOps then always accept this.
52483 // Shuffle folding should merge these back together.
52484 bool FoundHorizLHS = llvm::any_of(NewLHS->uses(), [&](SDNode *User) {
52485 return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
52486 });
52487 bool FoundHorizRHS = llvm::any_of(NewRHS->uses(), [&](SDNode *User) {
52488 return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
52489 });
52490 bool ForceHorizOp = FoundHorizLHS && FoundHorizRHS;
52491
52492 // Assume a SingleSource HOP if we only shuffle one input and don't need to
52493 // shuffle the result.
52494 if (!ForceHorizOp &&
52495 !shouldUseHorizontalOp(NewLHS == NewRHS &&
52496 (NumShuffles < 2 || !IsIdentityPostShuffle),
52497 DAG, Subtarget))
52498 return false;
52499
52500 LHS = DAG.getBitcast(VT, NewLHS);
52501 RHS = DAG.getBitcast(VT, NewRHS);
52502 return true;
52503}
52504
52505// Try to synthesize horizontal (f)hadd/hsub from (f)adds/subs of shuffles.
52506static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG,
52507 const X86Subtarget &Subtarget) {
52508 EVT VT = N->getValueType(0);
52509 unsigned Opcode = N->getOpcode();
52510 bool IsAdd = (Opcode == ISD::FADD) || (Opcode == ISD::ADD);
52511 SmallVector<int, 8> PostShuffleMask;
52512
52513 switch (Opcode) {
52514 case ISD::FADD:
52515 case ISD::FSUB:
52516 if ((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
52517 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
52518 SDValue LHS = N->getOperand(0);
52519 SDValue RHS = N->getOperand(1);
52520 auto HorizOpcode = IsAdd ? X86ISD::FHADD : X86ISD::FHSUB;
52521 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
52522 PostShuffleMask)) {
52523 SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
52524 if (!PostShuffleMask.empty())
52525 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
52526 DAG.getUNDEF(VT), PostShuffleMask);
52527 return HorizBinOp;
52528 }
52529 }
52530 break;
52531 case ISD::ADD:
52532 case ISD::SUB:
52533 if (Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
52534 VT == MVT::v16i16 || VT == MVT::v8i32)) {
52535 SDValue LHS = N->getOperand(0);
52536 SDValue RHS = N->getOperand(1);
52537 auto HorizOpcode = IsAdd ? X86ISD::HADD : X86ISD::HSUB;
52538 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
52539 PostShuffleMask)) {
52540 auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL,
52541 ArrayRef<SDValue> Ops) {
52542 return DAG.getNode(HorizOpcode, DL, Ops[0].getValueType(), Ops);
52543 };
52544 SDValue HorizBinOp = SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
52545 {LHS, RHS}, HOpBuilder);
52546 if (!PostShuffleMask.empty())
52547 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
52548 DAG.getUNDEF(VT), PostShuffleMask);
52549 return HorizBinOp;
52550 }
52551 }
52552 break;
52553 }
52554
52555 return SDValue();
52556}
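// Illustrative example (a simplified sketch of the combine above):
//   LHS = vector_shuffle<0,2,4,6> A:v4f32, B:v4f32
//   RHS = vector_shuffle<1,3,5,7> A:v4f32, B:v4f32
//   t   = fadd LHS, RHS
// is matched by isHorizontalBinOp and rebuilt as
//   t   = X86ISD::FHADD A, B   // <a0+a1, a2+a3, b0+b1, b2+b3>
// with an optional post-shuffle if the HOP result still needs reordering.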
52557
52558// Try to combine the following nodes
52559// t29: i64 = X86ISD::Wrapper TargetConstantPool:i64
52560// <i32 -2147483648[float -0.000000e+00]> 0
52561// t27: v16i32[v16f32],ch = X86ISD::VBROADCAST_LOAD
52562// <(load 4 from constant-pool)> t0, t29
52563// [t30: v16i32 = bitcast t27]
52564// t6: v16i32 = xor t7, t27[t30]
52565// t11: v16f32 = bitcast t6
52566// t21: v16f32 = X86ISD::VFMULC[X86ISD::VCFMULC] t11, t8
52567// into X86ISD::VFCMULC[X86ISD::VFMULC] if possible:
52568// t22: v16f32 = bitcast t7
52569// t23: v16f32 = X86ISD::VFCMULC[X86ISD::VFMULC] t8, t22
52570// t24: v32f16 = bitcast t23
52571static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG,
52572 const X86Subtarget &Subtarget) {
52573 EVT VT = N->getValueType(0);
52574 SDValue LHS = N->getOperand(0);
52575 SDValue RHS = N->getOperand(1);
52576 int CombineOpcode =
52577 N->getOpcode() == X86ISD::VFCMULC ? X86ISD::VFMULC : X86ISD::VFCMULC;
52578 auto isConjugationConstant = [](const Constant *c) {
52579 if (const auto *CI = dyn_cast<ConstantInt>(c)) {
52580 APInt ConjugationInt32 = APInt(32, 0x80000000, true);
52581 APInt ConjugationInt64 = APInt(64, 0x8000000080000000ULL, true);
52582 switch (CI->getBitWidth()) {
52583 case 16:
52584 return false;
52585 case 32:
52586 return CI->getValue() == ConjugationInt32;
52587 case 64:
52588 return CI->getValue() == ConjugationInt64;
52589 default:
52590 llvm_unreachable("Unexpected bit width");
52591 }
52592 }
52593 if (const auto *CF = dyn_cast<ConstantFP>(c))
52594 return CF->isNegativeZeroValue();
52595 return false;
52596 };
52597 auto combineConjugation = [&](SDValue &r) {
52598 if (LHS->getOpcode() == ISD::BITCAST && RHS.hasOneUse()) {
52599 SDValue XOR = LHS.getOperand(0);
52600 if (XOR->getOpcode() == ISD::XOR && XOR.hasOneUse()) {
52601 SDValue XORRHS = XOR.getOperand(1);
52602 if (XORRHS.getOpcode() == ISD::BITCAST && XORRHS.hasOneUse())
52603 XORRHS = XORRHS.getOperand(0);
52604 if (XORRHS.getOpcode() == X86ISD::VBROADCAST_LOAD &&
52605 XORRHS.getOperand(1).getNumOperands()) {
52606 ConstantPoolSDNode *CP =
52607 dyn_cast<ConstantPoolSDNode>(XORRHS.getOperand(1).getOperand(0));
52608 if (CP && isConjugationConstant(CP->getConstVal())) {
52609 SelectionDAG::FlagInserter FlagsInserter(DAG, N);
52610 SDValue I2F = DAG.getBitcast(VT, LHS.getOperand(0).getOperand(0));
52611 SDValue FCMulC = DAG.getNode(CombineOpcode, SDLoc(N), VT, RHS, I2F);
52612 r = DAG.getBitcast(VT, FCMulC);
52613 return true;
52614 }
52615 }
52616 }
52617 }
52618 return false;
52619 };
52620 SDValue Res;
52621 if (combineConjugation(Res))
52622 return Res;
52623 std::swap(LHS, RHS);
52624 if (combineConjugation(Res))
52625 return Res;
52626 return Res;
52627}
52628
52629// Try to combine the following nodes:
52630// FADD(A, FMA(B, C, 0)) and FADD(A, FMUL(B, C)) to FMA(B, C, A)
52631static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG,
52632 const X86Subtarget &Subtarget) {
52633 auto AllowContract = [&DAG](const SDNodeFlags &Flags) {
52634 return DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast ||
52635 Flags.hasAllowContract();
52636 };
52637
52638 auto HasNoSignedZero = [&DAG](const SDNodeFlags &Flags) {
52639 return DAG.getTarget().Options.NoSignedZerosFPMath ||
52640 Flags.hasNoSignedZeros();
52641 };
52642 auto IsVectorAllNegativeZero = [](const SDNode *N) {
52643 if (N->getOpcode() != X86ISD::VBROADCAST_LOAD)
52644 return false;
52645 assert(N->getSimpleValueType(0).getScalarType() == MVT::f32 &&
52646 "Unexpected vector type!");
52647 if (ConstantPoolSDNode *CP =
52648 dyn_cast<ConstantPoolSDNode>(N->getOperand(1)->getOperand(0))) {
52649 APInt AI = APInt(32, 0x80008000, true);
52650 if (const auto *CI = dyn_cast<ConstantInt>(CP->getConstVal()))
52651 return CI->getValue() == AI;
52652 if (const auto *CF = dyn_cast<ConstantFP>(CP->getConstVal()))
52653 return CF->getValue() == APFloat(APFloat::IEEEsingle(), AI);
52654 }
52655 return false;
52656 };
52657
52658 if (N->getOpcode() != ISD::FADD || !Subtarget.hasFP16() ||
52659 !AllowContract(N->getFlags()))
52660 return SDValue();
52661
52662 EVT VT = N->getValueType(0);
52663 if (VT != MVT::v8f16 && VT != MVT::v16f16 && VT != MVT::v32f16)
52664 return SDValue();
52665
52666 SDValue LHS = N->getOperand(0);
52667 SDValue RHS = N->getOperand(1);
52668 bool IsConj;
52669 SDValue FAddOp1, MulOp0, MulOp1;
52670 auto GetCFmulFrom = [&MulOp0, &MulOp1, &IsConj, &AllowContract,
52671 &IsVectorAllNegativeZero,
52672 &HasNoSignedZero](SDValue N) -> bool {
52673 if (!N.hasOneUse() || N.getOpcode() != ISD::BITCAST)
52674 return false;
52675 SDValue Op0 = N.getOperand(0);
52676 unsigned Opcode = Op0.getOpcode();
52677 if (Op0.hasOneUse() && AllowContract(Op0->getFlags())) {
52678 if ((Opcode == X86ISD::VFMULC || Opcode == X86ISD::VFCMULC)) {
52679 MulOp0 = Op0.getOperand(0);
52680 MulOp1 = Op0.getOperand(1);
52681 IsConj = Opcode == X86ISD::VFCMULC;
52682 return true;
52683 }
52684 if ((Opcode == X86ISD::VFMADDC || Opcode == X86ISD::VFCMADDC) &&
52685 ((ISD::isBuildVectorAllZeros(Op0->getOperand(2).getNode()) &&
52686 HasNoSignedZero(Op0->getFlags())) ||
52687 IsVectorAllNegativeZero(Op0->getOperand(2).getNode()))) {
52688 MulOp0 = Op0.getOperand(0);
52689 MulOp1 = Op0.getOperand(1);
52690 IsConj = Opcode == X86ISD::VFCMADDC;
52691 return true;
52692 }
52693 }
52694 return false;
52695 };
52696
52697 if (GetCFmulFrom(LHS))
52698 FAddOp1 = RHS;
52699 else if (GetCFmulFrom(RHS))
52700 FAddOp1 = LHS;
52701 else
52702 return SDValue();
52703
52704 MVT CVT = MVT::getVectorVT(MVT::f32, VT.getVectorNumElements() / 2);
52705 FAddOp1 = DAG.getBitcast(CVT, FAddOp1);
52706 unsigned NewOp = IsConj ? X86ISD::VFCMADDC : X86ISD::VFMADDC;
52707 // FIXME: How do we handle the case where the fast math flags of FADD differ
52708 // from CFMUL's?
52709 SDValue CFmul =
52710 DAG.getNode(NewOp, SDLoc(N), CVT, MulOp0, MulOp1, FAddOp1, N->getFlags());
52711 return DAG.getBitcast(VT, CFmul);
52712}
52713
52714/// Do target-specific dag combines on floating-point adds/subs.
52715static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
52716 const X86Subtarget &Subtarget) {
52717 if (SDValue HOp = combineToHorizontalAddSub(N, DAG, Subtarget))
52718 return HOp;
52719
52720 if (SDValue COp = combineFaddCFmul(N, DAG, Subtarget))
52721 return COp;
52722
52723 return SDValue();
52724}
52725
52726/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
52727/// the codegen.
52728/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
52729/// TODO: This overlaps with the generic combiner's visitTRUNCATE. Remove
52730/// anything that is guaranteed to be transformed by DAGCombiner.
52731static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
52732 const X86Subtarget &Subtarget,
52733 const SDLoc &DL) {
52734 assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
52735 SDValue Src = N->getOperand(0);
52736 unsigned SrcOpcode = Src.getOpcode();
52737 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52738
52739 EVT VT = N->getValueType(0);
52740 EVT SrcVT = Src.getValueType();
52741
52742 auto IsFreeTruncation = [VT](SDValue Op) {
52743 unsigned TruncSizeInBits = VT.getScalarSizeInBits();
52744
52745 // See if this has been extended from a smaller/equal size to
52746 // the truncation size, allowing a truncation to combine with the extend.
52747 unsigned Opcode = Op.getOpcode();
52748 if ((Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND ||
52749 Opcode == ISD::ZERO_EXTEND) &&
52750 Op.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
52751 return true;
52752
52753 // See if this is a single use constant which can be constant folded.
52754 // NOTE: We don't peek through bitcasts here because there is currently
52755 // no support for constant folding truncate+bitcast+vector_of_constants. So
52756 // we'll just end up with a truncate on both operands, which will
52757 // get turned back into (truncate (binop)), causing an infinite loop.
52758 return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
52759 };
52760
52761 auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
52762 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
52763 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
52764 return DAG.getNode(SrcOpcode, DL, VT, Trunc0, Trunc1);
52765 };
52766
52767 // Don't combine if the operation has other uses.
52768 if (!Src.hasOneUse())
52769 return SDValue();
52770
52771 // Only support vector truncation for now.
52772 // TODO: i64 scalar math would benefit as well.
52773 if (!VT.isVector())
52774 return SDValue();
52775
52776 // In most cases it's only worth pre-truncating if we're only facing the cost
52777 // of one truncation.
52778 // i.e. if one of the inputs will constant fold or the input is repeated.
52779 switch (SrcOpcode) {
52780 case ISD::MUL:
52781 // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
52782 // better to truncate if we have the chance.
52783 if (SrcVT.getScalarType() == MVT::i64 &&
52784 TLI.isOperationLegal(SrcOpcode, VT) &&
52785 !TLI.isOperationLegal(SrcOpcode, SrcVT))
52786 return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
52787 [[fallthrough]];
52788 case ISD::AND:
52789 case ISD::XOR:
52790 case ISD::OR:
52791 case ISD::ADD:
52792 case ISD::SUB: {
52793 SDValue Op0 = Src.getOperand(0);
52794 SDValue Op1 = Src.getOperand(1);
52795 if (TLI.isOperationLegal(SrcOpcode, VT) &&
52796 (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
52797 return TruncateArithmetic(Op0, Op1);
52798 break;
52799 }
52800 }
52801
52802 return SDValue();
52803}
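// Illustrative example (sketch; concrete types chosen for clarity):
//   t = truncate (mul x:v4i64, y:v4i64) to v4i32
// becomes, when v4i32 MUL is legal but v4i64 MUL is not,
//   t = mul (truncate x to v4i32), (truncate y to v4i32)
// so the arithmetic is performed in the narrower, cheaper type.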
52804
52805/// Truncate using ISD::AND mask and X86ISD::PACKUS.
52806/// e.g. trunc <8 x i32> X to <8 x i16> -->
52807/// MaskX = X & 0xffff (clear high bits to prevent saturation)
52808/// packus (extract_subv MaskX, 0), (extract_subv MaskX, 1)
52809static SDValue combineVectorTruncationWithPACKUS(SDNode *N, const SDLoc &DL,
52810 const X86Subtarget &Subtarget,
52811 SelectionDAG &DAG) {
52812 SDValue In = N->getOperand(0);
52813 EVT InVT = In.getValueType();
52814 EVT OutVT = N->getValueType(0);
52815
52816 APInt Mask = APInt::getLowBitsSet(InVT.getScalarSizeInBits(),
52817 OutVT.getScalarSizeInBits());
52818 In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(Mask, DL, InVT));
52819 return truncateVectorWithPACK(X86ISD::PACKUS, OutVT, In, DL, DAG, Subtarget);
52820}
52821
52822/// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
52823static SDValue combineVectorTruncationWithPACKSS(SDNode *N, const SDLoc &DL,
52824 const X86Subtarget &Subtarget,
52825 SelectionDAG &DAG) {
52826 SDValue In = N->getOperand(0);
52827 EVT InVT = In.getValueType();
52828 EVT OutVT = N->getValueType(0);
52829 In = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, InVT, In,
52830 DAG.getValueType(OutVT));
52831 return truncateVectorWithPACK(X86ISD::PACKSS, OutVT, In, DL, DAG, Subtarget);
52832}
52833
52834/// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
52835/// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
52836/// legalization the truncation will be translated into a BUILD_VECTOR with each
52837 /// element extracted from a vector and then truncated, and it is
52838 /// difficult to perform this optimization on that form.
52839static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
52840 const X86Subtarget &Subtarget) {
52841 EVT OutVT = N->getValueType(0);
52842 if (!OutVT.isVector())
52843 return SDValue();
52844
52845 SDValue In = N->getOperand(0);
52846 if (!In.getValueType().isSimple())
52847 return SDValue();
52848
52849 EVT InVT = In.getValueType();
52850 unsigned NumElems = OutVT.getVectorNumElements();
52851
52852 // AVX512 provides fast truncate ops.
52853 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
52854 return SDValue();
52855
52856 EVT OutSVT = OutVT.getVectorElementType();
52857 EVT InSVT = InVT.getVectorElementType();
52858 if (!((InSVT == MVT::i16 || InSVT == MVT::i32 || InSVT == MVT::i64) &&
52859 (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
52860 NumElems >= 8))
52861 return SDValue();
52862
52863 // SSSE3's pshufb results in fewer instructions in the cases below.
52864 if (Subtarget.hasSSSE3() && NumElems == 8) {
52865 if (InSVT == MVT::i16)
52866 return SDValue();
52867 if (InSVT == MVT::i32 &&
52868 (OutSVT == MVT::i8 || !Subtarget.hasSSE41() || Subtarget.hasInt256()))
52869 return SDValue();
52870 }
52871
52872 SDLoc DL(N);
52873 // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
52874 // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
52875 // truncate 2 x v4i32 to v8i16.
52876 if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
52877 return combineVectorTruncationWithPACKUS(N, DL, Subtarget, DAG);
52878 if (InSVT == MVT::i32)
52879 return combineVectorTruncationWithPACKSS(N, DL, Subtarget, DAG);
52880
52881 return SDValue();
52882}
52883
52884 /// This function transforms vector truncation of 'extended sign-bits' or
52885 /// 'extended zero-bits' values (vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32) into
52886 /// X86ISD::PACKSS/PACKUS operations.
52887static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
52888 SelectionDAG &DAG,
52889 const X86Subtarget &Subtarget) {
52890 // Requires SSE2.
52891 if (!Subtarget.hasSSE2())
52892 return SDValue();
52893
52894 if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
52895 return SDValue();
52896
52897 SDValue In = N->getOperand(0);
52898 if (!In.getValueType().isSimple())
52899 return SDValue();
52900
52901 MVT VT = N->getValueType(0).getSimpleVT();
52902 MVT SVT = VT.getScalarType();
52903
52904 MVT InVT = In.getValueType().getSimpleVT();
52905 MVT InSVT = InVT.getScalarType();
52906
52907 // Check we have a truncation suited for PACKSS/PACKUS.
52908 if (!isPowerOf2_32(VT.getVectorNumElements()))
52909 return SDValue();
52910 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
52911 return SDValue();
52912 if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
52913 return SDValue();
52914
52915 // Truncation to sub-128bit vXi32 can be better handled with shuffles.
52916 if (SVT == MVT::i32 && VT.getSizeInBits() < 128)
52917 return SDValue();
52918
52919 // AVX512 has fast truncate, but if the input is already going to be split,
52920 // there's no harm in trying pack.
52921 if (Subtarget.hasAVX512() &&
52922 !(!Subtarget.useAVX512Regs() && VT.is256BitVector() &&
52923 InVT.is512BitVector())) {
52924 // PACK should still be worth it for 128-bit vectors if the sources were
52925 // originally concatenated from subvectors.
52926 SmallVector<SDValue> ConcatOps;
52927 if (VT.getSizeInBits() > 128 ||
52928 !collectConcatOps(In.getNode(), ConcatOps, DAG))
52929 return SDValue();
52930 }
52931
52932 unsigned NumPackedSignBits = std::min<unsigned>(SVT.getSizeInBits(), 16);
52933 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
52934
52935 // Use PACKUS if the input has zero-bits that extend all the way to the
52936 // packed/truncated value. e.g. masks, zext_in_reg, etc.
52937 KnownBits Known = DAG.computeKnownBits(In);
52938 unsigned NumLeadingZeroBits = Known.countMinLeadingZeros();
52939 if (NumLeadingZeroBits >= (InSVT.getSizeInBits() - NumPackedZeroBits))
52940 return truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget);
52941
52942 // Use PACKSS if the input has sign-bits that extend all the way to the
52943 // packed/truncated value. e.g. Comparison result, sext_in_reg, etc.
52944 unsigned NumSignBits = DAG.ComputeNumSignBits(In);
52945
52946 // Don't use PACKSS for vXi64 -> vXi32 truncations unless we're dealing with
52947 // a sign splat. ComputeNumSignBits struggles to see through BITCASTs later
52948 // on and combines/simplifications can't then use it.
52949 if (SVT == MVT::i32 && NumSignBits != InSVT.getSizeInBits())
52950 return SDValue();
52951
52952 unsigned MinSignBits = InSVT.getSizeInBits() - NumPackedSignBits;
52953 if (NumSignBits > MinSignBits)
52954 return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);
52955
52956 // If we have a srl that only generates signbits that we will discard in
52957 // the truncation then we can use PACKSS by converting the srl to a sra.
52958 // SimplifyDemandedBits often relaxes sra to srl so we need to reverse it.
52959 if (In.getOpcode() == ISD::SRL && N->isOnlyUserOf(In.getNode()))
52960 if (const APInt *ShAmt = DAG.getValidShiftAmountConstant(
52961 In, APInt::getAllOnes(VT.getVectorNumElements()))) {
52962 if (*ShAmt == MinSignBits) {
52963 SDValue NewIn = DAG.getNode(ISD::SRA, DL, InVT, In->ops());
52964 return truncateVectorWithPACK(X86ISD::PACKSS, VT, NewIn, DL, DAG,
52965 Subtarget);
52966 }
52967 }
52968
52969 return SDValue();
52970}
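// Illustrative example (sketch): a comparison result already consists of
// all-ones / all-zeros elements, so every bit is a sign bit:
//   c = setcc x:v4i64, y:v4i64, setgt
//   t = truncate c to v4i32
// can be lowered with PACKSS here, because the discarded bits are copies of
// the sign bit and signed saturation then matches plain truncation.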
52971
52972// Try to form a MULHU or MULHS node by looking for
52973// (trunc (srl (mul ext, ext), 16))
52974// TODO: This is X86 specific because we want to be able to handle wide types
52975// before type legalization. But we can only do it if the vector will be
52976// legalized via widening/splitting. Type legalization can't handle promotion
52977// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
52978// combiner.
52979static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
52980 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
52981 // First instruction should be a right shift of a multiply.
52982 if (Src.getOpcode() != ISD::SRL ||
52983 Src.getOperand(0).getOpcode() != ISD::MUL)
52984 return SDValue();
52985
52986 if (!Subtarget.hasSSE2())
52987 return SDValue();
52988
52989 // Only handle vXi16 types that are at least 128-bits unless they will be
52990 // widened.
52991 if (!VT.isVector() || VT.getVectorElementType() != MVT::i16)
52992 return SDValue();
52993
52994 // Input type should be at least vXi32.
52995 EVT InVT = Src.getValueType();
52996 if (InVT.getVectorElementType().getSizeInBits() < 32)
52997 return SDValue();
52998
52999 // Need a shift by 16.
53000 APInt ShiftAmt;
53001 if (!ISD::isConstantSplatVector(Src.getOperand(1).getNode(), ShiftAmt) ||
53002 ShiftAmt != 16)
53003 return SDValue();
53004
53005 SDValue LHS = Src.getOperand(0).getOperand(0);
53006 SDValue RHS = Src.getOperand(0).getOperand(1);
53007
53008 // Count leading sign/zero bits on both inputs - if there are enough then
53009 // truncation back to vXi16 will be cheap - either as a pack/shuffle
53010 // sequence or using AVX512 truncations. If the inputs are sext/zext then the
53011 // truncations may actually be free by peeking through to the ext source.
53012 auto IsSext = [&DAG](SDValue V) {
53013 return DAG.ComputeMaxSignificantBits(V) <= 16;
53014 };
53015 auto IsZext = [&DAG](SDValue V) {
53016 return DAG.computeKnownBits(V).countMaxActiveBits() <= 16;
53017 };
53018
53019 bool IsSigned = IsSext(LHS) && IsSext(RHS);
53020 bool IsUnsigned = IsZext(LHS) && IsZext(RHS);
53021 if (!IsSigned && !IsUnsigned)
53022 return SDValue();
53023
53024 // Check if both inputs are extensions, which will be removed by truncation.
53025 bool IsTruncateFree = (LHS.getOpcode() == ISD::SIGN_EXTEND ||
53026 LHS.getOpcode() == ISD::ZERO_EXTEND) &&
53027 (RHS.getOpcode() == ISD::SIGN_EXTEND ||
53028 RHS.getOpcode() == ISD::ZERO_EXTEND) &&
53029 LHS.getOperand(0).getScalarValueSizeInBits() <= 16 &&
53030 RHS.getOperand(0).getScalarValueSizeInBits() <= 16;
53031
53032 // For AVX2+ targets, with the upper bits known zero, we can perform MULHU on
53033 // the (bitcasted) inputs directly, and then cheaply pack/truncate the result
53034 // (upper elts will be zero). Don't attempt this with just AVX512F as MULHU
53035 // will have to split anyway.
53036 unsigned InSizeInBits = InVT.getSizeInBits();
53037 if (IsUnsigned && !IsTruncateFree && Subtarget.hasInt256() &&
53038 !(Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.is256BitVector()) &&
53039 (InSizeInBits % 16) == 0) {
53040 EVT BCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
53041 InVT.getSizeInBits() / 16);
53042 SDValue Res = DAG.getNode(ISD::MULHU, DL, BCVT, DAG.getBitcast(BCVT, LHS),
53043 DAG.getBitcast(BCVT, RHS));
53044 return DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getBitcast(InVT, Res));
53045 }
53046
53047 // Truncate back to source type.
53048 LHS = DAG.getNode(ISD::TRUNCATE, DL, VT, LHS);
53049 RHS = DAG.getNode(ISD::TRUNCATE, DL, VT, RHS);
53050
53051 unsigned Opc = IsSigned ? ISD::MULHS : ISD::MULHU;
53052 return DAG.getNode(Opc, DL, VT, LHS, RHS);
53053}
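// Illustrative example (sketch) with x, y : v8i16:
//   t = truncate (srl (mul (zext x to v8i32), (zext y to v8i32)), 16) to v8i16
// is recognized here and emitted as
//   t = mulhu x, y          // PMULHUW
// since shifting each 32-bit product right by 16 selects exactly its
// high half.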
53054
53055// Attempt to match PMADDUBSW, which multiplies corresponding unsigned bytes
53056// from one vector with signed bytes from another vector, adds together
53057// adjacent pairs of 16-bit products, and saturates the result before
53058// truncating to 16-bits.
53059//
53060// Which looks something like this:
53061// (i16 (ssat (add (mul (zext (even elts (i8 A))), (sext (even elts (i8 B)))),
53062// (mul (zext (odd elts (i8 A)), (sext (odd elts (i8 B))))))))
53063static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
53064 const X86Subtarget &Subtarget,
53065 const SDLoc &DL) {
53066 if (!VT.isVector() || !Subtarget.hasSSSE3())
53067 return SDValue();
53068
53069 unsigned NumElems = VT.getVectorNumElements();
53070 EVT ScalarVT = VT.getVectorElementType();
53071 if (ScalarVT != MVT::i16 || NumElems < 8 || !isPowerOf2_32(NumElems))
53072 return SDValue();
53073
53074 SDValue SSatVal = detectSSatPattern(In, VT);
53075 if (!SSatVal || SSatVal.getOpcode() != ISD::ADD)
53076 return SDValue();
53077
53078 // Ok this is a signed saturation of an ADD. See if this ADD is adding pairs
53079 // of multiplies from even/odd elements.
53080 SDValue N0 = SSatVal.getOperand(0);
53081 SDValue N1 = SSatVal.getOperand(1);
53082
53083 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
53084 return SDValue();
53085
53086 SDValue N00 = N0.getOperand(0);
53087 SDValue N01 = N0.getOperand(1);
53088 SDValue N10 = N1.getOperand(0);
53089 SDValue N11 = N1.getOperand(1);
53090
53091 // TODO: Handle constant vectors and use knownbits/computenumsignbits?
53092 // Canonicalize zero_extend to LHS.
53093 if (N01.getOpcode() == ISD::ZERO_EXTEND)
53094 std::swap(N00, N01);
53095 if (N11.getOpcode() == ISD::ZERO_EXTEND)
53096 std::swap(N10, N11);
53097
53098 // Ensure we have a zero_extend and a sign_extend.
53099 if (N00.getOpcode() != ISD::ZERO_EXTEND ||
53100 N01.getOpcode() != ISD::SIGN_EXTEND ||
53101 N10.getOpcode() != ISD::ZERO_EXTEND ||
53102 N11.getOpcode() != ISD::SIGN_EXTEND)
53103 return SDValue();
53104
53105 // Peek through the extends.
53106 N00 = N00.getOperand(0);
53107 N01 = N01.getOperand(0);
53108 N10 = N10.getOperand(0);
53109 N11 = N11.getOperand(0);
53110
53111 // Ensure the extend is from vXi8.
53112 if (N00.getValueType().getVectorElementType() != MVT::i8 ||
53113 N01.getValueType().getVectorElementType() != MVT::i8 ||
53114 N10.getValueType().getVectorElementType() != MVT::i8 ||
53115 N11.getValueType().getVectorElementType() != MVT::i8)
53116 return SDValue();
53117
53118 // All inputs should be build_vectors.
53119 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
53120 N01.getOpcode() != ISD::BUILD_VECTOR ||
53121 N10.getOpcode() != ISD::BUILD_VECTOR ||
53122 N11.getOpcode() != ISD::BUILD_VECTOR)
53123 return SDValue();
53124
53125 // N00/N10 are zero extended. N01/N11 are sign extended.
53126
53127 // For each element, we need to ensure we have an odd element from one vector
53128 // multiplied by the odd element of another vector and the even element from
53129 // one of the same vectors being multiplied by the even element from the
53130 // other vector. So we need to make sure for each element i, this operation
53131 // is being performed:
53132 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
53133 SDValue ZExtIn, SExtIn;
53134 for (unsigned i = 0; i != NumElems; ++i) {
53135 SDValue N00Elt = N00.getOperand(i);
53136 SDValue N01Elt = N01.getOperand(i);
53137 SDValue N10Elt = N10.getOperand(i);
53138 SDValue N11Elt = N11.getOperand(i);
53139 // TODO: Be more tolerant to undefs.
53140 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
53141 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
53142 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
53143 N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
53144 return SDValue();
53145 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
53146 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
53147 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
53148 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
53149 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
53150 return SDValue();
53151 unsigned IdxN00 = ConstN00Elt->getZExtValue();
53152 unsigned IdxN01 = ConstN01Elt->getZExtValue();
53153 unsigned IdxN10 = ConstN10Elt->getZExtValue();
53154 unsigned IdxN11 = ConstN11Elt->getZExtValue();
53155 // Add is commutative so indices can be reordered.
53156 if (IdxN00 > IdxN10) {
53157 std::swap(IdxN00, IdxN10);
53158 std::swap(IdxN01, IdxN11);
53159 }
53160 // N0 indices must be the even element. N1 indices must be the next odd element.
53161 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
53162 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
53163 return SDValue();
53164 SDValue N00In = N00Elt.getOperand(0);
53165 SDValue N01In = N01Elt.getOperand(0);
53166 SDValue N10In = N10Elt.getOperand(0);
53167 SDValue N11In = N11Elt.getOperand(0);
53168 // The first time we find an input, capture it.
53169 if (!ZExtIn) {
53170 ZExtIn = N00In;
53171 SExtIn = N01In;
53172 }
53173 if (ZExtIn != N00In || SExtIn != N01In ||
53174 ZExtIn != N10In || SExtIn != N11In)
53175 return SDValue();
53176 }
53177
53178 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
53179 ArrayRef<SDValue> Ops) {
53180 // Shrink by adding truncate nodes and let DAGCombine fold with the
53181 // sources.
53182 EVT InVT = Ops[0].getValueType();
53183 assert(InVT.getScalarType() == MVT::i8 &&
53184 "Unexpected scalar element type");
53185 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
53186 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
53187 InVT.getVectorNumElements() / 2);
53188 return DAG.getNode(X86ISD::VPMADDUBSW, DL, ResVT, Ops[0], Ops[1]);
53189 };
53190 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { ZExtIn, SExtIn },
53191 PMADDBuilder);
53192}
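// Illustrative example (sketch): with A holding unsigned bytes and B signed
// bytes, each 16-bit output lane i of the pattern matched above computes
//   ssat16(zext(A[2*i]) * sext(B[2*i]) + zext(A[2*i+1]) * sext(B[2*i+1]))
// which is exactly what one lane of X86ISD::VPMADDUBSW A, B produces.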
53193
53194static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
53195 const X86Subtarget &Subtarget) {
53196 EVT VT = N->getValueType(0);
53197 SDValue Src = N->getOperand(0);
53198 SDLoc DL(N);
53199
53200 // Attempt to pre-truncate inputs to arithmetic ops instead.
53201 if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
53202 return V;
53203
53204 // Try to detect AVG pattern first.
53205 if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
53206 return Avg;
53207
53208 // Try to detect PMADD
53209 if (SDValue PMAdd = detectPMADDUBSW(Src, VT, DAG, Subtarget, DL))
53210 return PMAdd;
53211
53212 // Try to combine truncation with signed/unsigned saturation.
53213 if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget))
53214 return Val;
53215
53216 // Try to combine PMULHUW/PMULHW for vXi16.
53217 if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget))
53218 return V;
53219
53220 // The bitcast source is a direct mmx result.
53221 // Detect a bitcast from x86mmx to i32.
53222 if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
53223 SDValue BCSrc = Src.getOperand(0);
53224 if (BCSrc.getValueType() == MVT::x86mmx)
53225 return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
53226 }
53227
53228 // Try to truncate extended sign/zero bits with PACKSS/PACKUS.
53229 if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
53230 return V;
53231
53232 return combineVectorTruncation(N, DAG, Subtarget);
53233}
53234
53235static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG,
53236 TargetLowering::DAGCombinerInfo &DCI) {
53237 EVT VT = N->getValueType(0);
53238 SDValue In = N->getOperand(0);
53239 SDLoc DL(N);
53240
53241 if (SDValue SSatVal = detectSSatPattern(In, VT))
53242 return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
53243 if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL))
53244 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
53245
53246 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53247 APInt DemandedMask(APInt::getAllOnes(VT.getScalarSizeInBits()));
53248 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
53249 return SDValue(N, 0);
53250
53251 return SDValue();
53252}
53253
53254/// Returns the negated value if the node \p N flips sign of FP value.
53255///
53256/// FP-negation node may have different forms: FNEG(x), FXOR (x, 0x80000000)
53257/// or FSUB(0, x)
53258/// AVX512F does not have FXOR, so FNEG is lowered as
53259/// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
53260 /// In this case we go through all bitcasts.
53261/// This also recognizes splat of a negated value and returns the splat of that
53262/// value.
53263static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) {
53264 if (N->getOpcode() == ISD::FNEG)
53265 return N->getOperand(0);
53266
53267 // Don't recurse exponentially.
53268 if (Depth > SelectionDAG::MaxRecursionDepth)
53269 return SDValue();
53270
53271 unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits();
53272
53273 SDValue Op = peekThroughBitcasts(SDValue(N, 0));
53274 EVT VT = Op->getValueType(0);
53275
53276 // Make sure the element size doesn't change.
53277 if (VT.getScalarSizeInBits() != ScalarSize)
53278 return SDValue();
53279
53280 unsigned Opc = Op.getOpcode();
53281 switch (Opc) {
53282 case ISD::VECTOR_SHUFFLE: {
53283 // For a VECTOR_SHUFFLE(VEC1, VEC2), if the VEC2 is undef, then the negate
53284 // of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here.
53285 if (!Op.getOperand(1).isUndef())
53286 return SDValue();
53287 if (SDValue NegOp0 = isFNEG(DAG, Op.getOperand(0).getNode(), Depth + 1))
53288 if (NegOp0.getValueType() == VT) // FIXME: Can we do better?
53289 return DAG.getVectorShuffle(VT, SDLoc(Op), NegOp0, DAG.getUNDEF(VT),
53290 cast<ShuffleVectorSDNode>(Op)->getMask());
53291 break;
53292 }
53293 case ISD::INSERT_VECTOR_ELT: {
53294 // Negate of INSERT_VECTOR_ELT(UNDEF, V, INDEX) is INSERT_VECTOR_ELT(UNDEF,
53295 // -V, INDEX).
53296 SDValue InsVector = Op.getOperand(0);
53297 SDValue InsVal = Op.getOperand(1);
53298 if (!InsVector.isUndef())
53299 return SDValue();
53300 if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode(), Depth + 1))
53301 if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME
53302 return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector,
53303 NegInsVal, Op.getOperand(2));
53304 break;
53305 }
53306 case ISD::FSUB:
53307 case ISD::XOR:
53308 case X86ISD::FXOR: {
53309 SDValue Op1 = Op.getOperand(1);
53310 SDValue Op0 = Op.getOperand(0);
53311
53312 // For XOR and FXOR, we want to check if constant
53313 // bits of Op1 are sign bit masks. For FSUB, we
53314 // have to check if constant bits of Op0 are sign
53315 // bit masks and hence we swap the operands.
53316 if (Opc == ISD::FSUB)
53317 std::swap(Op0, Op1);
53318
53319 APInt UndefElts;
53320 SmallVector<APInt, 16> EltBits;
53321 // Extract constant bits and see if they are all
53322 // sign bit masks. Ignore the undef elements.
53323 if (getTargetConstantBitsFromNode(Op1, ScalarSize, UndefElts, EltBits,
53324 /* AllowWholeUndefs */ true,
53325 /* AllowPartialUndefs */ false)) {
53326 for (unsigned I = 0, E = EltBits.size(); I < E; I++)
53327 if (!UndefElts[I] && !EltBits[I].isSignMask())
53328 return SDValue();
53329
53330 // Only allow bitcast from correctly-sized constant.
53331 Op0 = peekThroughBitcasts(Op0);
53332 if (Op0.getScalarValueSizeInBits() == ScalarSize)
53333 return Op0;
53334 }
53335 break;
53336 } // case
53337 } // switch
53338
53339 return SDValue();
53340}
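// Illustrative examples (sketch) of forms recognized above, both returning x:
//   fneg x
//   bitcast (xor (bitcast x to v4i32), splat(0x80000000)) to v4f32
// i.e. an explicit FNEG, or an XOR with a sign-bit mask hidden behind
// bitcasts.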
53341
53342static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
53343 bool NegRes) {
53344 if (NegMul) {
53345 switch (Opcode) {
53346 default: llvm_unreachable("Unexpected opcode");
53347 case ISD::FMA: Opcode = X86ISD::FNMADD; break;
53348 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FNMADD; break;
53349 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break;
53350 case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break;
53351 case X86ISD::STRICT_FMSUB: Opcode = X86ISD::STRICT_FNMSUB; break;
53352 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break;
53353 case X86ISD::FNMADD: Opcode = ISD::FMA; break;
53354 case X86ISD::STRICT_FNMADD: Opcode = ISD::STRICT_FMA; break;
53355 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break;
53356 case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break;
53357 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FMSUB; break;
53358 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break;
53359 }
53360 }
53361
53362 if (NegAcc) {
53363 switch (Opcode) {
53364 default: llvm_unreachable("Unexpected opcode");
53365 case ISD::FMA: Opcode = X86ISD::FMSUB; break;
53366 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FMSUB; break;
53367 case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
53368 case X86ISD::FMSUB: Opcode = ISD::FMA; break;
53369 case X86ISD::STRICT_FMSUB: Opcode = ISD::STRICT_FMA; break;
53370 case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
53371 case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break;
53372 case X86ISD::STRICT_FNMADD: Opcode = X86ISD::STRICT_FNMSUB; break;
53373 case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
53374 case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break;
53375 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FNMADD; break;
53376 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
53377 case X86ISD::FMADDSUB: Opcode = X86ISD::FMSUBADD; break;
53378 case X86ISD::FMADDSUB_RND: Opcode = X86ISD::FMSUBADD_RND; break;
53379 case X86ISD::FMSUBADD: Opcode = X86ISD::FMADDSUB; break;
53380 case X86ISD::FMSUBADD_RND: Opcode = X86ISD::FMADDSUB_RND; break;
53381 }
53382 }
53383
53384 if (NegRes) {
53385 switch (Opcode) {
53386 // For accuracy reasons, we never combine fneg and fma under strict FP.
53387 default: llvm_unreachable("Unexpected opcode");
53388 case ISD::FMA: Opcode = X86ISD::FNMSUB; break;
53389 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
53390 case X86ISD::FMSUB: Opcode = X86ISD::FNMADD; break;
53391 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
53392 case X86ISD::FNMADD: Opcode = X86ISD::FMSUB; break;
53393 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
53394 case X86ISD::FNMSUB: Opcode = ISD::FMA; break;
53395 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
53396 }
53397 }
53398
53399 return Opcode;
53400}
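// Usage sketch (values follow the switch tables above):
//   negateFMAOpcode(ISD::FMA, /*NegMul=*/true, /*NegAcc=*/false, /*NegRes=*/false)
//       returns X86ISD::FNMADD
//   negateFMAOpcode(ISD::FMA, /*NegMul=*/false, /*NegAcc=*/true, /*NegRes=*/false)
//       returns X86ISD::FMSUB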
53401
53402/// Do target-specific dag combines on floating point negations.
53403static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
53404 TargetLowering::DAGCombinerInfo &DCI,
53405 const X86Subtarget &Subtarget) {
53406 EVT OrigVT = N->getValueType(0);
53407 SDValue Arg = isFNEG(DAG, N);
53408 if (!Arg)
53409 return SDValue();
53410
53411 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53412 EVT VT = Arg.getValueType();
53413 EVT SVT = VT.getScalarType();
53414 SDLoc DL(N);
53415
53416 // Let legalize expand this if it isn't a legal type yet.
53417 if (!TLI.isTypeLegal(VT))
53418 return SDValue();
53419
53420 // If we're negating a FMUL node on a target with FMA, then we can avoid the
53421 // use of a constant by performing (-0 - A*B) instead.
53422 // FIXME: Check rounding control flags as well once it becomes available.
53423 if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
53424 Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
53425 SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
53426 SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
53427 Arg.getOperand(1), Zero);
53428 return DAG.getBitcast(OrigVT, NewNode);
53429 }
53430
53431 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
53432 bool LegalOperations = !DCI.isBeforeLegalizeOps();
53433 if (SDValue NegArg =
53434 TLI.getNegatedExpression(Arg, DAG, LegalOperations, CodeSize))
53435 return DAG.getBitcast(OrigVT, NegArg);
53436
53437 return SDValue();
53438}
53439
53440SDValue X86TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
53441 bool LegalOperations,
53442 bool ForCodeSize,
53443 NegatibleCost &Cost,
53444 unsigned Depth) const {
53445 // fneg patterns are removable even if they have multiple uses.
53446 if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth)) {
53447 Cost = NegatibleCost::Cheaper;
53448 return DAG.getBitcast(Op.getValueType(), Arg);
53449 }
53450
53451 EVT VT = Op.getValueType();
53452 EVT SVT = VT.getScalarType();
53453 unsigned Opc = Op.getOpcode();
53454 SDNodeFlags Flags = Op.getNode()->getFlags();
53455 switch (Opc) {
53456 case ISD::FMA:
53457 case X86ISD::FMSUB:
53458 case X86ISD::FNMADD:
53459 case X86ISD::FNMSUB:
53460 case X86ISD::FMADD_RND:
53461 case X86ISD::FMSUB_RND:
53462 case X86ISD::FNMADD_RND:
53463 case X86ISD::FNMSUB_RND: {
53464 if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) ||
53465 !(SVT == MVT::f32 || SVT == MVT::f64) ||
53466 !isOperationLegal(ISD::FMA, VT))
53467 break;
53468
53469 // Don't fold (fneg (fma (fneg x), y, (fneg z))) to (fma x, y, z)
53470 // if it may have signed zeros.
53471 if (!Flags.hasNoSignedZeros())
53472 break;
53473
53474 // This is always negatible for free but we might be able to remove some
53475 // extra operand negations as well.
53476 SmallVector<SDValue, 4> NewOps(Op.getNumOperands(), SDValue());
53477 for (int i = 0; i != 3; ++i)
53478 NewOps[i] = getCheaperNegatedExpression(
53479 Op.getOperand(i), DAG, LegalOperations, ForCodeSize, Depth + 1);
53480
53481 bool NegA = !!NewOps[0];
53482 bool NegB = !!NewOps[1];
53483 bool NegC = !!NewOps[2];
53484 unsigned NewOpc = negateFMAOpcode(Opc, NegA != NegB, NegC, true);
53485
53486 Cost = (NegA || NegB || NegC) ? NegatibleCost::Cheaper
53487 : NegatibleCost::Neutral;
53488
53489 // Fill in the non-negated ops with the original values.
53490 for (int i = 0, e = Op.getNumOperands(); i != e; ++i)
53491 if (!NewOps[i])
53492 NewOps[i] = Op.getOperand(i);
53493 return DAG.getNode(NewOpc, SDLoc(Op), VT, NewOps);
53494 }
53495 case X86ISD::FRCP:
53496 if (SDValue NegOp0 =
53497 getNegatedExpression(Op.getOperand(0), DAG, LegalOperations,
53498 ForCodeSize, Cost, Depth + 1))
53499 return DAG.getNode(Opc, SDLoc(Op), VT, NegOp0);
53500 break;
53501 }
53502
53503 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
53504 ForCodeSize, Cost, Depth);
53505}
53506
53507static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
53508 const X86Subtarget &Subtarget) {
53509 MVT VT = N->getSimpleValueType(0);
53510 // If we have integer vector types available, use the integer opcodes.
53511 if (!VT.isVector() || !Subtarget.hasSSE2())
53512 return SDValue();
53513
53514 SDLoc dl(N);
53515
53516 unsigned IntBits = VT.getScalarSizeInBits();
53517 MVT IntSVT = MVT::getIntegerVT(IntBits);
53518 MVT IntVT = MVT::getVectorVT(IntSVT, VT.getSizeInBits() / IntBits);
53519
53520 SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
53521 SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
53522 unsigned IntOpcode;
53523 switch (N->getOpcode()) {
53524 default: llvm_unreachable("Unexpected FP logic op");
53525 case X86ISD::FOR: IntOpcode = ISD::OR; break;
53526 case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
53527 case X86ISD::FAND: IntOpcode = ISD::AND; break;
53528 case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
53529 }
53530 SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
53531 return DAG.getBitcast(VT, IntOp);
53532}
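// Illustrative example (sketch): with SSE2 available,
//   t = X86ISD::FXOR a:v4f32, b:v4f32
// is rewritten as
//   t = bitcast (xor (bitcast a to v4i32), (bitcast b to v4i32)) to v4f32
// so the integer logic opcode can be used.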
53533
53534
53535/// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
53536static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) {
53537 if (N->getOpcode() != ISD::XOR)
53538 return SDValue();
53539
53540 SDValue LHS = N->getOperand(0);
53541 if (!isOneConstant(N->getOperand(1)) || LHS->getOpcode() != X86ISD::SETCC)
53542 return SDValue();
53543
53544 X86::CondCode NewCC = X86::GetOppositeBranchCondition(
53545 X86::CondCode(LHS->getConstantOperandVal(0)));
53546 SDLoc DL(N);
53547 return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
53548}
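// Illustrative example (sketch):
//   t = xor (X86ISD::SETCC COND_E, eflags), 1
// becomes
//   t = X86ISD::SETCC COND_NE, eflags
// i.e. the condition code is inverted instead of XOR-ing the result bit.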
53549
53550static SDValue combineXorSubCTLZ(SDNode *N, SelectionDAG &DAG,
53551 const X86Subtarget &Subtarget) {
53552 assert((N->getOpcode() == ISD::XOR || N->getOpcode() == ISD::SUB) &&
53553 "Invalid opcode for combing with CTLZ");
53554 if (Subtarget.hasFastLZCNT())
53555 return SDValue();
53556
53557 EVT VT = N->getValueType(0);
53558 if (VT != MVT::i8 && VT != MVT::i16 && VT != MVT::i32 &&
53559 (VT != MVT::i64 || !Subtarget.is64Bit()))
53560 return SDValue();
53561
53562 SDValue N0 = N->getOperand(0);
53563 SDValue N1 = N->getOperand(1);
53564
53565 if (N0.getOpcode() != ISD::CTLZ_ZERO_UNDEF &&
53566 N1.getOpcode() != ISD::CTLZ_ZERO_UNDEF)
53567 return SDValue();
53568
53569 SDValue OpCTLZ;
53570 SDValue OpSizeTM1;
53571
53572 if (N1.getOpcode() == ISD::CTLZ_ZERO_UNDEF) {
53573 OpCTLZ = N1;
53574 OpSizeTM1 = N0;
53575 } else if (N->getOpcode() == ISD::SUB) {
53576 return SDValue();
53577 } else {
53578 OpCTLZ = N0;
53579 OpSizeTM1 = N1;
53580 }
53581
53582 if (!OpCTLZ.hasOneUse())
53583 return SDValue();
53584 auto *C = dyn_cast<ConstantSDNode>(OpSizeTM1);
53585 if (!C)
53586 return SDValue();
53587
53588 if (C->getZExtValue() != uint64_t(OpCTLZ.getValueSizeInBits() - 1))
53589 return SDValue();
53590 SDLoc DL(N);
53591 EVT OpVT = VT;
53592 SDValue Op = OpCTLZ.getOperand(0);
53593 if (VT == MVT::i8) {
53594 // Zero extend to i32 since there is no i8 BSR.
53595 OpVT = MVT::i32;
53596 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, OpVT, Op);
53597 }
53598
53599 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
53600 Op = DAG.getNode(X86ISD::BSR, DL, VTs, Op);
53601 if (VT == MVT::i8)
53602 Op = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Op);
53603
53604 return Op;
53605}
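
The fold above matches (bitwidth-1) ^ ctlz_zero_undef(x), or the equivalent subtract, which is just the index of the highest set bit, i.e. what BSR already computes. A minimal scalar sketch of that identity, assuming C++20 <bit> and a hypothetical helper name; illustrative only, not part of this file:

#include <bit>
#include <cstdint>

// For x != 0, countl_zero(x) is in [0, 31], so 31 ^ countl_zero(x) equals
// 31 - countl_zero(x): the position of the most significant set bit, which is
// what the x86 BSR instruction produces.
unsigned highestSetBitIndex(uint32_t x) {
  return 31u ^ static_cast<unsigned>(std::countl_zero(x)); // assumes x != 0
}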
53606
53607static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
53608 TargetLowering::DAGCombinerInfo &DCI,
53609 const X86Subtarget &Subtarget) {
53610 SDValue N0 = N->getOperand(0);
53611 SDValue N1 = N->getOperand(1);
53612 EVT VT = N->getValueType(0);
53613
53614 // If this is SSE1-only, convert to FXOR to avoid scalarization.
53615 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
53616 return DAG.getBitcast(MVT::v4i32,
53617 DAG.getNode(X86ISD::FXOR, SDLoc(N), MVT::v4f32,
53618 DAG.getBitcast(MVT::v4f32, N0),
53619 DAG.getBitcast(MVT::v4f32, N1)));
53620 }
53621
53622 if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
53623 return Cmp;
53624
53625 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
53626 return R;
53627
53628 if (SDValue R = combineBitOpWithShift(N, DAG))
53629 return R;
53630
53631 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
53632 return FPLogic;
53633
53634 if (SDValue R = combineXorSubCTLZ(N, DAG, Subtarget))
53635 return R;
53636
53637 if (DCI.isBeforeLegalizeOps())
53638 return SDValue();
53639
53640 if (SDValue SetCC = foldXor1SetCC(N, DAG))
53641 return SetCC;
53642
53643 if (SDValue R = combineOrXorWithSETCC(N, N0, N1, DAG))
53644 return R;
53645
53646 if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
53647 return RV;
53648
53649 // Fold not(iX bitcast(vXi1)) -> (iX bitcast(not(vec))) for legal boolvecs.
53650 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53651 if (llvm::isAllOnesConstant(N1) && N0.getOpcode() == ISD::BITCAST &&
53652 N0.getOperand(0).getValueType().isVector() &&
53653 N0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
53654 TLI.isTypeLegal(N0.getOperand(0).getValueType()) && N0.hasOneUse()) {
53655 return DAG.getBitcast(VT, DAG.getNOT(SDLoc(N), N0.getOperand(0),
53656 N0.getOperand(0).getValueType()));
53657 }
53658
53659 // Handle AVX512 mask widening.
53660 // Fold not(insert_subvector(undef,sub)) -> insert_subvector(undef,not(sub))
53661 if (ISD::isBuildVectorAllOnes(N1.getNode()) && VT.isVector() &&
53662 VT.getVectorElementType() == MVT::i1 &&
53663 N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.getOperand(0).isUndef() &&
53664 TLI.isTypeLegal(N0.getOperand(1).getValueType())) {
53665 return DAG.getNode(
53666 ISD::INSERT_SUBVECTOR, SDLoc(N), VT, N0.getOperand(0),
53667 DAG.getNOT(SDLoc(N), N0.getOperand(1), N0.getOperand(1).getValueType()),
53668 N0.getOperand(2));
53669 }
53670
53671 // Fold xor(zext(xor(x,c1)),c2) -> xor(zext(x),xor(zext(c1),c2))
53672 // Fold xor(truncate(xor(x,c1)),c2) -> xor(truncate(x),xor(truncate(c1),c2))
53673 // TODO: Under what circumstances could this be performed in DAGCombine?
53674 if ((N0.getOpcode() == ISD::TRUNCATE || N0.getOpcode() == ISD::ZERO_EXTEND) &&
53675 N0.getOperand(0).getOpcode() == N->getOpcode()) {
53676 SDValue TruncExtSrc = N0.getOperand(0);
53677 auto *N1C = dyn_cast<ConstantSDNode>(N1);
53678 auto *N001C = dyn_cast<ConstantSDNode>(TruncExtSrc.getOperand(1));
53679 if (N1C && !N1C->isOpaque() && N001C && !N001C->isOpaque()) {
53680 SDLoc DL(N);
53681 SDValue LHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(0), DL, VT);
53682 SDValue RHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(1), DL, VT);
53683 return DAG.getNode(ISD::XOR, DL, VT, LHS,
53684 DAG.getNode(ISD::XOR, DL, VT, RHS, N1));
53685 }
53686 }
53687
53688 if (SDValue R = combineBMILogicOp(N, DAG, Subtarget))
53689 return R;
53690
53691 return combineFneg(N, DAG, DCI, Subtarget);
53692}
53693
53694static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG,
53695 TargetLowering::DAGCombinerInfo &DCI,
53696 const X86Subtarget &Subtarget) {
53697 EVT VT = N->getValueType(0);
53698 unsigned NumBits = VT.getSizeInBits();
53699
53700 // TODO - Constant Folding.
53701
53702 // Simplify the inputs.
53703 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53704 APInt DemandedMask(APInt::getAllOnes(NumBits));
53705 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
53706 return SDValue(N, 0);
53707
53708 return SDValue();
53709}
53710
53711static bool isNullFPScalarOrVectorConst(SDValue V) {
53712 return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
53713}
53714
53715/// If a value is a scalar FP zero or a vector FP zero (potentially including
53716/// undefined elements), return a zero constant that may be used to fold away
53717/// that value. In the case of a vector, the returned constant will not contain
53718/// undefined elements even if the input parameter does. This makes it suitable
53719/// to be used as a replacement operand with operations (eg, bitwise-and) where
53720/// an undef should not propagate.
53721static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
53722 const X86Subtarget &Subtarget) {
53723 if (!isNullFPScalarOrVectorConst(V))
53724 return SDValue();
53725
53726 if (V.getValueType().isVector())
53727 return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
53728
53729 return V;
53730}
53731
53732static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
53733 const X86Subtarget &Subtarget) {
53734 SDValue N0 = N->getOperand(0);
53735 SDValue N1 = N->getOperand(1);
53736 EVT VT = N->getValueType(0);
53737 SDLoc DL(N);
53738
53739 // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
53740 if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
53741 (VT == MVT::f64 && Subtarget.hasSSE2()) ||
53742 (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
53743 return SDValue();
53744
53745 auto isAllOnesConstantFP = [](SDValue V) {
53746 if (V.getSimpleValueType().isVector())
53747 return ISD::isBuildVectorAllOnes(V.getNode());
53748 auto *C = dyn_cast<ConstantFPSDNode>(V);
53749 return C && C->getConstantFPValue()->isAllOnesValue();
53750 };
53751
53752 // fand (fxor X, -1), Y --> fandn X, Y
53753 if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
53754 return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
53755
53756 // fand X, (fxor Y, -1) --> fandn Y, X
53757 if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
53758 return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
53759
53760 return SDValue();
53761}
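
The FANDN pattern matched above is the bitwise and-not: XOR with an all-ones constant is a bitwise NOT, so fand(fxor(X, -1), Y) keeps exactly the bits of Y that are clear in X. A scalar sketch assuming C++20 std::bit_cast; the function name is hypothetical and this is illustrative only, not part of this file:

#include <bit>
#include <cstdint>

// Scalar model of X86ISD::FANDN: (~X) & Y computed on the raw float bits.
float fandnScalar(float X, float Y) {
  uint32_t ix = std::bit_cast<uint32_t>(X);
  uint32_t iy = std::bit_cast<uint32_t>(Y);
  return std::bit_cast<float>(~ix & iy);
}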
53762
53763/// Do target-specific dag combines on X86ISD::FAND nodes.
53764static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
53765 const X86Subtarget &Subtarget) {
53766 // FAND(0.0, x) -> 0.0
53767 if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
53768 return V;
53769
53770 // FAND(x, 0.0) -> 0.0
53771 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
53772 return V;
53773
53774 if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
53775 return V;
53776
53777 return lowerX86FPLogicOp(N, DAG, Subtarget);
53778}
53779
53780/// Do target-specific dag combines on X86ISD::FANDN nodes.
53781static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
53782 const X86Subtarget &Subtarget) {
53783 // FANDN(0.0, x) -> x
53784 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
53785 return N->getOperand(1);
53786
53787 // FANDN(x, 0.0) -> 0.0
53788 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
53789 return V;
53790
53791 return lowerX86FPLogicOp(N, DAG, Subtarget);
53792}
53793
53794/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
53795static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
53796 TargetLowering::DAGCombinerInfo &DCI,
53797 const X86Subtarget &Subtarget) {
53798 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
53799
53800 // F[X]OR(0.0, x) -> x
53801 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
53802 return N->getOperand(1);
53803
53804 // F[X]OR(x, 0.0) -> x
53805 if (isNullFPScalarOrVectorConst(N->getOperand(1)))
53806 return N->getOperand(0);
53807
53808 if (SDValue NewVal = combineFneg(N, DAG, DCI, Subtarget))
53809 return NewVal;
53810
53811 return lowerX86FPLogicOp(N, DAG, Subtarget);
53812}
53813
53814/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
53815static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
53816 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
53817
53818 // FMIN/FMAX are commutative if no NaNs and no negative zeros are allowed.
53819 if (!DAG.getTarget().Options.NoNaNsFPMath ||
53820 !DAG.getTarget().Options.NoSignedZerosFPMath)
53821 return SDValue();
53822
53823 // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
53824 // into FMINC and FMAXC, which are commutative operations.
53825 unsigned NewOp = 0;
53826 switch (N->getOpcode()) {
53827 default: llvm_unreachable("unknown opcode");
53828 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
53829 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
53830 }
53831
53832 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
53833 N->getOperand(0), N->getOperand(1));
53834}
53835
53836static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
53837 const X86Subtarget &Subtarget) {
53838 EVT VT = N->getValueType(0);
53839 if (Subtarget.useSoftFloat() || isSoftFP16(VT, Subtarget))
53840 return SDValue();
53841
53842 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53843
53844 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
53845 (Subtarget.hasSSE2() && VT == MVT::f64) ||
53846 (Subtarget.hasFP16() && VT == MVT::f16) ||
53847 (VT.isVector() && TLI.isTypeLegal(VT))))
53848 return SDValue();
53849
53850 SDValue Op0 = N->getOperand(0);
53851 SDValue Op1 = N->getOperand(1);
53852 SDLoc DL(N);
53853 auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
53854
53855 // If we don't have to respect NaN inputs, this is a direct translation to x86
53856 // min/max instructions.
53857 if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
53858 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
53859
53860 // If one of the operands is known non-NaN use the native min/max instructions
53861 // with the non-NaN input as second operand.
53862 if (DAG.isKnownNeverNaN(Op1))
53863 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
53864 if (DAG.isKnownNeverNaN(Op0))
53865 return DAG.getNode(MinMaxOp, DL, VT, Op1, Op0, N->getFlags());
53866
53867 // If we have to respect NaN inputs, this takes at least 3 instructions.
53868 // Favor a library call when operating on a scalar and minimizing code size.
53869 if (!VT.isVector() && DAG.getMachineFunction().getFunction().hasMinSize())
53870 return SDValue();
53871
53872 EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
53873 VT);
53874
53875 // There are 4 possibilities involving NaN inputs, and these are the required
53876 // outputs:
53877 // Op1
53878 // Num NaN
53879 // ----------------
53880 // Num | Max | Op0 |
53881 // Op0 ----------------
53882 // NaN | Op1 | NaN |
53883 // ----------------
53884 //
53885 // The SSE FP max/min instructions were not designed for this case, but rather
53886 // to implement:
53887 // Min = Op1 < Op0 ? Op1 : Op0
53888 // Max = Op1 > Op0 ? Op1 : Op0
53889 //
53890 // So they always return Op0 if either input is a NaN. However, we can still
53891 // use those instructions for fmaxnum by selecting away a NaN input.
53892
53893 // If either operand is NaN, the 2nd source operand (Op0) is passed through.
53894 SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
53895 SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
53896
53897 // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
53898 // are NaN, the NaN value of Op1 is the result.
53899 return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
53900}
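
The table and comments above describe how an SSE max, which passes Op0 through whenever an input is NaN, plus one unordered compare and select yields fmaxnum-style behaviour. A scalar model of the emitted sequence; the helper name is hypothetical and this is illustrative only, not the DAG code itself:

#include <cmath>

// Models DAG.getNode(FMAX, Op1, Op0) followed by the SETUO compare + select:
// the C++ '>' is false when either operand is NaN, so Op0 passes through, and
// the final select replaces a NaN Op0 with Op1 (if both are NaN, Op1 wins).
float fmaxnumViaSSEMax(float Op0, float Op1) {
  float MinOrMax = (Op1 > Op0) ? Op1 : Op0; // MAXSS-like: Op0 on NaN
  return std::isnan(Op0) ? Op1 : MinOrMax;  // select away a NaN Op0
}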
53901
53902static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
53903 TargetLowering::DAGCombinerInfo &DCI) {
53904 EVT VT = N->getValueType(0);
53905 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53906
53907 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
53908 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
53909 return SDValue(N, 0);
53910
53911 // Convert a full vector load into vzload when not all bits are needed.
53912 SDValue In = N->getOperand(0);
53913 MVT InVT = In.getSimpleValueType();
53914 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
53915 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
53916 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
53917 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
53918 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
53919 MVT MemVT = MVT::getIntegerVT(NumBits);
53920 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
53921 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
53922 SDLoc dl(N);
53923 SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
53924 DAG.getBitcast(InVT, VZLoad));
53925 DCI.CombineTo(N, Convert);
53926 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
53927 DCI.recursivelyDeleteUnusedNodes(LN);
53928 return SDValue(N, 0);
53929 }
53930 }
53931
53932 return SDValue();
53933}
53934
53935static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
53936 TargetLowering::DAGCombinerInfo &DCI) {
53937 bool IsStrict = N->isTargetStrictFPOpcode();
53938 EVT VT = N->getValueType(0);
53939
53940 // Convert a full vector load into vzload when not all bits are needed.
53941 SDValue In = N->getOperand(IsStrict ? 1 : 0);
53942 MVT InVT = In.getSimpleValueType();
53943 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
53944 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
53945 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
53946 LoadSDNode *LN = cast<LoadSDNode>(In);
53947 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
53948 MVT MemVT = MVT::getFloatingPointVT(NumBits);
53949 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
53950 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
53951 SDLoc dl(N);
53952 if (IsStrict) {
53953 SDValue Convert =
53954 DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
53955 {N->getOperand(0), DAG.getBitcast(InVT, VZLoad)});
53956 DCI.CombineTo(N, Convert, Convert.getValue(1));
53957 } else {
53958 SDValue Convert =
53959 DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(InVT, VZLoad));
53960 DCI.CombineTo(N, Convert);
53961 }
53962 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
53963 DCI.recursivelyDeleteUnusedNodes(LN);
53964 return SDValue(N, 0);
53965 }
53966 }
53967
53968 return SDValue();
53969}
53970
53971/// Do target-specific dag combines on X86ISD::ANDNP nodes.
53972static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
53973 TargetLowering::DAGCombinerInfo &DCI,
53974 const X86Subtarget &Subtarget) {
53975 SDValue N0 = N->getOperand(0);
53976 SDValue N1 = N->getOperand(1);
53977 MVT VT = N->getSimpleValueType(0);
53978 int NumElts = VT.getVectorNumElements();
53979 unsigned EltSizeInBits = VT.getScalarSizeInBits();
53980
53981 // ANDNP(undef, x) -> 0
53982 // ANDNP(x, undef) -> 0
53983 if (N0.isUndef() || N1.isUndef())
53984 return DAG.getConstant(0, SDLoc(N), VT);
53985
53986 // ANDNP(0, x) -> x
53987 if (ISD::isBuildVectorAllZeros(N0.getNode()))
53988 return N1;
53989
53990 // ANDNP(x, 0) -> 0
53991 if (ISD::isBuildVectorAllZeros(N1.getNode()))
53992 return DAG.getConstant(0, SDLoc(N), VT);
53993
53994 // Turn ANDNP back to AND if input is inverted.
53995 if (SDValue Not = IsNOT(N0, DAG))
53996 return DAG.getNode(ISD::AND, SDLoc(N), VT, DAG.getBitcast(VT, Not), N1);
53997
53998 // Constant Folding
53999 APInt Undefs0, Undefs1;
54000 SmallVector<APInt> EltBits0, EltBits1;
54001 if (getTargetConstantBitsFromNode(N0, EltSizeInBits, Undefs0, EltBits0)) {
54002 SDLoc DL(N);
54003 if (getTargetConstantBitsFromNode(N1, EltSizeInBits, Undefs1, EltBits1)) {
54004 SmallVector<APInt> ResultBits;
54005 for (int I = 0; I != NumElts; ++I)
54006 ResultBits.push_back(~EltBits0[I] & EltBits1[I]);
54007 return getConstVector(ResultBits, VT, DAG, DL);
54008 }
54009
54010 // Constant fold NOT(N0) to allow us to use AND.
54011 // Ensure this is only performed if we can confirm that the bitcasted source
54012 // has one use, to prevent an infinite loop with canonicalizeBitSelect.
54013 if (N0->hasOneUse()) {
54014 SDValue BC0 = peekThroughOneUseBitcasts(N0);
54015 if (BC0.getOpcode() != ISD::BITCAST) {
54016 for (APInt &Elt : EltBits0)
54017 Elt = ~Elt;
54018 SDValue Not = getConstVector(EltBits0, VT, DAG, DL);
54019 return DAG.getNode(ISD::AND, DL, VT, Not, N1);
54020 }
54021 }
54022 }
54023
54024 // Attempt to recursively combine a bitmask ANDNP with shuffles.
54025 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
54026 SDValue Op(N, 0);
54027 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
54028 return Res;
54029
54030 // If either operand is a constant mask, then only the elements that aren't
54031 // zero are actually demanded by the other operand.
54032 auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
54033 APInt UndefElts;
54034 SmallVector<APInt> EltBits;
54035 APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
54036 APInt DemandedElts = APInt::getAllOnes(NumElts);
54037 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
54038 EltBits)) {
54039 DemandedBits.clearAllBits();
54040 DemandedElts.clearAllBits();
54041 for (int I = 0; I != NumElts; ++I) {
54042 if (UndefElts[I]) {
54043 // We can't assume an undef src element gives an undef dst - the
54044 // other src might be zero.
54045 DemandedBits.setAllBits();
54046 DemandedElts.setBit(I);
54047 } else if ((Invert && !EltBits[I].isAllOnes()) ||
54048 (!Invert && !EltBits[I].isZero())) {
54049 DemandedBits |= Invert ? ~EltBits[I] : EltBits[I];
54050 DemandedElts.setBit(I);
54051 }
54052 }
54053 }
54054 return std::make_pair(DemandedBits, DemandedElts);
54055 };
54056 APInt Bits0, Elts0;
54057 APInt Bits1, Elts1;
54058 std::tie(Bits0, Elts0) = GetDemandedMasks(N1);
54059 std::tie(Bits1, Elts1) = GetDemandedMasks(N0, true);
54060
54061 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54062 if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) ||
54063 TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) ||
54064 TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) ||
54065 TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) {
54066 if (N->getOpcode() != ISD::DELETED_NODE)
54067 DCI.AddToWorklist(N);
54068 return SDValue(N, 0);
54069 }
54070 }
54071
54072 return SDValue();
54073}
54074
54075static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
54076 TargetLowering::DAGCombinerInfo &DCI) {
54077 SDValue N1 = N->getOperand(1);
54078
54079 // BT ignores high bits in the bit index operand.
54080 unsigned BitWidth = N1.getValueSizeInBits();
54081 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
54082 if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(N1, DemandedMask, DCI)) {
54083 if (N->getOpcode() != ISD::DELETED_NODE)
54084 DCI.AddToWorklist(N);
54085 return SDValue(N, 0);
54086 }
54087
54088 return SDValue();
54089}
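
combineBT relies on the hardware behaviour that a register-form BT only consumes the low log2(width) bits of the bit-index operand, which is why only those bits are demanded above. A BT-like scalar sketch with a hypothetical helper name; illustrative only, not part of this file:

#include <cstdint>

// Bit test on a 32-bit value: the index is taken modulo 32, so simplifying
// the high bits of Idx cannot change the result.
bool testBit32(uint32_t Val, uint32_t Idx) {
  return (Val >> (Idx & 31u)) & 1u;
}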
54090
54091static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG,
54092 TargetLowering::DAGCombinerInfo &DCI) {
54093 bool IsStrict = N->getOpcode() == X86ISD::STRICT_CVTPH2PS;
54094 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
54095
54096 if (N->getValueType(0) == MVT::v4f32 && Src.getValueType() == MVT::v8i16) {
54097 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54098 APInt DemandedElts = APInt::getLowBitsSet(8, 4);
54099 if (TLI.SimplifyDemandedVectorElts(Src, DemandedElts, DCI)) {
54100 if (N->getOpcode() != ISD::DELETED_NODE)
54101 DCI.AddToWorklist(N);
54102 return SDValue(N, 0);
54103 }
54104
54105 // Convert a full vector load into vzload when not all bits are needed.
54106 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
54107 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(IsStrict ? 1 : 0));
54108 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::i64, MVT::v2i64, DAG)) {
54109 SDLoc dl(N);
54110 if (IsStrict) {
54111 SDValue Convert = DAG.getNode(
54112 N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
54113 {N->getOperand(0), DAG.getBitcast(MVT::v8i16, VZLoad)});
54114 DCI.CombineTo(N, Convert, Convert.getValue(1));
54115 } else {
54116 SDValue Convert = DAG.getNode(N->getOpcode(), dl, MVT::v4f32,
54117 DAG.getBitcast(MVT::v8i16, VZLoad));
54118 DCI.CombineTo(N, Convert);
54119 }
54120
54121 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
54122 DCI.recursivelyDeleteUnusedNodes(LN);
54123 return SDValue(N, 0);
54124 }
54125 }
54126 }
54127
54128 return SDValue();
54129}
54130
54131// Try to combine sext_in_reg of a cmov of constants by extending the constants.
54132static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
54133 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
54134
54135 EVT DstVT = N->getValueType(0);
54136
54137 SDValue N0 = N->getOperand(0);
54138 SDValue N1 = N->getOperand(1);
54139 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
54140
54141 if (ExtraVT != MVT::i8 && ExtraVT != MVT::i16)
54142 return SDValue();
54143
54144 // Look through single use any_extends / truncs.
54145 SDValue IntermediateBitwidthOp;
54146 if ((N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::TRUNCATE) &&
54147 N0.hasOneUse()) {
54148 IntermediateBitwidthOp = N0;
54149 N0 = N0.getOperand(0);
54150 }
54151
54152 // See if we have a single use cmov.
54153 if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse())
54154 return SDValue();
54155
54156 SDValue CMovOp0 = N0.getOperand(0);
54157 SDValue CMovOp1 = N0.getOperand(1);
54158
54159 // Make sure both operands are constants.
54160 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
54161 !isa<ConstantSDNode>(CMovOp1.getNode()))
54162 return SDValue();
54163
54164 SDLoc DL(N);
54165
54166 // If we looked through an any_extend/trunc above, add one to the constants.
54167 if (IntermediateBitwidthOp) {
54168 unsigned IntermediateOpc = IntermediateBitwidthOp.getOpcode();
54169 CMovOp0 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp0);
54170 CMovOp1 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp1);
54171 }
54172
54173 CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp0, N1);
54174 CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp1, N1);
54175
54176 EVT CMovVT = DstVT;
54177 // We do not want i16 CMOV's. Promote to i32 and truncate afterwards.
54178 if (DstVT == MVT::i16) {
54179 CMovVT = MVT::i32;
54180 CMovOp0 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp0);
54181 CMovOp1 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp1);
54182 }
54183
54184 SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, CMovVT, CMovOp0, CMovOp1,
54185 N0.getOperand(2), N0.getOperand(3));
54186
54187 if (CMovVT != DstVT)
54188 CMov = DAG.getNode(ISD::TRUNCATE, DL, DstVT, CMov);
54189
54190 return CMov;
54191}
54192
54193static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
54194 const X86Subtarget &Subtarget) {
54195 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
54196
54197 if (SDValue V = combineSextInRegCmov(N, DAG))
54198 return V;
54199
54200 EVT VT = N->getValueType(0);
54201 SDValue N0 = N->getOperand(0);
54202 SDValue N1 = N->getOperand(1);
54203 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
54204 SDLoc dl(N);
54205
54206 // SIGN_EXTEND_INREG to v4i64 is an expensive operation on both
54207 // SSE and AVX2 since there is no sign-extended shift right
54208 // operation on a vector with 64-bit elements.
54209 // (sext_in_reg (v4i64 anyext (v4i32 x)), ExtraVT) ->
54210 //   (v4i64 sext (v4i32 sext_in_reg (v4i32 x, ExtraVT)))
54211 if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
54212 N0.getOpcode() == ISD::SIGN_EXTEND)) {
54213 SDValue N00 = N0.getOperand(0);
54214
54215 // EXTLOAD has a better solution on AVX2:
54216 // it may be replaced with an X86ISD::VSEXT node.
54217 if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
54218 if (!ISD::isNormalLoad(N00.getNode()))
54219 return SDValue();
54220
54221 // Attempt to promote any comparison mask ops before moving the
54222 // SIGN_EXTEND_INREG in the way.
54223 if (SDValue Promote = PromoteMaskArithmetic(N0.getNode(), DAG, Subtarget))
54224 return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Promote, N1);
54225
54226 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
54227 SDValue Tmp =
54228 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, N00, N1);
54229 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
54230 }
54231 }
54232 return SDValue();
54233}
54234
54235/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
54236/// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
54237/// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
54238/// opportunities to combine math ops, use an LEA, or use a complex addressing
54239/// mode. This can eliminate extend, add, and shift instructions.
54240static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
54241 const X86Subtarget &Subtarget) {
54242 if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
54243 Ext->getOpcode() != ISD::ZERO_EXTEND)
54244 return SDValue();
54245
54246 // TODO: This should be valid for other integer types.
54247 EVT VT = Ext->getValueType(0);
54248 if (VT != MVT::i64)
54249 return SDValue();
54250
54251 SDValue Add = Ext->getOperand(0);
54252 if (Add.getOpcode() != ISD::ADD)
54253 return SDValue();
54254
54255 bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
54256 bool NSW = Add->getFlags().hasNoSignedWrap();
54257 bool NUW = Add->getFlags().hasNoUnsignedWrap();
54258
54259 // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
54260 // into the 'zext'.
54261 if ((Sext && !NSW) || (!Sext && !NUW))
54262 return SDValue();
54263
54264 // Having a constant operand to the 'add' ensures that we are not increasing
54265 // the instruction count because the constant is extended for free below.
54266 // A constant operand can also become the displacement field of an LEA.
54267 auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
54268 if (!AddOp1)
54269 return SDValue();
54270
54271 // Don't make the 'add' bigger if there's no hope of combining it with some
54272 // other 'add' or 'shl' instruction.
54273 // TODO: It may be profitable to generate simpler LEA instructions in place
54274 // of single 'add' instructions, but the cost model for selecting an LEA
54275 // currently has a high threshold.
54276 bool HasLEAPotential = false;
54277 for (auto *User : Ext->uses()) {
54278 if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
54279 HasLEAPotential = true;
54280 break;
54281 }
54282 }
54283 if (!HasLEAPotential)
54284 return SDValue();
54285
54286 // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
54287 int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();
54288 SDValue AddOp0 = Add.getOperand(0);
54289 SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
54290 SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);
54291
54292 // The wider add is guaranteed to not wrap because both operands are
54293 // sign-extended.
54294 SDNodeFlags Flags;
54295 Flags.setNoSignedWrap(NSW);
54296 Flags.setNoUnsignedWrap(NUW);
54297 return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
54298}
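
The promotion above is justified by the no-wrap flags: when the narrow add cannot overflow, extending first and adding the extended constant gives the same value, and the constant can then fold into an LEA displacement (the zext/nuw case is analogous). A small sketch using an arbitrary example constant and hypothetical names; illustrative only, not part of this file:

#include <cstdint>

// With 'add nsw' semantics (no signed overflow), these two are equal:
//   sext(add_nsw(x, 20))  ==  add(sext(x), 20)
int64_t addThenExtend(int32_t x) {
  return static_cast<int64_t>(x + 20);   // narrow add, then sign extend
}
int64_t extendThenAdd(int32_t x) {
  return static_cast<int64_t>(x) + 20;   // sign extend, then wide add
}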
54299
54300// If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant
54301// operands and the result of CMOV is not used anywhere else - promote CMOV
54302// itself instead of promoting its result. This could be beneficial, because:
54303// 1) X86TargetLowering::EmitLoweredSelect later can do merging of two
54304// (or more) pseudo-CMOVs only when they go one-after-another and
54305// getting rid of result extension code after CMOV will help that.
54306// 2) Promotion of constant CMOV arguments is free, hence the
54307// {ANY,SIGN,ZERO}_EXTEND will just be deleted.
54308 // 3) The 16-bit CMOV encoding is 4 bytes and the 32-bit CMOV is 3 bytes, so
54309 // this promotion is also good in terms of code size.
54310 // (64-bit CMOV is 4 bytes, which is why we don't do 32-bit => 64-bit
54311 // promotion).
54312static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
54313 SDValue CMovN = Extend->getOperand(0);
54314 if (CMovN.getOpcode() != X86ISD::CMOV || !CMovN.hasOneUse())
54315 return SDValue();
54316
54317 EVT TargetVT = Extend->getValueType(0);
54318 unsigned ExtendOpcode = Extend->getOpcode();
54319 SDLoc DL(Extend);
54320
54321 EVT VT = CMovN.getValueType();
54322 SDValue CMovOp0 = CMovN.getOperand(0);
54323 SDValue CMovOp1 = CMovN.getOperand(1);
54324
54325 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
54326 !isa<ConstantSDNode>(CMovOp1.getNode()))
54327 return SDValue();
54328
54329 // Only extend to i32 or i64.
54330 if (TargetVT != MVT::i32 && TargetVT != MVT::i64)
54331 return SDValue();
54332
54333 // Only extend from i16 unless it's a sign_extend from i32. Zext/aext from i32
54334 // are free.
54335 if (VT != MVT::i16 && !(ExtendOpcode == ISD::SIGN_EXTEND && VT == MVT::i32))
54336 return SDValue();
54337
54338 // If this is a zero extend to i64, we should only extend to i32 and use a free
54339 // zero extend to finish.
54340 EVT ExtendVT = TargetVT;
54341 if (TargetVT == MVT::i64 && ExtendOpcode != ISD::SIGN_EXTEND)
54342 ExtendVT = MVT::i32;
54343
54344 CMovOp0 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp0);
54345 CMovOp1 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp1);
54346
54347 SDValue Res = DAG.getNode(X86ISD::CMOV, DL, ExtendVT, CMovOp0, CMovOp1,
54348 CMovN.getOperand(2), CMovN.getOperand(3));
54349
54350 // Finish extending if needed.
54351 if (ExtendVT != TargetVT)
54352 Res = DAG.getNode(ExtendOpcode, DL, TargetVT, Res);
54353
54354 return Res;
54355}
54356
54357// Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm
54358// result type.
54359static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
54360 const X86Subtarget &Subtarget) {
54361 SDValue N0 = N->getOperand(0);
54362 EVT VT = N->getValueType(0);
54363 SDLoc dl(N);
54364
54365 // Only do this combine with AVX512 for vector extends.
54366 if (!Subtarget.hasAVX512() || !VT.isVector() || N0.getOpcode() != ISD::SETCC)
54367 return SDValue();
54368
54369 // Only combine legal element types.
54370 EVT SVT = VT.getVectorElementType();
54371 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 &&
54372 SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64)
54373 return SDValue();
54374
54375 // We don't have a CMPP instruction for vXf16.
54376 if (N0.getOperand(0).getValueType().getVectorElementType() == MVT::f16)
54377 return SDValue();
54378 // We can only do this if the vector size is 256 bits or less.
54379 unsigned Size = VT.getSizeInBits();
54380 if (Size > 256 && Subtarget.useAVX512Regs())
54381 return SDValue();
54382
54383 // Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since
54384 // those are the only integer compares we have.
54385 ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
54386 if (ISD::isUnsignedIntSetCC(CC))
54387 return SDValue();
54388
54389 // Only do this combine if the extension will be fully consumed by the setcc.
54390 EVT N00VT = N0.getOperand(0).getValueType();
54391 EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
54392 if (Size != MatchingVecType.getSizeInBits())
54393 return SDValue();
54394
54395 SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
54396
54397 if (N->getOpcode() == ISD::ZERO_EXTEND)
54398 Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType());
54399
54400 return Res;
54401}
54402
54403static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
54404 TargetLowering::DAGCombinerInfo &DCI,
54405 const X86Subtarget &Subtarget) {
54406 SDValue N0 = N->getOperand(0);
54407 EVT VT = N->getValueType(0);
54408 SDLoc DL(N);
54409
54410 // (i32 (sext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
54411 if (!DCI.isBeforeLegalizeOps() &&
54412 N0.getOpcode() == X86ISD::SETCC_CARRY) {
54413 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, N0->getOperand(0),
54414 N0->getOperand(1));
54415 bool ReplaceOtherUses = !N0.hasOneUse();
54416 DCI.CombineTo(N, Setcc);
54417 // Replace other uses with a truncate of the widened setcc_carry.
54418 if (ReplaceOtherUses) {
54419 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
54420 N0.getValueType(), Setcc);
54421 DCI.CombineTo(N0.getNode(), Trunc);
54422 }
54423
54424 return SDValue(N, 0);
54425 }
54426
54427 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
54428 return NewCMov;
54429
54430 if (!DCI.isBeforeLegalizeOps())
54431 return SDValue();
54432
54433 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
54434 return V;
54435
54436 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), DL, VT, N0,
54437 DAG, DCI, Subtarget))
54438 return V;
54439
54440 if (VT.isVector()) {
54441 if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
54442 return R;
54443
54444 if (N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG)
54445 return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0));
54446 }
54447
54448 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
54449 return NewAdd;
54450
54451 return SDValue();
54452}
54453
54454static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
54455 TargetLowering::DAGCombinerInfo &DCI,
54456 const X86Subtarget &Subtarget) {
54457 SDLoc dl(N);
54458 EVT VT = N->getValueType(0);
54459 bool IsStrict = N->isStrictFPOpcode() || N->isTargetStrictFPOpcode();
54460
54461 // Let legalize expand this if it isn't a legal type yet.
54462 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54463 if (!TLI.isTypeLegal(VT))
54464 return SDValue();
54465
54466 SDValue A = N->getOperand(IsStrict ? 1 : 0);
54467 SDValue B = N->getOperand(IsStrict ? 2 : 1);
54468 SDValue C = N->getOperand(IsStrict ? 3 : 2);
54469
54470 // If the operation allows fast-math and the target does not support FMA,
54471 // split this into mul+add to avoid libcall(s).
54472 SDNodeFlags Flags = N->getFlags();
54473 if (!IsStrict && Flags.hasAllowReassociation() &&
54474 TLI.isOperationExpand(ISD::FMA, VT)) {
54475 SDValue Fmul = DAG.getNode(ISD::FMUL, dl, VT, A, B, Flags);
54476 return DAG.getNode(ISD::FADD, dl, VT, Fmul, C, Flags);
54477 }
54478
54479 EVT ScalarVT = VT.getScalarType();
54480 if (((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
54481 !Subtarget.hasAnyFMA()) &&
54482 !(ScalarVT == MVT::f16 && Subtarget.hasFP16()))
54483 return SDValue();
54484
54485 auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) {
54486 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
54487 bool LegalOperations = !DCI.isBeforeLegalizeOps();
54488 if (SDValue NegV = TLI.getCheaperNegatedExpression(V, DAG, LegalOperations,
54489 CodeSize)) {
54490 V = NegV;
54491 return true;
54492 }
54493 // Look through extract_vector_elts. If it comes from an FNEG, create a
54494 // new extract from the FNEG input.
54495 if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
54496 isNullConstant(V.getOperand(1))) {
54497 SDValue Vec = V.getOperand(0);
54498 if (SDValue NegV = TLI.getCheaperNegatedExpression(
54499 Vec, DAG, LegalOperations, CodeSize)) {
54500 V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(),
54501 NegV, V.getOperand(1));
54502 return true;
54503 }
54504 }
54505
54506 return false;
54507 };
54508
54509 // Do not convert the passthru input of scalar intrinsics.
54510 // FIXME: We could allow negations of the lower element only.
54511 bool NegA = invertIfNegative(A);
54512 bool NegB = invertIfNegative(B);
54513 bool NegC = invertIfNegative(C);
54514
54515 if (!NegA && !NegB && !NegC)
54516 return SDValue();
54517
54518 unsigned NewOpcode =
54519 negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false);
54520
54521 // Propagate fast-math-flags to new FMA node.
54522 SelectionDAG::FlagInserter FlagsInserter(DAG, Flags);
54523 if (IsStrict) {
54524 assert(N->getNumOperands() == 4 && "Shouldn't be greater than 4");
54525 return DAG.getNode(NewOpcode, dl, {VT, MVT::Other},
54526 {N->getOperand(0), A, B, C});
54527 } else {
54528 if (N->getNumOperands() == 4)
54529 return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
54530 return DAG.getNode(NewOpcode, dl, VT, A, B, C);
54531 }
54532}
54533
54534// Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
54535// Combine FMSUBADD(A, B, FNEG(C)) -> FMADDSUB(A, B, C)
54536static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
54537 TargetLowering::DAGCombinerInfo &DCI) {
54538 SDLoc dl(N);
54539 EVT VT = N->getValueType(0);
54540 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54541 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
54542 bool LegalOperations = !DCI.isBeforeLegalizeOps();
54543
54544 SDValue N2 = N->getOperand(2);
54545
54546 SDValue NegN2 =
54547 TLI.getCheaperNegatedExpression(N2, DAG, LegalOperations, CodeSize);
54548 if (!NegN2)
54549 return SDValue();
54550 unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), false, true, false);
54551
54552 if (N->getNumOperands() == 4)
54553 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
54554 NegN2, N->getOperand(3));
54555 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
54556 NegN2);
54557}
54558
54559static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
54560 TargetLowering::DAGCombinerInfo &DCI,
54561 const X86Subtarget &Subtarget) {
54562 SDLoc dl(N);
54563 SDValue N0 = N->getOperand(0);
54564 EVT VT = N->getValueType(0);
54565
54566 // (i32 (aext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
54567 // FIXME: Is this needed? We don't seem to have any tests for it.
54568 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ANY_EXTEND &&
54569 N0.getOpcode() == X86ISD::SETCC_CARRY) {
54570 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N0->getOperand(0),
54571 N0->getOperand(1));
54572 bool ReplaceOtherUses = !N0.hasOneUse();
54573 DCI.CombineTo(N, Setcc);
54574 // Replace other uses with a truncate of the widened setcc_carry.
54575 if (ReplaceOtherUses) {
54576 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
54577 N0.getValueType(), Setcc);
54578 DCI.CombineTo(N0.getNode(), Trunc);
54579 }
54580
54581 return SDValue(N, 0);
54582 }
54583
54584 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
54585 return NewCMov;
54586
54587 if (DCI.isBeforeLegalizeOps())
54588 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
54589 return V;
54590
54591 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), dl, VT, N0,
54592 DAG, DCI, Subtarget))
54593 return V;
54594
54595 if (VT.isVector())
54596 if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
54597 return R;
54598
54599 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
54600 return NewAdd;
54601
54602 if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
54603 return R;
54604
54605 // TODO: Combine with any target/faux shuffle.
54606 if (N0.getOpcode() == X86ISD::PACKUS && N0.getValueSizeInBits() == 128 &&
54607 VT.getScalarSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits()) {
54608 SDValue N00 = N0.getOperand(0);
54609 SDValue N01 = N0.getOperand(1);
54610 unsigned NumSrcEltBits = N00.getScalarValueSizeInBits();
54611 APInt ZeroMask = APInt::getHighBitsSet(NumSrcEltBits, NumSrcEltBits / 2);
54612 if ((N00.isUndef() || DAG.MaskedValueIsZero(N00, ZeroMask)) &&
54613 (N01.isUndef() || DAG.MaskedValueIsZero(N01, ZeroMask))) {
54614 return concatSubVectors(N00, N01, DAG, dl);
54615 }
54616 }
54617
54618 return SDValue();
54619}
54620
54621 /// If we have AVX512 but not BWI, and this is a vXi16/vXi8 setcc, just
54622/// pre-promote its result type since vXi1 vectors don't get promoted
54623/// during type legalization.
54624static SDValue truncateAVX512SetCCNoBWI(EVT VT, EVT OpVT, SDValue LHS,
54625 SDValue RHS, ISD::CondCode CC,
54626 const SDLoc &DL, SelectionDAG &DAG,
54627 const X86Subtarget &Subtarget) {
54628 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
54629 VT.getVectorElementType() == MVT::i1 &&
54630 (OpVT.getVectorElementType() == MVT::i8 ||
54631 OpVT.getVectorElementType() == MVT::i16)) {
54632 SDValue Setcc = DAG.getSetCC(DL, OpVT, LHS, RHS, CC);
54633 return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
54634 }
54635 return SDValue();
54636}
54637
54638static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
54639 TargetLowering::DAGCombinerInfo &DCI,
54640 const X86Subtarget &Subtarget) {
54641 const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
54642 const SDValue LHS = N->getOperand(0);
54643 const SDValue RHS = N->getOperand(1);
54644 EVT VT = N->getValueType(0);
54645 EVT OpVT = LHS.getValueType();
54646 SDLoc DL(N);
54647
54648 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
54649 if (SDValue V = combineVectorSizedSetCCEquality(VT, LHS, RHS, CC, DL, DAG,
54650 Subtarget))
54651 return V;
54652
54653 if (VT == MVT::i1) {
54654 X86::CondCode X86CC;
54655 if (SDValue V =
54656 MatchVectorAllEqualTest(LHS, RHS, CC, DL, Subtarget, DAG, X86CC))
54657 return DAG.getNode(ISD::TRUNCATE, DL, VT, getSETCC(X86CC, V, DL, DAG));
54658 }
54659
54660 if (OpVT.isScalarInteger()) {
54661 // cmpeq(or(X,Y),X) --> cmpeq(and(~X,Y),0)
54662 // cmpne(or(X,Y),X) --> cmpne(and(~X,Y),0)
54663 auto MatchOrCmpEq = [&](SDValue N0, SDValue N1) {
54664 if (N0.getOpcode() == ISD::OR && N0->hasOneUse()) {
54665 if (N0.getOperand(0) == N1)
54666 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
54667 N0.getOperand(1));
54668 if (N0.getOperand(1) == N1)
54669 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
54670 N0.getOperand(0));
54671 }
54672 return SDValue();
54673 };
54674 if (SDValue AndN = MatchOrCmpEq(LHS, RHS))
54675 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
54676 if (SDValue AndN = MatchOrCmpEq(RHS, LHS))
54677 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
54678
54679 // cmpeq(and(X,Y),Y) --> cmpeq(and(~X,Y),0)
54680 // cmpne(and(X,Y),Y) --> cmpne(and(~X,Y),0)
54681 auto MatchAndCmpEq = [&](SDValue N0, SDValue N1) {
54682 if (N0.getOpcode() == ISD::AND && N0->hasOneUse()) {
54683 if (N0.getOperand(0) == N1)
54684 return DAG.getNode(ISD::AND, DL, OpVT, N1,
54685 DAG.getNOT(DL, N0.getOperand(1), OpVT));
54686 if (N0.getOperand(1) == N1)
54687 return DAG.getNode(ISD::AND, DL, OpVT, N1,
54688 DAG.getNOT(DL, N0.getOperand(0), OpVT));
54689 }
54690 return SDValue();
54691 };
54692 if (SDValue AndN = MatchAndCmpEq(LHS, RHS))
54693 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
54694 if (SDValue AndN = MatchAndCmpEq(RHS, LHS))
54695 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
54696
54697 // cmpeq(trunc(x),C) --> cmpeq(x,C)
54698 // cmpne(trunc(x),C) --> cmpne(x,C)
54699 // iff x upper bits are zero.
54700 if (LHS.getOpcode() == ISD::TRUNCATE &&
54701 LHS.getOperand(0).getScalarValueSizeInBits() >= 32 &&
54702 isa<ConstantSDNode>(RHS) && !DCI.isBeforeLegalize()) {
54703 EVT SrcVT = LHS.getOperand(0).getValueType();
54704 APInt UpperBits = APInt::getBitsSetFrom(SrcVT.getScalarSizeInBits(),
54705 OpVT.getScalarSizeInBits());
54706 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54707 auto *C = cast<ConstantSDNode>(RHS);
54708 if (DAG.MaskedValueIsZero(LHS.getOperand(0), UpperBits) &&
54709 TLI.isTypeLegal(LHS.getOperand(0).getValueType()))
54710 return DAG.getSetCC(DL, VT, LHS.getOperand(0),
54711 DAG.getConstant(C->getAPIntValue().zextOrTrunc(
54712 SrcVT.getScalarSizeInBits()),
54713 DL, SrcVT),
54714 CC);
54715 }
54716
54717 // With C as a power of 2 and C != 0 and C != INT_MIN:
54718 // icmp eq Abs(X) C ->
54719 // (icmp eq A, C) | (icmp eq A, -C)
54720 // icmp ne Abs(X) C ->
54721 // (icmp ne A, C) & (icmp ne A, -C)
54722 // Both of these patterns can be better optimized in
54723 // DAGCombiner::foldAndOrOfSETCC. Note this only applies for scalar
54724 // integers which is checked above.
54725 if (LHS.getOpcode() == ISD::ABS && LHS.hasOneUse()) {
54726 if (auto *C = dyn_cast<ConstantSDNode>(RHS)) {
54727 const APInt &CInt = C->getAPIntValue();
54728 // We can better optimize this case in DAGCombiner::foldAndOrOfSETCC.
54729 if (CInt.isPowerOf2() && !CInt.isMinSignedValue()) {
54730 SDValue BaseOp = LHS.getOperand(0);
54731 SDValue SETCC0 = DAG.getSetCC(DL, VT, BaseOp, RHS, CC);
54732 SDValue SETCC1 = DAG.getSetCC(
54733 DL, VT, BaseOp, DAG.getConstant(-CInt, DL, OpVT), CC);
54734 return DAG.getNode(CC == ISD::SETEQ ? ISD::OR : ISD::AND, DL, VT,
54735 SETCC0, SETCC1);
54736 }
54737 }
54738 }
54739 }
54740 }
54741
54742 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
54743 (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
54744 // Using temporaries to avoid messing up operand ordering for later
54745 // transformations if this doesn't work.
54746 SDValue Op0 = LHS;
54747 SDValue Op1 = RHS;
54748 ISD::CondCode TmpCC = CC;
54749 // Put build_vector on the right.
54750 if (Op0.getOpcode() == ISD::BUILD_VECTOR) {
54751 std::swap(Op0, Op1);
54752 TmpCC = ISD::getSetCCSwappedOperands(TmpCC);
54753 }
54754
54755 bool IsSEXT0 =
54756 (Op0.getOpcode() == ISD::SIGN_EXTEND) &&
54757 (Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
54758 bool IsVZero1 = ISD::isBuildVectorAllZeros(Op1.getNode());
54759
54760 if (IsSEXT0 && IsVZero1) {
54761 assert(VT == Op0.getOperand(0).getValueType() &&
54762 "Unexpected operand type");
54763 if (TmpCC == ISD::SETGT)
54764 return DAG.getConstant(0, DL, VT);
54765 if (TmpCC == ISD::SETLE)
54766 return DAG.getConstant(1, DL, VT);
54767 if (TmpCC == ISD::SETEQ || TmpCC == ISD::SETGE)
54768 return DAG.getNOT(DL, Op0.getOperand(0), VT);
54769
54770 assert((TmpCC == ISD::SETNE || TmpCC == ISD::SETLT) &&
54771 "Unexpected condition code!");
54772 return Op0.getOperand(0);
54773 }
54774 }
54775
54776 // Try to make an unsigned vector comparison signed. On pre-AVX512 targets the
54777 // only ordered vector compare is the signed `PCMPGT`, and on AVX512 it's often
54778 // better to use `PCMPGT` if the result is meant to stay in a vector (and if
54779 // it's going to a mask, there are signed AVX512 comparisons).
54780 if (VT.isVector() && OpVT.isVector() && OpVT.isInteger()) {
54781 bool CanMakeSigned = false;
54782 if (ISD::isUnsignedIntSetCC(CC)) {
54783 KnownBits CmpKnown = KnownBits::commonBits(DAG.computeKnownBits(LHS),
54784 DAG.computeKnownBits(RHS));
54785 // If we know LHS/RHS share the same sign bit at each element we can
54786 // make this signed.
54787 // NOTE: `computeKnownBits` on a vector type aggregates common bits
54788 // across all lanes. So a pattern where the sign varies from lane to
54789 // lane, but at each lane Sign(LHS) is known to equal Sign(RHS), will be
54790 // missed. We could get around this by demanding each lane
54791 // independently, but this isn't the most important optimization and
54792 // that may eat into compile time.
54793 CanMakeSigned =
54794 CmpKnown.Zero.isSignBitSet() || CmpKnown.One.isSignBitSet();
54795 }
54796 if (CanMakeSigned || ISD::isSignedIntSetCC(CC)) {
54797 SDValue LHSOut = LHS;
54798 SDValue RHSOut = RHS;
54799 ISD::CondCode NewCC = CC;
54800 switch (CC) {
54801 case ISD::SETGE:
54802 case ISD::SETUGE:
54803 if (SDValue NewLHS = incDecVectorConstant(LHS, DAG, /*IsInc*/ true,
54804 /*NSW*/ true))
54805 LHSOut = NewLHS;
54806 else if (SDValue NewRHS = incDecVectorConstant(
54807 RHS, DAG, /*IsInc*/ false, /*NSW*/ true))
54808 RHSOut = NewRHS;
54809 else
54810 break;
54811
54812 [[fallthrough]];
54813 case ISD::SETUGT:
54814 NewCC = ISD::SETGT;
54815 break;
54816
54817 case ISD::SETLE:
54818 case ISD::SETULE:
54819 if (SDValue NewLHS = incDecVectorConstant(LHS, DAG, /*IsInc*/ false,
54820 /*NSW*/ true))
54821 LHSOut = NewLHS;
54822 else if (SDValue NewRHS = incDecVectorConstant(RHS, DAG, /*IsInc*/ true,
54823 /*NSW*/ true))
54824 RHSOut = NewRHS;
54825 else
54826 break;
54827
54828 [[fallthrough]];
54829 case ISD::SETULT:
54830 // Will be swapped to SETGT in LowerVSETCC*.
54831 NewCC = ISD::SETLT;
54832 break;
54833 default:
54834 break;
54835 }
54836 if (NewCC != CC) {
54837 if (SDValue R = truncateAVX512SetCCNoBWI(VT, OpVT, LHSOut, RHSOut,
54838 NewCC, DL, DAG, Subtarget))
54839 return R;
54840 return DAG.getSetCC(DL, VT, LHSOut, RHSOut, NewCC);
54841 }
54842 }
54843 }
54844
54845 if (SDValue R =
54846 truncateAVX512SetCCNoBWI(VT, OpVT, LHS, RHS, CC, DL, DAG, Subtarget))
54847 return R;
54848
54849 // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
54850 // to avoid scalarization via legalization because v4i32 is not a legal type.
54851 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
54852 LHS.getValueType() == MVT::v4f32)
54853 return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
54854
54855 // X pred 0.0 --> X pred -X
54856 // If the negation of X already exists, use it in the comparison. This removes
54857 // the need to materialize 0.0 and allows matching to SSE's MIN/MAX
54858 // instructions in patterns with a 'select' node.
54859 if (isNullFPScalarOrVectorConst(RHS)) {
54860 SDVTList FNegVT = DAG.getVTList(OpVT);
54861 if (SDNode *FNeg = DAG.getNodeIfExists(ISD::FNEG, FNegVT, {LHS}))
54862 return DAG.getSetCC(DL, VT, LHS, SDValue(FNeg, 0), CC);
54863 }
54864
54865 return SDValue();
54866}
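
Several of the scalar-integer folds above are plain bit identities, e.g. cmpeq(or(X,Y),X) --> cmpeq(and(~X,Y),0): the OR equals X exactly when Y contributes no bits outside X. A quick illustration with hypothetical helper names, not part of this file:

#include <cstdint>

// (X | Y) == X holds exactly when (~X & Y) == 0, i.e. Y sets no bit that X
// does not already have - the identity behind the MatchOrCmpEq fold above.
bool orEqualsLHS(uint32_t X, uint32_t Y) { return (X | Y) == X; }
bool andnotIsZero(uint32_t X, uint32_t Y) { return (~X & Y) == 0; }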
54867
54868static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
54869 TargetLowering::DAGCombinerInfo &DCI,
54870 const X86Subtarget &Subtarget) {
54871 SDValue Src = N->getOperand(0);
54872 MVT SrcVT = Src.getSimpleValueType();
54873 MVT VT = N->getSimpleValueType(0);
54874 unsigned NumBits = VT.getScalarSizeInBits();
54875 unsigned NumElts = SrcVT.getVectorNumElements();
54876 unsigned NumBitsPerElt = SrcVT.getScalarSizeInBits();
54877 assert(VT == MVT::i32 && NumElts <= NumBits && "Unexpected MOVMSK types");
54878
54879 // Perform constant folding.
54880 APInt UndefElts;
54881 SmallVector<APInt, 32> EltBits;
54882 if (getTargetConstantBitsFromNode(Src, NumBitsPerElt, UndefElts, EltBits)) {
54883 APInt Imm(32, 0);
54884 for (unsigned Idx = 0; Idx != NumElts; ++Idx)
54885 if (!UndefElts[Idx] && EltBits[Idx].isNegative())
54886 Imm.setBit(Idx);
54887
54888 return DAG.getConstant(Imm, SDLoc(N), VT);
54889 }
54890
54891 // Look through int->fp bitcasts that don't change the element width.
54892 unsigned EltWidth = SrcVT.getScalarSizeInBits();
54893 if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::BITCAST &&
54894 Src.getOperand(0).getScalarValueSizeInBits() == EltWidth)
54895 return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0));
54896
54897 // Fold movmsk(not(x)) -> not(movmsk(x)) to improve folding of movmsk results
54898 // with scalar comparisons.
54899 if (SDValue NotSrc = IsNOT(Src, DAG)) {
54900 SDLoc DL(N);
54901 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
54902 NotSrc = DAG.getBitcast(SrcVT, NotSrc);
54903 return DAG.getNode(ISD::XOR, DL, VT,
54904 DAG.getNode(X86ISD::MOVMSK, DL, VT, NotSrc),
54905 DAG.getConstant(NotMask, DL, VT));
54906 }
54907
54908 // Fold movmsk(icmp_sgt(x,-1)) -> not(movmsk(x)) to improve folding of movmsk
54909 // results with scalar comparisons.
54910 if (Src.getOpcode() == X86ISD::PCMPGT &&
54911 ISD::isBuildVectorAllOnes(Src.getOperand(1).getNode())) {
54912 SDLoc DL(N);
54913 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
54914 return DAG.getNode(ISD::XOR, DL, VT,
54915 DAG.getNode(X86ISD::MOVMSK, DL, VT, Src.getOperand(0)),
54916 DAG.getConstant(NotMask, DL, VT));
54917 }
54918
54919 // Fold movmsk(icmp_eq(and(x,c1),c1)) -> movmsk(shl(x,c2))
54920 // Fold movmsk(icmp_eq(and(x,c1),0)) -> movmsk(not(shl(x,c2)))
54921 // iff pow2splat(c1).
54922 // Use KnownBits to determine if only a single bit is non-zero
54923 // in each element (pow2 or zero), and shift that bit to the msb.
54924 if (Src.getOpcode() == X86ISD::PCMPEQ) {
54925 KnownBits KnownLHS = DAG.computeKnownBits(Src.getOperand(0));
54926 KnownBits KnownRHS = DAG.computeKnownBits(Src.getOperand(1));
54927 unsigned ShiftAmt = KnownLHS.countMinLeadingZeros();
54928 if (KnownLHS.countMaxPopulation() == 1 &&
54929 (KnownRHS.isZero() || (KnownRHS.countMaxPopulation() == 1 &&
54930 ShiftAmt == KnownRHS.countMinLeadingZeros()))) {
54931 SDLoc DL(N);
54932 MVT ShiftVT = SrcVT;
54933 SDValue ShiftLHS = Src.getOperand(0);
54934 SDValue ShiftRHS = Src.getOperand(1);
54935 if (ShiftVT.getScalarType() == MVT::i8) {
54936 // vXi8 shifts - we only care about the signbit so can use PSLLW.
54937 ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
54938 ShiftLHS = DAG.getBitcast(ShiftVT, ShiftLHS);
54939 ShiftRHS = DAG.getBitcast(ShiftVT, ShiftRHS);
54940 }
54941 ShiftLHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT,
54942 ShiftLHS, ShiftAmt, DAG);
54943 ShiftRHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT,
54944 ShiftRHS, ShiftAmt, DAG);
54945 ShiftLHS = DAG.getBitcast(SrcVT, ShiftLHS);
54946 ShiftRHS = DAG.getBitcast(SrcVT, ShiftRHS);
54947 SDValue Res = DAG.getNode(ISD::XOR, DL, SrcVT, ShiftLHS, ShiftRHS);
54948 return DAG.getNode(X86ISD::MOVMSK, DL, VT, DAG.getNOT(DL, Res, SrcVT));
54949 }
54950 }
54951
54952 // Fold movmsk(logic(X,C)) -> logic(movmsk(X),C)
54953 if (N->isOnlyUserOf(Src.getNode())) {
54954 SDValue SrcBC = peekThroughOneUseBitcasts(Src);
54955 if (ISD::isBitwiseLogicOp(SrcBC.getOpcode())) {
54956 APInt UndefElts;
54957 SmallVector<APInt, 32> EltBits;
54958 if (getTargetConstantBitsFromNode(SrcBC.getOperand(1), NumBitsPerElt,
54959 UndefElts, EltBits)) {
54960 APInt Mask = APInt::getZero(NumBits);
54961 for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
54962 if (!UndefElts[Idx] && EltBits[Idx].isNegative())
54963 Mask.setBit(Idx);
54964 }
54965 SDLoc DL(N);
54966 SDValue NewSrc = DAG.getBitcast(SrcVT, SrcBC.getOperand(0));
54967 SDValue NewMovMsk = DAG.getNode(X86ISD::MOVMSK, DL, VT, NewSrc);
54968 return DAG.getNode(SrcBC.getOpcode(), DL, VT, NewMovMsk,
54969 DAG.getConstant(Mask, DL, VT));
54970 }
54971 }
54972 }
54973
54974 // Simplify the inputs.
54975 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54976 APInt DemandedMask(APInt::getAllOnes(NumBits));
54977 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
54978 return SDValue(N, 0);
54979
54980 return SDValue();
54981}
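// Illustrative sketch, not part of the LLVM sources: the movmsk(not(x)) ->
// not(movmsk(x)) fold above rests on the sign bit of ~x being the complement of
// the sign bit of x. A standalone per-lane model, assuming 8 x i16 lanes
// (helper names are made up for the illustration).
#include <cassert>
#include <cstdint>

static unsigned movmsk8x16(const uint16_t Lanes[8]) {
  unsigned Mask = 0;
  for (unsigned I = 0; I != 8; ++I)
    Mask |= unsigned(Lanes[I] >> 15) << I; // collect each lane's sign bit
  return Mask;
}

static void checkMovmskOfNot(const uint16_t Lanes[8]) {
  uint16_t Not[8];
  for (unsigned I = 0; I != 8; ++I)
    Not[I] = uint16_t(~Lanes[I]);
  // movmsk(~x) == movmsk(x) ^ low-NumElts-bits mask (NotMask in the code above).
  assert(movmsk8x16(Not) == (movmsk8x16(Lanes) ^ 0xFFu));
}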
54982
54983static SDValue combineTESTP(SDNode *N, SelectionDAG &DAG,
54984 TargetLowering::DAGCombinerInfo &DCI,
54985 const X86Subtarget &Subtarget) {
54986 MVT VT = N->getSimpleValueType(0);
54987 unsigned NumBits = VT.getScalarSizeInBits();
54988
54989 // Simplify the inputs.
54990 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54991 APInt DemandedMask(APInt::getAllOnes(NumBits));
54992 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
54993 return SDValue(N, 0);
54994
54995 return SDValue();
54996}
54997
54998static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG,
54999 TargetLowering::DAGCombinerInfo &DCI,
55000 const X86Subtarget &Subtarget) {
55001 auto *MemOp = cast<X86MaskedGatherScatterSDNode>(N);
55002 SDValue BasePtr = MemOp->getBasePtr();
55003 SDValue Index = MemOp->getIndex();
55004 SDValue Scale = MemOp->getScale();
55005 SDValue Mask = MemOp->getMask();
55006
55007 // Attempt to fold an index scale into the scale value directly.
55008 // For smaller indices, implicit sext is performed BEFORE scale, preventing
55009 // this fold under most circumstances.
55010 // TODO: Move this into X86DAGToDAGISel::matchVectorAddressRecursively?
55011 if ((Index.getOpcode() == X86ISD::VSHLI ||
55012 (Index.getOpcode() == ISD::ADD &&
55013 Index.getOperand(0) == Index.getOperand(1))) &&
55014 isa<ConstantSDNode>(Scale) &&
55015 BasePtr.getScalarValueSizeInBits() == Index.getScalarValueSizeInBits()) {
55016 unsigned ShiftAmt =
55017 Index.getOpcode() == ISD::ADD ? 1 : Index.getConstantOperandVal(1);
55018 uint64_t ScaleAmt = cast<ConstantSDNode>(Scale)->getZExtValue();
55019 uint64_t NewScaleAmt = ScaleAmt * (1ULL << ShiftAmt);
55020 if (isPowerOf2_64(NewScaleAmt) && NewScaleAmt <= 8) {
55021 SDValue NewIndex = Index.getOperand(0);
55022 SDValue NewScale =
55023 DAG.getTargetConstant(NewScaleAmt, SDLoc(N), Scale.getValueType());
55024 if (N->getOpcode() == X86ISD::MGATHER)
55025 return getAVX2GatherNode(N->getOpcode(), SDValue(N, 0), DAG,
55026 MemOp->getOperand(1), Mask,
55027 MemOp->getBasePtr(), NewIndex, NewScale,
55028 MemOp->getChain(), Subtarget);
55029 if (N->getOpcode() == X86ISD::MSCATTER)
55030 return getScatterNode(N->getOpcode(), SDValue(N, 0), DAG,
55031 MemOp->getOperand(1), Mask, MemOp->getBasePtr(),
55032 NewIndex, NewScale, MemOp->getChain(), Subtarget);
55033 }
55034 }
55035
55036 // With vector masks we only demand the upper bit of the mask.
55037 if (Mask.getScalarValueSizeInBits() != 1) {
55038 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55039 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
55040 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
55041 if (N->getOpcode() != ISD::DELETED_NODE)
55042 DCI.AddToWorklist(N);
55043 return SDValue(N, 0);
55044 }
55045 }
55046
55047 return SDValue();
55048}
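// Illustrative sketch, not part of the LLVM sources: the index-scale fold above
// uses (Index << Shift) * Scale == Index * (Scale << Shift) together with the
// x86 addressing restriction that the final scale must be 1, 2, 4 or 8. Plain
// 64-bit integers stand in for the vector lanes; names are made up.
#include <cassert>
#include <cstdint>

static bool foldShiftIntoScale(uint64_t ScaleAmt, unsigned ShiftAmt,
                               uint64_t &NewScaleAmt) {
  NewScaleAmt = ScaleAmt * (1ULL << ShiftAmt);
  bool IsPow2 = NewScaleAmt != 0 && (NewScaleAmt & (NewScaleAmt - 1)) == 0;
  return IsPow2 && NewScaleAmt <= 8; // same legality bound as the combine
}

static void checkScaleFold(uint64_t Index, uint64_t ScaleAmt, unsigned ShiftAmt) {
  uint64_t NewScaleAmt;
  if (ShiftAmt < 64 && foldShiftIntoScale(ScaleAmt, ShiftAmt, NewScaleAmt))
    assert((Index << ShiftAmt) * ScaleAmt == Index * NewScaleAmt);
}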
55049
55050static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS,
55051 SDValue Index, SDValue Base, SDValue Scale,
55052 SelectionDAG &DAG) {
55053 SDLoc DL(GorS);
55054
55055 if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
55056 SDValue Ops[] = { Gather->getChain(), Gather->getPassThru(),
55057 Gather->getMask(), Base, Index, Scale } ;
55058 return DAG.getMaskedGather(Gather->getVTList(),
55059 Gather->getMemoryVT(), DL, Ops,
55060 Gather->getMemOperand(),
55061 Gather->getIndexType(),
55062 Gather->getExtensionType());
55063 }
55064 auto *Scatter = cast<MaskedScatterSDNode>(GorS);
55065 SDValue Ops[] = { Scatter->getChain(), Scatter->getValue(),
55066 Scatter->getMask(), Base, Index, Scale };
55067 return DAG.getMaskedScatter(Scatter->getVTList(),
55068 Scatter->getMemoryVT(), DL,
55069 Ops, Scatter->getMemOperand(),
55070 Scatter->getIndexType(),
55071 Scatter->isTruncatingStore());
55072}
55073
55074static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
55075 TargetLowering::DAGCombinerInfo &DCI) {
55076 SDLoc DL(N);
55077 auto *GorS = cast<MaskedGatherScatterSDNode>(N);
55078 SDValue Index = GorS->getIndex();
55079 SDValue Base = GorS->getBasePtr();
55080 SDValue Scale = GorS->getScale();
55081
55082 if (DCI.isBeforeLegalize()) {
55083 unsigned IndexWidth = Index.getScalarValueSizeInBits();
55084
55085 // Shrink constant indices if they are larger than 32 bits.
55086 // Only do this before legalize types since v2i64 could become v2i32.
55087 // FIXME: We could check that the type is legal if we're after legalize
55088 // types, but then we would need to construct test cases where that happens.
55089 // FIXME: We could support more than just constant vectors, but we need to
55090 // be careful with costing. A truncate that can be optimized out would be fine.
55091 // Otherwise we might only want to create a truncate if it avoids a split.
55092 if (auto *BV = dyn_cast<BuildVectorSDNode>(Index)) {
55093 if (BV->isConstant() && IndexWidth > 32 &&
55094 DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
55095 EVT NewVT = Index.getValueType().changeVectorElementType(MVT::i32);
55096 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
55097 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
55098 }
55099 }
55100
55101 // Shrink any sign/zero extend from a type of 32 bits or smaller to one
55102 // larger than 32 bits if there are sufficient sign bits. Only do this
55103 // before type legalization to avoid creating illegal types in the truncate.
55104 if ((Index.getOpcode() == ISD::SIGN_EXTEND ||
55105 Index.getOpcode() == ISD::ZERO_EXTEND) &&
55106 IndexWidth > 32 &&
55107 Index.getOperand(0).getScalarValueSizeInBits() <= 32 &&
55108 DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
55109 EVT NewVT = Index.getValueType().changeVectorElementType(MVT::i32);
55110 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
55111 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
55112 }
55113 }
55114
55115 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55116 EVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
55117 // Try to move splat constant adders from the index operand to the base
55118 // pointer operand, taking care to multiply by the scale. We can only do
55119 // this when the index element type is the same as the pointer type;
55120 // otherwise we would need to be sure the math doesn't wrap before the scale.
55121 if (Index.getOpcode() == ISD::ADD &&
55122 Index.getValueType().getVectorElementType() == PtrVT &&
55123 isa<ConstantSDNode>(Scale)) {
55124 uint64_t ScaleAmt = cast<ConstantSDNode>(Scale)->getZExtValue();
55125 if (auto *BV = dyn_cast<BuildVectorSDNode>(Index.getOperand(1))) {
55126 BitVector UndefElts;
55127 if (ConstantSDNode *C = BV->getConstantSplatNode(&UndefElts)) {
55128 // FIXME: Allow non-constant?
55129 if (UndefElts.none()) {
55130 // Apply the scale.
55131 APInt Adder = C->getAPIntValue() * ScaleAmt;
55132 // Add it to the existing base.
55133 Base = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
55134 DAG.getConstant(Adder, DL, PtrVT));
55135 Index = Index.getOperand(0);
55136 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
55137 }
55138 }
55139
55140 // It's also possible base is just a constant. In that case, just
55141 // replace it with 0 and move the displacement into the index.
55142 if (BV->isConstant() && isa<ConstantSDNode>(Base) &&
55143 isOneConstant(Scale)) {
55144 SDValue Splat = DAG.getSplatBuildVector(Index.getValueType(), DL, Base);
55145 // Combine the constant build_vector and the constant base.
55146 Splat = DAG.getNode(ISD::ADD, DL, Index.getValueType(),
55147 Index.getOperand(1), Splat);
55148 // Add to the LHS of the original Index add.
55149 Index = DAG.getNode(ISD::ADD, DL, Index.getValueType(),
55150 Index.getOperand(0), Splat);
55151 Base = DAG.getConstant(0, DL, Base.getValueType());
55152 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
55153 }
55154 }
55155 }
55156
55157 if (DCI.isBeforeLegalizeOps()) {
55158 unsigned IndexWidth = Index.getScalarValueSizeInBits();
55159
55160 // Make sure the index is either i32 or i64
55161 if (IndexWidth != 32 && IndexWidth != 64) {
55162 MVT EltVT = IndexWidth > 32 ? MVT::i64 : MVT::i32;
55163 EVT IndexVT = Index.getValueType().changeVectorElementType(EltVT);
55164 Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
55165 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
55166 }
55167 }
55168
55169 // With vector masks we only demand the upper bit of the mask.
55170 SDValue Mask = GorS->getMask();
55171 if (Mask.getScalarValueSizeInBits() != 1) {
55172 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55173 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
55174 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
55175 if (N->getOpcode() != ISD::DELETED_NODE)
55176 DCI.AddToWorklist(N);
55177 return SDValue(N, 0);
55178 }
55179 }
55180
55181 return SDValue();
55182}
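// Illustrative sketch, not part of the LLVM sources: moving a splat constant
// adder from the index to the base pointer relies on
//   Base + (Index + C) * Scale == (Base + C * Scale) + Index * Scale,
// modelled here per lane with plain integers (names are made up).
#include <cassert>
#include <cstdint>

static void checkSplatAdderIntoBase(uint64_t Base, uint64_t Index, uint64_t C,
                                    uint64_t ScaleAmt) {
  uint64_t Before = Base + (Index + C) * ScaleAmt; // original address
  uint64_t NewBase = Base + C * ScaleAmt;          // adder folded into the base
  assert(Before == NewBase + Index * ScaleAmt);
}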
55183
55184// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
55185static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
55186 const X86Subtarget &Subtarget) {
55187 SDLoc DL(N);
55188 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
55189 SDValue EFLAGS = N->getOperand(1);
55190
55191 // Try to simplify the EFLAGS and condition code operands.
55192 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))
55193 return getSETCC(CC, Flags, DL, DAG);
55194
55195 return SDValue();
55196}
55197
55198/// Optimize branch condition evaluation.
55199static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
55200 const X86Subtarget &Subtarget) {
55201 SDLoc DL(N);
55202 SDValue EFLAGS = N->getOperand(3);
55203 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
55204
55205 // Try to simplify the EFLAGS and condition code operands.
55206 // Make sure to not keep references to operands, as combineSetCCEFLAGS can
55207 // RAUW them under us.
55208 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
55209 SDValue Cond = DAG.getTargetConstant(CC, DL, MVT::i8);
55210 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
55211 N->getOperand(1), Cond, Flags);
55212 }
55213
55214 return SDValue();
55215}
55216
55217// TODO: Could we move this to DAGCombine?
55218static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
55219 SelectionDAG &DAG) {
55220 // Take advantage of vector comparisons (etc.) producing 0 or -1 in each lane
55221 // to optimize away the operation when it is applied to a constant.
55222 //
55223 // The general transformation is:
55224 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
55225 // AND(VECTOR_CMP(x,y), constant2)
55226 // constant2 = UNARYOP(constant)
55227
55228 // Early exit if this isn't a vector operation, the operand of the
55229 // unary operation isn't a bitwise AND, or if the sizes of the operations
55230 // aren't the same.
55231 EVT VT = N->getValueType(0);
55232 bool IsStrict = N->isStrictFPOpcode();
55233 unsigned NumEltBits = VT.getScalarSizeInBits();
55234 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
55235 if (!VT.isVector() || Op0.getOpcode() != ISD::AND ||
55236 DAG.ComputeNumSignBits(Op0.getOperand(0)) != NumEltBits ||
55237 VT.getSizeInBits() != Op0.getValueSizeInBits())
55238 return SDValue();
55239
55240 // Now check that the other operand of the AND is a constant. We could
55241 // make the transformation for non-constant splats as well, but it's unclear
55242 // that would be a benefit as it would not eliminate any operations, just
55243 // perform one more step in scalar code before moving to the vector unit.
55244 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op0.getOperand(1))) {
55245 // Bail out if the vector isn't a constant.
55246 if (!BV->isConstant())
55247 return SDValue();
55248
55249 // Everything checks out. Build up the new and improved node.
55250 SDLoc DL(N);
55251 EVT IntVT = BV->getValueType(0);
55252 // Create a new constant of the appropriate type for the transformed
55253 // DAG.
55254 SDValue SourceConst;
55255 if (IsStrict)
55256 SourceConst = DAG.getNode(N->getOpcode(), DL, {VT, MVT::Other},
55257 {N->getOperand(0), SDValue(BV, 0)});
55258 else
55259 SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
55260 // The AND node needs bitcasts to/from an integer vector type around it.
55261 SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
55262 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Op0->getOperand(0),
55263 MaskConst);
55264 SDValue Res = DAG.getBitcast(VT, NewAnd);
55265 if (IsStrict)
55266 return DAG.getMergeValues({Res, SourceConst.getValue(1)}, DL);
55267 return Res;
55268 }
55269
55270 return SDValue();
55271}
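// Illustrative sketch, not part of the LLVM sources: per lane the vector compare
// is 0 or all-ones, so applying the unary op after the AND gives either
// UNARYOP(C) or UNARYOP(0), and UNARYOP(0) is +0.0 with an all-zero bit pattern.
// Taking SINT_TO_FP as the unary op and i32/f32 lanes (an assumption made only
// for this sketch):
#include <cassert>
#include <cstdint>
#include <cstring>

static uint32_t bitsOf(float F) {
  uint32_t B;
  std::memcpy(&B, &F, sizeof(B));
  return B;
}

static void checkLane(int32_t C, bool CmpLaneTrue) {
  uint32_t Mask = CmpLaneTrue ? 0xFFFFFFFFu : 0u; // one lane of VECTOR_CMP(x,y)
  int32_t Masked = CmpLaneTrue ? C : 0;           // AND(VECTOR_CMP, C) per lane
  uint32_t Before = bitsOf(float(Masked));        // UNARYOP(AND(cmp, C))
  uint32_t After = bitsOf(float(C)) & Mask;       // AND(cmp, UNARYOP(C))
  assert(Before == After);
}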
55272
55273/// If we are converting a value to floating-point, try to replace scalar
55274/// truncate of an extracted vector element with a bitcast. This tries to keep
55275/// the sequence on XMM registers rather than moving between vector and GPRs.
55276static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG) {
55277 // TODO: This is currently only used by combineSIntToFP, but it is generalized
55278 // to allow being called by any similar cast opcode.
55279 // TODO: Consider merging this into lowering: vectorizeExtractedCast().
55280 SDValue Trunc = N->getOperand(0);
55281 if (!Trunc.hasOneUse() || Trunc.getOpcode() != ISD::TRUNCATE)
55282 return SDValue();
55283
55284 SDValue ExtElt = Trunc.getOperand(0);
55285 if (!ExtElt.hasOneUse() || ExtElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
55286 !isNullConstant(ExtElt.getOperand(1)))
55287 return SDValue();
55288
55289 EVT TruncVT = Trunc.getValueType();
55290 EVT SrcVT = ExtElt.getValueType();
55291 unsigned DestWidth = TruncVT.getSizeInBits();
55292 unsigned SrcWidth = SrcVT.getSizeInBits();
55293 if (SrcWidth % DestWidth != 0)
55294 return SDValue();
55295
55296 // inttofp (trunc (extelt X, 0)) --> inttofp (extelt (bitcast X), 0)
55297 EVT SrcVecVT = ExtElt.getOperand(0).getValueType();
55298 unsigned VecWidth = SrcVecVT.getSizeInBits();
55299 unsigned NumElts = VecWidth / DestWidth;
55300 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, NumElts);
55301 SDValue BitcastVec = DAG.getBitcast(BitcastVT, ExtElt.getOperand(0));
55302 SDLoc DL(N);
55303 SDValue NewExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TruncVT,
55304 BitcastVec, ExtElt.getOperand(1));
55305 return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), NewExtElt);
55306}
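// Illustrative sketch, not part of the LLVM sources: on little-endian x86,
// truncating lane 0 of a v2i64 to i32 reads the same bytes as lane 0 of the
// value bitcast to v4i32, which is the rewrite performed above.
#include <cassert>
#include <cstdint>
#include <cstring>

static void checkTruncExtractViaBitcast(const uint64_t Vec[2]) {
  uint32_t Trunc = uint32_t(Vec[0]);      // trunc(extract_elt(X, 0))
  uint32_t Lanes[4];
  std::memcpy(Lanes, Vec, sizeof(Lanes)); // bitcast v2i64 -> v4i32
  assert(Trunc == Lanes[0]);              // extract_elt(bitcast(X), 0)
}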
55307
55308static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
55309 const X86Subtarget &Subtarget) {
55310 bool IsStrict = N->isStrictFPOpcode();
55311 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
55312 EVT VT = N->getValueType(0);
55313 EVT InVT = Op0.getValueType();
55314
55315 // UINT_TO_FP(vXi1~15) -> UINT_TO_FP(ZEXT(vXi1~15 to vXi16))
55316 // UINT_TO_FP(vXi17~31) -> UINT_TO_FP(ZEXT(vXi17~31 to vXi32))
55317 // UINT_TO_FP(vXi33~63) -> UINT_TO_FP(ZEXT(vXi33~63 to vXi64))
55318 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
55319 unsigned ScalarSize = InVT.getScalarSizeInBits();
55320 if (ScalarSize == 16 || ScalarSize == 32 || ScalarSize >= 64)
55321 return SDValue();
55322 SDLoc dl(N);
55323 EVT DstVT = EVT::getVectorVT(*DAG.getContext(),
55324 ScalarSize < 16 ? MVT::i16
55325 : ScalarSize < 32 ? MVT::i32
55326 : MVT::i64,
55327 InVT.getVectorNumElements());
55328 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
55329 if (IsStrict)
55330 return DAG.getNode(ISD::STRICT_UINT_TO_FP, dl, {VT, MVT::Other},
55331 {N->getOperand(0), P});
55332 return DAG.getNode(ISD::UINT_TO_FP, dl, VT, P);
55333 }
55334
55335 // UINT_TO_FP(vXi1) -> SINT_TO_FP(ZEXT(vXi1 to vXi32))
55336 // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
55337 // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
55338 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
55339 VT.getScalarType() != MVT::f16) {
55340 SDLoc dl(N);
55341 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
55342 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
55343
55344 // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
55345 if (IsStrict)
55346 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
55347 {N->getOperand(0), P});
55348 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
55349 }
55350
55351 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
55352 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
55353 // the optimization here.
55354 if (DAG.SignBitIsZero(Op0)) {
55355 if (IsStrict)
55356 return DAG.getNode(ISD::STRICT_SINT_TO_FP, SDLoc(N), {VT, MVT::Other},
55357 {N->getOperand(0), Op0});
55358 return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
55359 }
55360
55361 return SDValue();
55362}
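// Illustrative sketch, not part of the LLVM sources: when the sign bit of the
// input is known zero, the signed and unsigned conversions agree, which is what
// the SignBitIsZero fold above relies on (i32 -> double chosen for the sketch).
#include <cassert>
#include <cstdint>

static void checkUIntToFPAsSIntToFP(uint32_t X) {
  if (int32_t(X) >= 0)                       // sign bit known zero
    assert(double(X) == double(int32_t(X))); // UINT_TO_FP == SINT_TO_FP
}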
55363
55364static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
55365 TargetLowering::DAGCombinerInfo &DCI,
55366 const X86Subtarget &Subtarget) {
55367 // First try to optimize away the conversion entirely when it's
55368 // conditionally from a constant. Vectors only.
55369 bool IsStrict = N->isStrictFPOpcode();
55370 if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
55371 return Res;
55372
55373 // Now move on to more general possibilities.
55374 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
55375 EVT VT = N->getValueType(0);
55376 EVT InVT = Op0.getValueType();
55377
55378 // SINT_TO_FP(vXi1~15) -> SINT_TO_FP(SEXT(vXi1~15 to vXi16))
55379 // SINT_TO_FP(vXi17~31) -> SINT_TO_FP(SEXT(vXi17~31 to vXi32))
55380 // SINT_TO_FP(vXi33~63) -> SINT_TO_FP(SEXT(vXi33~63 to vXi64))
55381 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
55382 unsigned ScalarSize = InVT.getScalarSizeInBits();
55383 if (ScalarSize == 16 || ScalarSize == 32 || ScalarSize >= 64)
55384 return SDValue();
55385 SDLoc dl(N);
55386 EVT DstVT = EVT::getVectorVT(*DAG.getContext(),
55387 ScalarSize < 16 ? MVT::i16
55388 : ScalarSize < 32 ? MVT::i32
55389 : MVT::i64,
55390 InVT.getVectorNumElements());
55391 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
55392 if (IsStrict)
55393 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
55394 {N->getOperand(0), P});
55395 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
55396 }
55397
55398 // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
55399 // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
55400 // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
55401 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
55402 VT.getScalarType() != MVT::f16) {
55403 SDLoc dl(N);
55404 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
55405 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
55406 if (IsStrict)
55407 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
55408 {N->getOperand(0), P});
55409 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
55410 }
55411
55412 // Without AVX512DQ we only support i64 to float scalar conversion. For both
55413 // vectors and scalars, see if we know that the upper bits are all the sign
55414 // bit, in which case we can truncate the input to i32 and convert from that.
55415 if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
55416 unsigned BitWidth = InVT.getScalarSizeInBits();
55417 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
55418 if (NumSignBits >= (BitWidth - 31)) {
55419 EVT TruncVT = MVT::i32;
55420 if (InVT.isVector())
55421 TruncVT = InVT.changeVectorElementType(TruncVT);
55422 SDLoc dl(N);
55423 if (DCI.isBeforeLegalize() || TruncVT != MVT::v2i32) {
55424 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
55425 if (IsStrict)
55426 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
55427 {N->getOperand(0), Trunc});
55428 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
55429 }
55430 // If we're after legalize and the type is v2i32 we need to shuffle and
55431 // use CVTSI2P.
55432 assert(InVT == MVT::v2i64 && "Unexpected VT!");
55433 SDValue Cast = DAG.getBitcast(MVT::v4i32, Op0);
55434 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast,
55435 { 0, 2, -1, -1 });
55436 if (IsStrict)
55437 return DAG.getNode(X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
55438 {N->getOperand(0), Shuf});
55439 return DAG.getNode(X86ISD::CVTSI2P, dl, VT, Shuf);
55440 }
55441 }
55442
55443 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
55444 // a 32-bit target where SSE doesn't support i64->FP operations.
55445 if (!Subtarget.useSoftFloat() && Subtarget.hasX87() &&
55446 Op0.getOpcode() == ISD::LOAD) {
55447 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
55448
55449 // This transformation is not supported if the result type is f16 or f128.
55450 if (VT == MVT::f16 || VT == MVT::f128)
55451 return SDValue();
55452
55453 // If we have AVX512DQ we can use packed conversion instructions unless
55454 // the VT is f80.
55455 if (Subtarget.hasDQI() && VT != MVT::f80)
55456 return SDValue();
55457
55458 if (Ld->isSimple() && !VT.isVector() && ISD::isNormalLoad(Op0.getNode()) &&
55459 Op0.hasOneUse() && !Subtarget.is64Bit() && InVT == MVT::i64) {
55460 std::pair<SDValue, SDValue> Tmp =
55461 Subtarget.getTargetLowering()->BuildFILD(
55462 VT, InVT, SDLoc(N), Ld->getChain(), Ld->getBasePtr(),
55463 Ld->getPointerInfo(), Ld->getOriginalAlign(), DAG);
55464 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Tmp.second);
55465 return Tmp.first;
55466 }
55467 }
55468
55469 if (IsStrict)
55470 return SDValue();
55471
55472 if (SDValue V = combineToFPTruncExtElt(N, DAG))
55473 return V;
55474
55475 return SDValue();
55476}
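// Illustrative sketch, not part of the LLVM sources: if an i64 input has at
// least 33 sign bits it is exactly representable as i32, so converting the
// truncated value yields the same FP result, as in the !hasDQI() path above.
#include <cassert>
#include <cstdint>

static void checkNarrowedSIntToFP(int64_t X) {
  if (X >= INT32_MIN && X <= INT32_MAX)      // upper bits are all sign bits
    assert(double(X) == double(int32_t(X))); // convert from the truncated i32
}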
55477
55478static bool needCarryOrOverflowFlag(SDValue Flags) {
55479 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
55480
55481 for (const SDNode *User : Flags->uses()) {
55482 X86::CondCode CC;
55483 switch (User->getOpcode()) {
55484 default:
55485 // Be conservative.
55486 return true;
55487 case X86ISD::SETCC:
55488 case X86ISD::SETCC_CARRY:
55489 CC = (X86::CondCode)User->getConstantOperandVal(0);
55490 break;
55491 case X86ISD::BRCOND:
55492 case X86ISD::CMOV:
55493 CC = (X86::CondCode)User->getConstantOperandVal(2);
55494 break;
55495 }
55496
55497 switch (CC) {
55498 default: break;
55499 case X86::COND_A: case X86::COND_AE:
55500 case X86::COND_B: case X86::COND_BE:
55501 case X86::COND_O: case X86::COND_NO:
55502 case X86::COND_G: case X86::COND_GE:
55503 case X86::COND_L: case X86::COND_LE:
55504 return true;
55505 }
55506 }
55507
55508 return false;
55509}
55510
55511static bool onlyZeroFlagUsed(SDValue Flags) {
55512 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
55513
55514 for (const SDNode *User : Flags->uses()) {
55515 unsigned CCOpNo;
55516 switch (User->getOpcode()) {
55517 default:
55518 // Be conservative.
55519 return false;
55520 case X86ISD::SETCC:
55521 case X86ISD::SETCC_CARRY:
55522 CCOpNo = 0;
55523 break;
55524 case X86ISD::BRCOND:
55525 case X86ISD::CMOV:
55526 CCOpNo = 2;
55527 break;
55528 }
55529
55530 X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
55531 if (CC != X86::COND_E && CC != X86::COND_NE)
55532 return false;
55533 }
55534
55535 return true;
55536}
55537
55538static SDValue combineCMP(SDNode *N, SelectionDAG &DAG) {
55539 // Only handle test patterns.
55540 if (!isNullConstant(N->getOperand(1)))
55541 return SDValue();
55542
55543 // If we have a CMP of a truncated binop, see if we can make a smaller binop
55544 // and use its flags directly.
55545 // TODO: Maybe we should try promoting compares that only use the zero flag
55546 // first if we can prove the upper bits with computeKnownBits?
55547 SDLoc dl(N);
55548 SDValue Op = N->getOperand(0);
55549 EVT VT = Op.getValueType();
55550
55551 // If we have a constant logical shift that's only used in a comparison
55552 // against zero, turn it into an equivalent AND. This allows turning it into
55553 // a TEST instruction later.
55554 if ((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) &&
55555 Op.hasOneUse() && isa<ConstantSDNode>(Op.getOperand(1)) &&
55556 onlyZeroFlagUsed(SDValue(N, 0))) {
55557 unsigned BitWidth = VT.getSizeInBits();
55558 const APInt &ShAmt = Op.getConstantOperandAPInt(1);
55559 if (ShAmt.ult(BitWidth)) { // Avoid undefined shifts.
55560 unsigned MaskBits = BitWidth - ShAmt.getZExtValue();
55561 APInt Mask = Op.getOpcode() == ISD::SRL
55562 ? APInt::getHighBitsSet(BitWidth, MaskBits)
55563 : APInt::getLowBitsSet(BitWidth, MaskBits);
55564 if (Mask.isSignedIntN(32)) {
55565 Op = DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0),
55566 DAG.getConstant(Mask, dl, VT));
55567 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
55568 DAG.getConstant(0, dl, VT));
55569 }
55570 }
55571 }
55572
55573 // Peek through any zero-extend if we're only testing for a zero result.
55574 if (Op.getOpcode() == ISD::ZERO_EXTEND && onlyZeroFlagUsed(SDValue(N, 0))) {
55575 SDValue Src = Op.getOperand(0);
55576 EVT SrcVT = Src.getValueType();
55577 if (SrcVT.getScalarSizeInBits() >= 8 &&
55578 DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
55579 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Src,
55580 DAG.getConstant(0, dl, SrcVT));
55581 }
55582
55583 // Look for a truncate.
55584 if (Op.getOpcode() != ISD::TRUNCATE)
55585 return SDValue();
55586
55587 SDValue Trunc = Op;
55588 Op = Op.getOperand(0);
55589
55590 // See if we can compare with zero against the truncation source,
55591 // which should help using the Z flag from many ops. Only do this for
55592 // i32 truncated op to prevent partial-reg compares of promoted ops.
55593 EVT OpVT = Op.getValueType();
55594 APInt UpperBits =
55595 APInt::getBitsSetFrom(OpVT.getSizeInBits(), VT.getSizeInBits());
55596 if (OpVT == MVT::i32 && DAG.MaskedValueIsZero(Op, UpperBits) &&
55597 onlyZeroFlagUsed(SDValue(N, 0))) {
55598 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
55599 DAG.getConstant(0, dl, OpVT));
55600 }
55601
55602 // After this the truncate and arithmetic op must have a single use.
55603 if (!Trunc.hasOneUse() || !Op.hasOneUse())
55604 return SDValue();
55605
55606 unsigned NewOpc;
55607 switch (Op.getOpcode()) {
55608 default: return SDValue();
55609 case ISD::AND:
55610 // Skip AND with a constant. We have special handling for AND with an
55611 // immediate during isel to generate TEST instructions.
55612 if (isa<ConstantSDNode>(Op.getOperand(1)))
55613 return SDValue();
55614 NewOpc = X86ISD::AND;
55615 break;
55616 case ISD::OR: NewOpc = X86ISD::OR; break;
55617 case ISD::XOR: NewOpc = X86ISD::XOR; break;
55618 case ISD::ADD:
55619 // If the carry or overflow flag is used, we can't truncate.
55620 if (needCarryOrOverflowFlag(SDValue(N, 0)))
55621 return SDValue();
55622 NewOpc = X86ISD::ADD;
55623 break;
55624 case ISD::SUB:
55625 // If the carry or overflow flag is used, we can't truncate.
55626 if (needCarryOrOverflowFlag(SDValue(N, 0)))
55627 return SDValue();
55628 NewOpc = X86ISD::SUB;
55629 break;
55630 }
55631
55632 // We found an op we can narrow. Truncate its inputs.
55633 SDValue Op0 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(0));
55634 SDValue Op1 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(1));
55635
55636 // Use a X86 specific opcode to avoid DAG combine messing with it.
55637 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
55638 Op = DAG.getNode(NewOpc, dl, VTs, Op0, Op1);
55639
55640 // For AND, keep a CMP so that we can match the test pattern.
55641 if (NewOpc == X86ISD::AND)
55642 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
55643 DAG.getConstant(0, dl, VT));
55644
55645 // Return the flags.
55646 return Op.getValue(1);
55647}
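// Illustrative sketch, not part of the LLVM sources: with only the zero flag
// observed, a constant logical shift compared against zero is equivalent to
// masking the bits that survive the shift, which is the AND/TEST rewrite above
// (the real combine additionally requires the mask to fit a signed 32-bit
// immediate). 64-bit sketch with ShAmt < 64:
#include <cassert>
#include <cstdint>

static void checkShiftVsMask(uint64_t X, unsigned ShAmt) {
  assert(ShAmt < 64);
  uint64_t HighMask = ~uint64_t(0) << ShAmt; // bits kept by SRL
  uint64_t LowMask = ~uint64_t(0) >> ShAmt;  // bits kept by SHL
  assert(((X >> ShAmt) == 0) == ((X & HighMask) == 0));
  assert(((X << ShAmt) == 0) == ((X & LowMask) == 0));
}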
55648
55649static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG,
55650 TargetLowering::DAGCombinerInfo &DCI) {
55651 assert((X86ISD::ADD == N->getOpcode() || X86ISD::SUB == N->getOpcode()) &&
55652 "Expected X86ISD::ADD or X86ISD::SUB");
55653
55654 SDLoc DL(N);
55655 SDValue LHS = N->getOperand(0);
55656 SDValue RHS = N->getOperand(1);
55657 MVT VT = LHS.getSimpleValueType();
55658 bool IsSub = X86ISD::SUB == N->getOpcode();
55659 unsigned GenericOpc = IsSub ? ISD::SUB : ISD::ADD;
55660
55661 // If we don't use the flag result, simplify back to a generic ADD/SUB.
55662 if (!N->hasAnyUseOfValue(1)) {
55663 SDValue Res = DAG.getNode(GenericOpc, DL, VT, LHS, RHS);
55664 return DAG.getMergeValues({Res, DAG.getConstant(0, DL, MVT::i32)}, DL);
55665 }
55666
55667 // Fold any similar generic ADD/SUB opcodes to reuse this node.
55668 auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) {
55669 SDValue Ops[] = {N0, N1};
55670 SDVTList VTs = DAG.getVTList(N->getValueType(0));
55671 if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) {
55672 SDValue Op(N, 0);
55673 if (Negate)
55674 Op = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Op);
55675 DCI.CombineTo(GenericAddSub, Op);
55676 }
55677 };
55678 MatchGeneric(LHS, RHS, false);
55679 MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode());
55680
55681 // TODO: Can we drop the ZeroSecondOpOnly limit? This is to guarantee that the
55682 // EFLAGS result doesn't change.
55683 return combineAddOrSubToADCOrSBB(IsSub, DL, VT, LHS, RHS, DAG,
55684 /*ZeroSecondOpOnly*/ true);
55685}
55686
55687static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
55688 SDValue LHS = N->getOperand(0);
55689 SDValue RHS = N->getOperand(1);
55690 SDValue BorrowIn = N->getOperand(2);
55691
55692 if (SDValue Flags = combineCarryThroughADD(BorrowIn, DAG)) {
55693 MVT VT = N->getSimpleValueType(0);
55694 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
55695 return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs, LHS, RHS, Flags);
55696 }
55697
55698 // Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry)
55699 // iff the flag result is dead.
55700 if (LHS.getOpcode() == ISD::SUB && isNullConstant(RHS) &&
55701 !N->hasAnyUseOfValue(1))
55702 return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), LHS.getOperand(0),
55703 LHS.getOperand(1), BorrowIn);
55704
55705 return SDValue();
55706}
55707
55708// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
55709static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
55710 TargetLowering::DAGCombinerInfo &DCI) {
55711 SDValue LHS = N->getOperand(0);
55712 SDValue RHS = N->getOperand(1);
55713 SDValue CarryIn = N->getOperand(2);
55714 auto *LHSC = dyn_cast<ConstantSDNode>(LHS);
55715 auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
55716
55717 // Canonicalize constant to RHS.
55718 if (LHSC && !RHSC)
55719 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), RHS, LHS,
55720 CarryIn);
55721
55722 // If the LHS and RHS of the ADC node are zero, then it can't overflow and
55723 // the result is either zero or one (depending on the input carry bit).
55724 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
55725 if (LHSC && RHSC && LHSC->isZero() && RHSC->isZero() &&
55726 // We don't have a good way to replace an EFLAGS use, so only do this when
55727 // dead right now.
55728 SDValue(N, 1).use_empty()) {
55729 SDLoc DL(N);
55730 EVT VT = N->getValueType(0);
55731 SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
55732 SDValue Res1 = DAG.getNode(
55733 ISD::AND, DL, VT,
55734 DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
55735 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), CarryIn),
55736 DAG.getConstant(1, DL, VT));
55737 return DCI.CombineTo(N, Res1, CarryOut);
55738 }
55739
55740 // Fold ADC(C1,C2,Carry) -> ADC(0,C1+C2,Carry)
55741 // iff the flag result is dead.
55742 // TODO: Allow flag result if C1+C2 doesn't signed/unsigned overflow.
55743 if (LHSC && RHSC && !LHSC->isZero() && !N->hasAnyUseOfValue(1)) {
55744 SDLoc DL(N);
55745 APInt Sum = LHSC->getAPIntValue() + RHSC->getAPIntValue();
55746 return DAG.getNode(X86ISD::ADC, DL, N->getVTList(),
55747 DAG.getConstant(0, DL, LHS.getValueType()),
55748 DAG.getConstant(Sum, DL, LHS.getValueType()), CarryIn);
55749 }
55750
55751 if (SDValue Flags = combineCarryThroughADD(CarryIn, DAG)) {
55752 MVT VT = N->getSimpleValueType(0);
55753 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
55754 return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs, LHS, RHS, Flags);
55755 }
55756
55757 // Fold ADC(ADD(X,Y),0,Carry) -> ADC(X,Y,Carry)
55758 // iff the flag result is dead.
55759 if (LHS.getOpcode() == ISD::ADD && RHSC && RHSC->isZero() &&
55760 !N->hasAnyUseOfValue(1))
55761 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), LHS.getOperand(0),
55762 LHS.getOperand(1), CarryIn);
55763
55764 return SDValue();
55765}
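// Illustrative sketch, not part of the LLVM sources: the value part of
// ADC(C1, C2, Carry) -> ADC(0, C1+C2, Carry) is plain reassociation, which is
// why the fold above is only done when the flag result is dead.
#include <cassert>
#include <cstdint>

static void checkADCConstantFold(uint64_t C1, uint64_t C2, bool Carry) {
  uint64_t Before = C1 + C2 + (Carry ? 1 : 0);      // ADC(C1, C2, Carry)
  uint64_t After = 0 + (C1 + C2) + (Carry ? 1 : 0); // ADC(0, C1+C2, Carry)
  assert(Before == After);
}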
55766
55767static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
55768 const SDLoc &DL, EVT VT,
55769 const X86Subtarget &Subtarget) {
55770 // Example of pattern we try to detect:
55771 // t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1))))
55772 //(add (build_vector (extract_elt t, 0),
55773 // (extract_elt t, 2),
55774 // (extract_elt t, 4),
55775 // (extract_elt t, 6)),
55776 // (build_vector (extract_elt t, 1),
55777 // (extract_elt t, 3),
55778 // (extract_elt t, 5),
55779 // (extract_elt t, 7)))
55780
55781 if (!Subtarget.hasSSE2())
55782 return SDValue();
55783
55784 if (Op0.getOpcode() != ISD::BUILD_VECTOR ||
55785 Op1.getOpcode() != ISD::BUILD_VECTOR)
55786 return SDValue();
55787
55788 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
55789 VT.getVectorNumElements() < 4 ||
55790 !isPowerOf2_32(VT.getVectorNumElements()))
55791 return SDValue();
55792
55793 // Check if one of Op0,Op1 is of the form:
55794 // (build_vector (extract_elt Mul, 0),
55795 // (extract_elt Mul, 2),
55796 // (extract_elt Mul, 4),
55797 // ...
55798 // the other is of the form:
55799 // (build_vector (extract_elt Mul, 1),
55800 // (extract_elt Mul, 3),
55801 // (extract_elt Mul, 5),
55802 // ...
55803 // and identify Mul.
55804 SDValue Mul;
55805 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) {
55806 SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
55807 Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
55808 // TODO: Be more tolerant to undefs.
55809 if (Op0L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
55810 Op1L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
55811 Op0H.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
55812 Op1H.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
55813 return SDValue();
55814 auto *Const0L = dyn_cast<ConstantSDNode>(Op0L->getOperand(1));
55815 auto *Const1L = dyn_cast<ConstantSDNode>(Op1L->getOperand(1));
55816 auto *Const0H = dyn_cast<ConstantSDNode>(Op0H->getOperand(1));
55817 auto *Const1H = dyn_cast<ConstantSDNode>(Op1H->getOperand(1));
55818 if (!Const0L || !Const1L || !Const0H || !Const1H)
55819 return SDValue();
55820 unsigned Idx0L = Const0L->getZExtValue(), Idx1L = Const1L->getZExtValue(),
55821 Idx0H = Const0H->getZExtValue(), Idx1H = Const1H->getZExtValue();
55822 // Commutativity of mul allows factors of a product to reorder.
55823 if (Idx0L > Idx1L)
55824 std::swap(Idx0L, Idx1L);
55825 if (Idx0H > Idx1H)
55826 std::swap(Idx0H, Idx1H);
55827 // Commutativity of add allows pairs of factors to reorder.
55828 if (Idx0L > Idx0H) {
55829 std::swap(Idx0L, Idx0H);
55830 std::swap(Idx1L, Idx1H);
55831 }
55832 if (Idx0L != 2 * i || Idx1L != 2 * i + 1 || Idx0H != 2 * i + 2 ||
55833 Idx1H != 2 * i + 3)
55834 return SDValue();
55835 if (!Mul) {
55836 // First time an extract_elt's source vector is visited. Must be a MUL
55837 // with twice the number of vector elements of the BUILD_VECTOR.
55838 // Both extracts must be from same MUL.
55839 Mul = Op0L->getOperand(0);
55840 if (Mul->getOpcode() != ISD::MUL ||
55841 Mul.getValueType().getVectorNumElements() != 2 * e)
55842 return SDValue();
55843 }
55844 // Check that the extract is from the same MUL previously seen.
55845 if (Mul != Op0L->getOperand(0) || Mul != Op1L->getOperand(0) ||
55846 Mul != Op0H->getOperand(0) || Mul != Op1H->getOperand(0))
55847 return SDValue();
55848 }
55849
55850 // Check if the Mul source can be safely shrunk.
55851 ShrinkMode Mode;
55852 if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) ||
55853 Mode == ShrinkMode::MULU16)
55854 return SDValue();
55855
55856 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
55857 VT.getVectorNumElements() * 2);
55858 SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(0));
55859 SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(1));
55860
55861 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
55862 ArrayRef<SDValue> Ops) {
55863 EVT InVT = Ops[0].getValueType();
55864 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
55865 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
55866 InVT.getVectorNumElements() / 2);
55867 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
55868 };
55869 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { N0, N1 }, PMADDBuilder);
55870}
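// Illustrative sketch, not part of the LLVM sources: per i32 lane the matched
// pattern computes x[2i]*y[2i] + x[2i+1]*y[2i+1] on sign-extended i16 inputs,
// which is PMADDWD's per-lane semantics (the 64-bit accumulator below sidesteps
// the instruction's single 0x8000*0x8000 wrap-around corner case).
#include <cassert>
#include <cstdint>

static int64_t pmaddwdLane(int16_t X0, int16_t X1, int16_t Y0, int16_t Y1) {
  return int64_t(X0) * Y0 + int64_t(X1) * Y1; // add adjacent signed products
}

static void checkMatchedPattern(int16_t X0, int16_t X1, int16_t Y0, int16_t Y1) {
  int64_t Even = int64_t(X0) * Y0; // extract_elt(mul(sext x, sext y), 2*i)
  int64_t Odd = int64_t(X1) * Y1;  // extract_elt(mul(sext x, sext y), 2*i+1)
  assert(Even + Odd == pmaddwdLane(X0, X1, Y0, Y1));
}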
55871
55872// Attempt to turn this pattern into PMADDWD.
55873// (add (mul (sext (build_vector)), (sext (build_vector))),
55874// (mul (sext (build_vector)), (sext (build_vector)))
55875static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,
55876 const SDLoc &DL, EVT VT,
55877 const X86Subtarget &Subtarget) {
55878 if (!Subtarget.hasSSE2())
55879 return SDValue();
55880
55881 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
55882 return SDValue();
55883
55884 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
55885 VT.getVectorNumElements() < 4 ||
55886 !isPowerOf2_32(VT.getVectorNumElements()))
55887 return SDValue();
55888
55889 SDValue N00 = N0.getOperand(0);
55890 SDValue N01 = N0.getOperand(1);
55891 SDValue N10 = N1.getOperand(0);
55892 SDValue N11 = N1.getOperand(1);
55893
55894 // All inputs need to be sign extends.
55895 // TODO: Support ZERO_EXTEND from known positive?
55896 if (N00.getOpcode() != ISD::SIGN_EXTEND ||
55897 N01.getOpcode() != ISD::SIGN_EXTEND ||
55898 N10.getOpcode() != ISD::SIGN_EXTEND ||
55899 N11.getOpcode() != ISD::SIGN_EXTEND)
55900 return SDValue();
55901
55902 // Peek through the extends.
55903 N00 = N00.getOperand(0);
55904 N01 = N01.getOperand(0);
55905 N10 = N10.getOperand(0);
55906 N11 = N11.getOperand(0);
55907
55908 // Must be extending from vXi16.
55909 EVT InVT = N00.getValueType();
55910 if (InVT.getVectorElementType() != MVT::i16 || N01.getValueType() != InVT ||
55911 N10.getValueType() != InVT || N11.getValueType() != InVT)
55912 return SDValue();
55913
55914 // All inputs should be build_vectors.
55915 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
55916 N01.getOpcode() != ISD::BUILD_VECTOR ||
55917 N10.getOpcode() != ISD::BUILD_VECTOR ||
55918 N11.getOpcode() != ISD::BUILD_VECTOR)
55919 return SDValue();
55920
55921 // For each element, we need to ensure we have an odd element from one vector
55922 // multiplied by the odd element of another vector and the even element from
55923 // one of the same vectors being multiplied by the even element from the
55924 // other vector. So we need to make sure that for each element i, this
55925 // operation is performed:
55926 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
55927 SDValue In0, In1;
55928 for (unsigned i = 0; i != N00.getNumOperands(); ++i) {
55929 SDValue N00Elt = N00.getOperand(i);
55930 SDValue N01Elt = N01.getOperand(i);
55931 SDValue N10Elt = N10.getOperand(i);
55932 SDValue N11Elt = N11.getOperand(i);
55933 // TODO: Be more tolerant to undefs.
55934 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
55935 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
55936 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
55937 N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
55938 return SDValue();
55939 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
55940 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
55941 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
55942 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
55943 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
55944 return SDValue();
55945 unsigned IdxN00 = ConstN00Elt->getZExtValue();
55946 unsigned IdxN01 = ConstN01Elt->getZExtValue();
55947 unsigned IdxN10 = ConstN10Elt->getZExtValue();
55948 unsigned IdxN11 = ConstN11Elt->getZExtValue();
55949 // Add is commutative so indices can be reordered.
55950 if (IdxN00 > IdxN10) {
55951 std::swap(IdxN00, IdxN10);
55952 std::swap(IdxN01, IdxN11);
55953 }
55954 // N0 indices must be the even element. N1 indices must be the next odd element.
55955 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
55956 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
55957 return SDValue();
55958 SDValue N00In = N00Elt.getOperand(0);
55959 SDValue N01In = N01Elt.getOperand(0);
55960 SDValue N10In = N10Elt.getOperand(0);
55961 SDValue N11In = N11Elt.getOperand(0);
55962
55963 // First time we find an input capture it.
55964 if (!In0) {
55965 In0 = N00In;
55966 In1 = N01In;
55967
55968 // The input vectors must be at least as wide as the output.
55969 // If they are larger than the output, we extract a subvector below.
55970 if (In0.getValueSizeInBits() < VT.getSizeInBits() ||
55971 In1.getValueSizeInBits() < VT.getSizeInBits())
55972 return SDValue();
55973 }
55974 // Mul is commutative so the input vectors can be in any order.
55975 // Canonicalize to make the compares easier.
55976 if (In0 != N00In)
55977 std::swap(N00In, N01In);
55978 if (In0 != N10In)
55979 std::swap(N10In, N11In);
55980 if (In0 != N00In || In1 != N01In || In0 != N10In || In1 != N11In)
55981 return SDValue();
55982 }
55983
55984 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
55985 ArrayRef<SDValue> Ops) {
55986 EVT OpVT = Ops[0].getValueType();
55987 assert(OpVT.getScalarType() == MVT::i16 &&
55988 "Unexpected scalar element type");
55989 assert(OpVT == Ops[1].getValueType() && "Operands' types mismatch");
55990 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
55991 OpVT.getVectorNumElements() / 2);
55992 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
55993 };
55994
55995 // If the output is narrower than an input, extract the low part of the input
55996 // vector.
55997 EVT OutVT16 = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
55998 VT.getVectorNumElements() * 2);
55999 if (OutVT16.bitsLT(In0.getValueType())) {
56000 In0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In0,
56001 DAG.getIntPtrConstant(0, DL));
56002 }
56003 if (OutVT16.bitsLT(In1.getValueType())) {
56004 In1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In1,
56005 DAG.getIntPtrConstant(0, DL));
56006 }
56007 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },
56008 PMADDBuilder);
56009}
56010
56011// ADD(VPMADDWD(X,Y),VPMADDWD(Z,W)) -> VPMADDWD(SHUFFLE(X,Z), SHUFFLE(Y,W))
56012 // If the upper element in each pair of both VPMADDWDs is zero then we can
56013 // merge the operand elements and use the implicit add of VPMADDWD.
56014// TODO: Add support for VPMADDUBSW (which isn't commutable).
56015static SDValue combineAddOfPMADDWD(SelectionDAG &DAG, SDValue N0, SDValue N1,
56016 const SDLoc &DL, EVT VT) {
56017 if (N0.getOpcode() != N1.getOpcode() || N0.getOpcode() != X86ISD::VPMADDWD)
56018 return SDValue();
56019
56020 // TODO: Add 256/512-bit support once VPMADDWD combines with shuffles.
56021 if (VT.getSizeInBits() > 128)
56022 return SDValue();
56023
56024 unsigned NumElts = VT.getVectorNumElements();
56025 MVT OpVT = N0.getOperand(0).getSimpleValueType();
56026 APInt DemandedBits = APInt::getAllOnes(OpVT.getScalarSizeInBits());
56027 APInt DemandedHiElts = APInt::getSplat(2 * NumElts, APInt(2, 2));
56028
56029 bool Op0HiZero =
56030 DAG.MaskedValueIsZero(N0.getOperand(0), DemandedBits, DemandedHiElts) ||
56031 DAG.MaskedValueIsZero(N0.getOperand(1), DemandedBits, DemandedHiElts);
56032 bool Op1HiZero =
56033 DAG.MaskedValueIsZero(N1.getOperand(0), DemandedBits, DemandedHiElts) ||
56034 DAG.MaskedValueIsZero(N1.getOperand(1), DemandedBits, DemandedHiElts);
56035
56036 // TODO: Check for zero lower elements once we have actual codegen that
56037 // creates them.
56038 if (!Op0HiZero || !Op1HiZero)
56039 return SDValue();
56040
56041 // Create a shuffle mask packing the lower elements from each VPMADDWD.
56042 SmallVector<int> Mask;
56043 for (int i = 0; i != (int)NumElts; ++i) {
56044 Mask.push_back(2 * i);
56045 Mask.push_back(2 * (i + NumElts));
56046 }
56047
56048 SDValue LHS =
56049 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(0), N1.getOperand(0), Mask);
56050 SDValue RHS =
56051 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(1), N1.getOperand(1), Mask);
56052 return DAG.getNode(X86ISD::VPMADDWD, DL, VT, LHS, RHS);
56053}
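// Illustrative sketch, not part of the LLVM sources: when the odd element of
// every operand pair is zero, each VPMADDWD lane is a single product, so two
// lanes can share one VPMADDWD after the interleaving shuffle built above.
#include <cassert>
#include <cstdint>

static void checkMergedPMADDWDLane(int16_t X, int16_t Y, int16_t Z, int16_t W) {
  int64_t Lane0 = int64_t(X) * Y + int64_t(0) * 0;  // pmaddwd(x, y), odd elt zero
  int64_t Lane1 = int64_t(Z) * W + int64_t(0) * 0;  // pmaddwd(z, w), odd elt zero
  int64_t Merged = int64_t(X) * Y + int64_t(Z) * W; // pmaddwd(shuf(x,z), shuf(y,w))
  assert(Lane0 + Lane1 == Merged);
}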
56054
56055/// CMOV of constants requires materializing constant operands in registers.
56056/// Try to fold those constants into an 'add' instruction to reduce instruction
56057 /// count. We do this with CMOV rather than the generic 'select' because there are
56058/// earlier folds that may be used to turn select-of-constants into logic hacks.
56059static SDValue pushAddIntoCmovOfConsts(SDNode *N, SelectionDAG &DAG,
56060 const X86Subtarget &Subtarget) {
56061 // If an operand is zero, add-of-0 gets simplified away, so that's clearly
56062 // better because we eliminate 1-2 instructions. This transform is still
56063 // an improvement without zero operands because we trade 2 move constants and
56064 // 1 add for 2 adds (LEA) as long as the constants can be represented as
56065 // immediate asm operands (fit in 32-bits).
56066 auto isSuitableCmov = [](SDValue V) {
56067 if (V.getOpcode() != X86ISD::CMOV || !V.hasOneUse())
56068 return false;
56069 if (!isa<ConstantSDNode>(V.getOperand(0)) ||
56070 !isa<ConstantSDNode>(V.getOperand(1)))
56071 return false;
56072 return isNullConstant(V.getOperand(0)) || isNullConstant(V.getOperand(1)) ||
56073 (V.getConstantOperandAPInt(0).isSignedIntN(32) &&
56074 V.getConstantOperandAPInt(1).isSignedIntN(32));
56075 };
56076
56077 // Match an appropriate CMOV as the first operand of the add.
56078 SDValue Cmov = N->getOperand(0);
56079 SDValue OtherOp = N->getOperand(1);
56080 if (!isSuitableCmov(Cmov))
56081 std::swap(Cmov, OtherOp);
56082 if (!isSuitableCmov(Cmov))
56083 return SDValue();
56084
56085 // Don't remove a load folding opportunity for the add. That would neutralize
56086 // any improvements from removing constant materializations.
56087 if (X86::mayFoldLoad(OtherOp, Subtarget))
56088 return SDValue();
56089
56090 EVT VT = N->getValueType(0);
56091 SDLoc DL(N);
56092 SDValue FalseOp = Cmov.getOperand(0);
56093 SDValue TrueOp = Cmov.getOperand(1);
56094
56095 // We will push the add through the select, but we can potentially do better
56096 // if we know there is another add in the sequence and this is pointer math.
56097 // In that case, we can absorb an add into the trailing memory op and avoid
56098 // a 3-operand LEA which is likely slower than a 2-operand LEA.
56099 // TODO: If target has "slow3OpsLEA", do this even without the trailing memop?
56100 if (OtherOp.getOpcode() == ISD::ADD && OtherOp.hasOneUse() &&
56101 !isa<ConstantSDNode>(OtherOp.getOperand(0)) &&
56102 all_of(N->uses(), [&](SDNode *Use) {
56103 auto *MemNode = dyn_cast<MemSDNode>(Use);
56104 return MemNode && MemNode->getBasePtr().getNode() == N;
56105 })) {
56106 // add (cmov C1, C2), add (X, Y) --> add (cmov (add X, C1), (add X, C2)), Y
56107 // TODO: We are arbitrarily choosing op0 as the 1st piece of the sum, but
56108 // it is possible that choosing op1 might be better.
56109 SDValue X = OtherOp.getOperand(0), Y = OtherOp.getOperand(1);
56110 FalseOp = DAG.getNode(ISD::ADD, DL, VT, X, FalseOp);
56111 TrueOp = DAG.getNode(ISD::ADD, DL, VT, X, TrueOp);
56112 Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp,
56113 Cmov.getOperand(2), Cmov.getOperand(3));
56114 return DAG.getNode(ISD::ADD, DL, VT, Cmov, Y);
56115 }
56116
56117 // add (cmov C1, C2), OtherOp --> cmov (add OtherOp, C1), (add OtherOp, C2)
56118 FalseOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, FalseOp);
56119 TrueOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, TrueOp);
56120 return DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp, Cmov.getOperand(2),
56121 Cmov.getOperand(3));
56122}
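// Illustrative sketch, not part of the LLVM sources: pushing the add through the
// CMOV is just distributing '+' over a select, so the constants can later fold
// into LEA/add immediates (wrapping 64-bit arithmetic).
#include <cassert>
#include <cstdint>

static void checkAddThroughCmov(bool Cond, uint64_t C1, uint64_t C2, uint64_t X) {
  uint64_t Before = (Cond ? C1 : C2) + X;      // add(cmov(C1, C2), X)
  uint64_t After = Cond ? (X + C1) : (X + C2); // cmov(add(X, C1), add(X, C2))
  assert(Before == After);
}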
56123
56124static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
56125 TargetLowering::DAGCombinerInfo &DCI,
56126 const X86Subtarget &Subtarget) {
56127 EVT VT = N->getValueType(0);
56128 SDValue Op0 = N->getOperand(0);
56129 SDValue Op1 = N->getOperand(1);
56130 SDLoc DL(N);
56131
56132 if (SDValue Select = pushAddIntoCmovOfConsts(N, DAG, Subtarget))
56133 return Select;
56134
56135 if (SDValue MAdd = matchPMADDWD(DAG, Op0, Op1, DL, VT, Subtarget))
56136 return MAdd;
56137 if (SDValue MAdd = matchPMADDWD_2(DAG, Op0, Op1, DL, VT, Subtarget))
56138 return MAdd;
56139 if (SDValue MAdd = combineAddOfPMADDWD(DAG, Op0, Op1, DL, VT))
56140 return MAdd;
56141
56142 // Try to synthesize horizontal adds from adds of shuffles.
56143 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
56144 return V;
56145
56146 // If vectors of i1 are legal, turn (add (zext (vXi1 X)), Y) into
56147 // (sub Y, (sext (vXi1 X))).
56148 // FIXME: We have the (sub Y, (zext (vXi1 X))) -> (add (sext (vXi1 X)), Y) in
56149 // generic DAG combine without a legal type check, but adding this there
56150 // caused regressions.
56151 if (VT.isVector()) {
56152 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56153 if (Op0.getOpcode() == ISD::ZERO_EXTEND &&
56154 Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
56155 TLI.isTypeLegal(Op0.getOperand(0).getValueType())) {
56156 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op0.getOperand(0));
56157 return DAG.getNode(ISD::SUB, DL, VT, Op1, SExt);
56158 }
56159
56160 if (Op1.getOpcode() == ISD::ZERO_EXTEND &&
56161 Op1.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
56162 TLI.isTypeLegal(Op1.getOperand(0).getValueType())) {
56163 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op1.getOperand(0));
56164 return DAG.getNode(ISD::SUB, DL, VT, Op0, SExt);
56165 }
56166 }
56167
56168 // Fold ADD(ADC(Y,0,W),X) -> ADC(X,Y,W)
56169 if (Op0.getOpcode() == X86ISD::ADC && Op0->hasOneUse() &&
56170 X86::isZeroNode(Op0.getOperand(1))) {
56171 assert(!Op0->hasAnyUseOfValue(1) && "Overflow bit in use");
56172 return DAG.getNode(X86ISD::ADC, SDLoc(Op0), Op0->getVTList(), Op1,
56173 Op0.getOperand(0), Op0.getOperand(2));
56174 }
56175
56176 return combineAddOrSubToADCOrSBB(N, DAG);
56177}
56178
56179// Try to fold (sub Y, cmovns X, -X) -> (add Y, cmovns -X, X) if the cmov
56180// condition comes from the subtract node that produced -X. This matches the
56181// cmov expansion for absolute value. By swapping the operands we convert abs
56182// to nabs.
56183static SDValue combineSubABS(SDNode *N, SelectionDAG &DAG) {
56184 SDValue N0 = N->getOperand(0);
56185 SDValue N1 = N->getOperand(1);
56186
56187 if (N1.getOpcode() != X86ISD::CMOV || !N1.hasOneUse())
56188 return SDValue();
56189
56190 X86::CondCode CC = (X86::CondCode)N1.getConstantOperandVal(2);
56191 if (CC != X86::COND_S && CC != X86::COND_NS)
56192 return SDValue();
56193
56194 // Condition should come from a negate operation.
56195 SDValue Cond = N1.getOperand(3);
56196 if (Cond.getOpcode() != X86ISD::SUB || !isNullConstant(Cond.getOperand(0)))
56197 return SDValue();
56198 assert(Cond.getResNo() == 1 && "Unexpected result number");
56199
56200 // Get the X and -X from the negate.
56201 SDValue NegX = Cond.getValue(0);
56202 SDValue X = Cond.getOperand(1);
56203
56204 SDValue FalseOp = N1.getOperand(0);
56205 SDValue TrueOp = N1.getOperand(1);
56206
56207 // Cmov operands should be X and NegX. Order doesn't matter.
56208 if (!(TrueOp == X && FalseOp == NegX) && !(TrueOp == NegX && FalseOp == X))
56209 return SDValue();
56210
56211 // Build a new CMOV with the operands swapped.
56212 SDLoc DL(N);
56213 MVT VT = N->getSimpleValueType(0);
56214 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, TrueOp, FalseOp,
56215 N1.getOperand(2), Cond);
56216 // Convert sub to add.
56217 return DAG.getNode(ISD::ADD, DL, VT, N0, Cmov);
56218}
56219
56220static SDValue combineSubSetcc(SDNode *N, SelectionDAG &DAG) {
56221 SDValue Op0 = N->getOperand(0);
56222 SDValue Op1 = N->getOperand(1);
56223
56224 // (sub C (zero_extend (setcc)))
56225 // =>
56226 // (add (zero_extend (setcc inverted)) C-1) if C is a nonzero immediate
56227 // Don't disturb (sub 0 setcc), which is easily done with neg.
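// E.g. (sub 5, (zext (setcc E))) -> (add (zext (setcc NE)), 4): both give 4
// when the original setcc is true and 5 when it is false.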
56228 EVT VT = N->getValueType(0);
56229 auto *Op0C = dyn_cast<ConstantSDNode>(Op0);
56230 if (Op1.getOpcode() == ISD::ZERO_EXTEND && Op1.hasOneUse() && Op0C &&
56231 !Op0C->isZero() && Op1.getOperand(0).getOpcode() == X86ISD::SETCC &&
56232 Op1.getOperand(0).hasOneUse()) {
56233 SDValue SetCC = Op1.getOperand(0);
56234 X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
56235 X86::CondCode NewCC = X86::GetOppositeBranchCondition(CC);
56236 uint64_t NewImm = Op0C->getZExtValue() - 1;
56237 SDLoc DL(Op1);
56238 SDValue NewSetCC = getSETCC(NewCC, SetCC.getOperand(1), DL, DAG);
56239 NewSetCC = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, NewSetCC);
56240 return DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(VT, VT), NewSetCC,
56241 DAG.getConstant(NewImm, DL, VT));
56242 }
56243
56244 return SDValue();
56245}
56246
56247static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
56248 TargetLowering::DAGCombinerInfo &DCI,
56249 const X86Subtarget &Subtarget) {
56250 SDValue Op0 = N->getOperand(0);
56251 SDValue Op1 = N->getOperand(1);
56252
56253 // TODO: Add NoOpaque handling to isConstantIntBuildVectorOrConstantInt.
56254 auto IsNonOpaqueConstant = [&](SDValue Op) {
56255 if (SDNode *C = DAG.isConstantIntBuildVectorOrConstantInt(Op)) {
56256 if (auto *Cst = dyn_cast<ConstantSDNode>(C))
56257 return !Cst->isOpaque();
56258 return true;
56259 }
56260 return false;
56261 };
56262
56263 // X86 can't encode an immediate LHS of a sub. See if we can push the
56264 // negation into a preceding instruction. If the RHS of the sub is a XOR with
56265 // one use and a constant, invert the immediate, saving one register.
56266 // sub(C1, xor(X, C2)) -> add(xor(X, ~C2), C1+1)
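// This holds because xor(X, ~C2) == ~xor(X, C2) == -xor(X, C2) - 1, so adding
// C1 + 1 yields C1 - xor(X, C2).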
56267 if (Op1.getOpcode() == ISD::XOR && IsNonOpaqueConstant(Op0) &&
56268 IsNonOpaqueConstant(Op1.getOperand(1)) && Op1->hasOneUse()) {
56269 SDLoc DL(N);
56270 EVT VT = Op0.getValueType();
56271 SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT, Op1.getOperand(0),
56272 DAG.getNOT(SDLoc(Op1), Op1.getOperand(1), VT));
56273 SDValue NewAdd =
56274 DAG.getNode(ISD::ADD, DL, VT, Op0, DAG.getConstant(1, DL, VT));
56275 return DAG.getNode(ISD::ADD, DL, VT, NewXor, NewAdd);
56276 }
56277
56278 if (SDValue V = combineSubABS(N, DAG))
56279 return V;
56280
56281 // Try to synthesize horizontal subs from subs of shuffles.
56282 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
56283 return V;
56284
56285 // Fold SUB(X,ADC(Y,0,W)) -> SBB(X,Y,W)
56286 if (Op1.getOpcode() == X86ISD::ADC && Op1->hasOneUse() &&
56287 X86::isZeroNode(Op1.getOperand(1))) {
56288 assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
56289 return DAG.getNode(X86ISD::SBB, SDLoc(Op1), Op1->getVTList(), Op0,
56290 Op1.getOperand(0), Op1.getOperand(2));
56291 }
56292
56293 // Fold SUB(X,SBB(Y,Z,W)) -> SUB(ADC(X,Z,W),Y)
56294 // Don't fold to ADC(0,0,W)/SETCC_CARRY pattern which will prevent more folds.
56295 if (Op1.getOpcode() == X86ISD::SBB && Op1->hasOneUse() &&
56296 !(X86::isZeroNode(Op0) && X86::isZeroNode(Op1.getOperand(1)))) {
56297 assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
56298 SDValue ADC = DAG.getNode(X86ISD::ADC, SDLoc(Op1), Op1->getVTList(), Op0,
56299 Op1.getOperand(1), Op1.getOperand(2));
56300 return DAG.getNode(ISD::SUB, SDLoc(N), Op0.getValueType(), ADC.getValue(0),
56301 Op1.getOperand(0));
56302 }
56303
56304 if (SDValue V = combineXorSubCTLZ(N, DAG, Subtarget))
56305 return V;
56306
56307 if (SDValue V = combineAddOrSubToADCOrSBB(N, DAG))
56308 return V;
56309
56310 return combineSubSetcc(N, DAG);
56311}
56312
56313static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
56314 const X86Subtarget &Subtarget) {
56315 MVT VT = N->getSimpleValueType(0);
56316 SDLoc DL(N);
56317
56318 if (N->getOperand(0) == N->getOperand(1)) {
56319 if (N->getOpcode() == X86ISD::PCMPEQ)
56320 return DAG.getConstant(-1, DL, VT);
56321 if (N->getOpcode() == X86ISD::PCMPGT)
56322 return DAG.getConstant(0, DL, VT);
56323 }
56324
56325 return SDValue();
56326}
56327
56328/// Helper that combines an array of subvector ops as if they were the operands
56330 /// of an ISD::CONCAT_VECTORS node, but may have come from another source (e.g.
56330/// ISD::INSERT_SUBVECTOR). The ops are assumed to be of the same type.
56331static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
56332 ArrayRef<SDValue> Ops, SelectionDAG &DAG,
56333 TargetLowering::DAGCombinerInfo &DCI,
56334 const X86Subtarget &Subtarget) {
56335 assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors");
56336 unsigned EltSizeInBits = VT.getScalarSizeInBits();
56337
56338 if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
56339 return DAG.getUNDEF(VT);
56340
56341 if (llvm::all_of(Ops, [](SDValue Op) {
56342 return ISD::isBuildVectorAllZeros(Op.getNode());
56343 }))
56344 return getZeroVector(VT, Subtarget, DAG, DL);
56345
56346 SDValue Op0 = Ops[0];
56347 bool IsSplat = llvm::all_equal(Ops);
56348
56349 // Repeated subvectors.
56350 if (IsSplat &&
56351 (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
56352 // If this broadcast is inserted into both halves, use a larger broadcast.
56353 if (Op0.getOpcode() == X86ISD::VBROADCAST)
56354 return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0));
56355
56356 // If this simple subvector load or scalar/subvector broadcast_load is inserted
56357 // into both halves, use a larger broadcast_load. Update other uses to use
56358 // an extracted subvector.
56359 if (ISD::isNormalLoad(Op0.getNode()) ||
56360 Op0.getOpcode() == X86ISD::VBROADCAST_LOAD ||
56361 Op0.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
56362 auto *Mem = cast<MemSDNode>(Op0);
56363 unsigned Opc = Op0.getOpcode() == X86ISD::VBROADCAST_LOAD
56364 ? X86ISD::VBROADCAST_LOAD
56365 : X86ISD::SUBV_BROADCAST_LOAD;
56366 if (SDValue BcastLd =
56367 getBROADCAST_LOAD(Opc, DL, VT, Mem->getMemoryVT(), Mem, 0, DAG)) {
56368 SDValue BcastSrc =
56369 extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits());
56370 DAG.ReplaceAllUsesOfValueWith(Op0, BcastSrc);
56371 return BcastLd;
56372 }
56373 }
56374
56375 // concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
56376 if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
56377 (Subtarget.hasAVX2() ||
56378 X86::mayFoldLoadIntoBroadcastFromMem(Op0.getOperand(0),
56379 VT.getScalarType(), Subtarget)))
56380 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
56381 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64,
56382 Op0.getOperand(0),
56383 DAG.getIntPtrConstant(0, DL)));
56384
56385 // concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
56386 if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
56387 (Subtarget.hasAVX2() ||
56388 (EltSizeInBits >= 32 &&
56389 X86::mayFoldLoad(Op0.getOperand(0), Subtarget))) &&
56390 Op0.getOperand(0).getValueType() == VT.getScalarType())
56391 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));
56392
56393 // concat_vectors(extract_subvector(broadcast(x)),
56394 // extract_subvector(broadcast(x))) -> broadcast(x)
56395 if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
56396 Op0.getOperand(0).getValueType() == VT) {
56397 if (Op0.getOperand(0).getOpcode() == X86ISD::VBROADCAST ||
56398 Op0.getOperand(0).getOpcode() == X86ISD::VBROADCAST_LOAD)
56399 return Op0.getOperand(0);
56400 }
56401 }
56402
56403 // concat(extract_subvector(v0,c0), extract_subvector(v1,c1)) -> vperm2x128.
56404 // Only handle concats of subvector high halves, which vperm2x128 is best at.
56405 // TODO: This should go in combineX86ShufflesRecursively eventually.
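// (Immediate 0x31 below selects the high 128-bit lane of each source.)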
56406 if (VT.is256BitVector() && Ops.size() == 2) {
56407 SDValue Src0 = peekThroughBitcasts(Ops[0]);
56408 SDValue Src1 = peekThroughBitcasts(Ops[1]);
56409 if (Src0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
56410 Src1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
56411 EVT SrcVT0 = Src0.getOperand(0).getValueType();
56412 EVT SrcVT1 = Src1.getOperand(0).getValueType();
56413 unsigned NumSrcElts0 = SrcVT0.getVectorNumElements();
56414 unsigned NumSrcElts1 = SrcVT1.getVectorNumElements();
56415 if (SrcVT0.is256BitVector() && SrcVT1.is256BitVector() &&
56416 Src0.getConstantOperandAPInt(1) == (NumSrcElts0 / 2) &&
56417 Src1.getConstantOperandAPInt(1) == (NumSrcElts1 / 2)) {
56418 return DAG.getNode(X86ISD::VPERM2X128, DL, VT,
56419 DAG.getBitcast(VT, Src0.getOperand(0)),
56420 DAG.getBitcast(VT, Src1.getOperand(0)),
56421 DAG.getTargetConstant(0x31, DL, MVT::i8));
56422 }
56423 }
56424 }
56425
56426 // Repeated opcode.
56427 // TODO - combineX86ShufflesRecursively should handle shuffle concatenation
56428 // but it currently struggles with different vector widths.
56429 if (llvm::all_of(Ops, [Op0](SDValue Op) {
56430 return Op.getOpcode() == Op0.getOpcode();
56431 })) {
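// ConcatSubOperand rebuilds operand I of every subvector op as one wide
// concat; IsConcatFree checks that operand Op of each subvector op is already
// the matching extract_subvector (chunk I) of a wide vector of type VT, so
// concatenating those operands back is essentially free.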
56432 auto ConcatSubOperand = [&](EVT VT, ArrayRef<SDValue> SubOps, unsigned I) {
56433 SmallVector<SDValue> Subs;
56434 for (SDValue SubOp : SubOps)
56435 Subs.push_back(SubOp.getOperand(I));
56436 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
56437 };
56438 auto IsConcatFree = [](MVT VT, ArrayRef<SDValue> SubOps, unsigned Op) {
56439 for (unsigned I = 0, E = SubOps.size(); I != E; ++I) {
56440 SDValue Sub = SubOps[I].getOperand(Op);
56441 unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
56442 if (Sub.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
56443 Sub.getOperand(0).getValueType() != VT ||
56444 Sub.getConstantOperandAPInt(1) != (I * NumSubElts))
56445 return false;
56446 }
56447 return true;
56448 };
56449
56450 unsigned NumOps = Ops.size();
56451 switch (Op0.getOpcode()) {
56452 case X86ISD::VBROADCAST: {
56453 if (!IsSplat && llvm::all_of(Ops, [](SDValue Op) {
56454 return Op.getOperand(0).getValueType().is128BitVector();
56455 })) {
56456 if (VT == MVT::v4f64 || VT == MVT::v4i64)
56457 return DAG.getNode(X86ISD::UNPCKL, DL, VT,
56458 ConcatSubOperand(VT, Ops, 0),
56459 ConcatSubOperand(VT, Ops, 0));
56460 // TODO: Add pseudo v8i32 PSHUFD handling to AVX1Only targets.
56461 if (VT == MVT::v8f32 || (VT == MVT::v8i32 && Subtarget.hasInt256()))
56462 return DAG.getNode(VT == MVT::v8f32 ? X86ISD::VPERMILPI
56463 : X86ISD::PSHUFD,
56464 DL, VT, ConcatSubOperand(VT, Ops, 0),
56465 getV4X86ShuffleImm8ForMask({0, 0, 0, 0}, DL, DAG));
56466 }
56467 break;
56468 }
56469 case X86ISD::MOVDDUP:
56470 case X86ISD::MOVSHDUP:
56471 case X86ISD::MOVSLDUP: {
56472 if (!IsSplat)
56473 return DAG.getNode(Op0.getOpcode(), DL, VT,
56474 ConcatSubOperand(VT, Ops, 0));
56475 break;
56476 }
56477 case X86ISD::SHUFP: {
56478 // Add SHUFPD support if/when necessary.
56479 if (!IsSplat && VT.getScalarType() == MVT::f32 &&
56480 llvm::all_of(Ops, [Op0](SDValue Op) {
56481 return Op.getOperand(2) == Op0.getOperand(2);
56482 })) {
56483 return DAG.getNode(Op0.getOpcode(), DL, VT,
56484 ConcatSubOperand(VT, Ops, 0),
56485 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
56486 }
56487 break;
56488 }
56489 case X86ISD::PSHUFHW:
56490 case X86ISD::PSHUFLW:
56491 case X86ISD::PSHUFD:
56492 if (!IsSplat && NumOps == 2 && VT.is256BitVector() &&
56493 Subtarget.hasInt256() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
56494 return DAG.getNode(Op0.getOpcode(), DL, VT,
56495 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
56496 }
56497 [[fallthrough]];
56498 case X86ISD::VPERMILPI:
56499 if (!IsSplat && VT.getScalarSizeInBits() == 32 &&
56500 (VT.is256BitVector() ||
56501 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
56502 all_of(Ops, [&Op0](SDValue Op) {
56503 return Op0.getOperand(1) == Op.getOperand(1);
56504 })) {
56505 MVT FloatVT = VT.changeVectorElementType(MVT::f32);
56506 SDValue Res = DAG.getBitcast(FloatVT, ConcatSubOperand(VT, Ops, 0));
56507 Res =
56508 DAG.getNode(X86ISD::VPERMILPI, DL, FloatVT, Res, Op0.getOperand(1));
56509 return DAG.getBitcast(VT, Res);
56510 }
56511 if (!IsSplat && NumOps == 2 && VT == MVT::v4f64) {
56512 uint64_t Idx0 = Ops[0].getConstantOperandVal(1);
56513 uint64_t Idx1 = Ops[1].getConstantOperandVal(1);
56514 uint64_t Idx = ((Idx1 & 3) << 2) | (Idx0 & 3);
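// (Each v2f64 VPERMILPI uses two immediate bits, so the two subvector
// selectors are packed into bits [1:0] and [3:2] of the v4f64 immediate.)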
56515 return DAG.getNode(Op0.getOpcode(), DL, VT,
56516 ConcatSubOperand(VT, Ops, 0),
56517 DAG.getTargetConstant(Idx, DL, MVT::i8));
56518 }
56519 break;
56520 case X86ISD::PSHUFB:
56521 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
56522 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
56523 return DAG.getNode(Op0.getOpcode(), DL, VT,
56524 ConcatSubOperand(VT, Ops, 0),
56525 ConcatSubOperand(VT, Ops, 1));
56526 }
56527 break;
56528 case X86ISD::VPERMV:
56529 if (!IsSplat && NumOps == 2 &&
56530 (VT.is512BitVector() && Subtarget.useAVX512Regs())) {
56531 MVT OpVT = Op0.getSimpleValueType();
56532 int NumSrcElts = OpVT.getVectorNumElements();
56533 SmallVector<int, 64> ConcatMask;
56534 for (unsigned i = 0; i != NumOps; ++i) {
56535 SmallVector<int, 64> SubMask;
56536 SmallVector<SDValue, 2> SubOps;
56537 if (!getTargetShuffleMask(Ops[i].getNode(), OpVT, false, SubOps,
56538 SubMask))
56539 break;
56540 for (int M : SubMask) {
56541 if (0 <= M)
56542 M += i * NumSrcElts;
56543 ConcatMask.push_back(M);
56544 }
56545 }
56546 if (ConcatMask.size() == (NumOps * NumSrcElts)) {
56547 SDValue Src = concatSubVectors(Ops[0].getOperand(1),
56548 Ops[1].getOperand(1), DAG, DL);
56549 MVT IntMaskSVT = MVT::getIntegerVT(EltSizeInBits);
56550 MVT IntMaskVT = MVT::getVectorVT(IntMaskSVT, NumOps * NumSrcElts);
56551 SDValue Mask = getConstVector(ConcatMask, IntMaskVT, DAG, DL, true);
56552 return DAG.getNode(X86ISD::VPERMV, DL, VT, Mask, Src);
56553 }
56554 }
56555 break;
56556 case X86ISD::VPERMV3:
56557 if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
56558 MVT OpVT = Op0.getSimpleValueType();
56559 int NumSrcElts = OpVT.getVectorNumElements();
56560 SmallVector<int, 64> ConcatMask;
56561 for (unsigned i = 0; i != NumOps; ++i) {
56562 SmallVector<int, 64> SubMask;
56563 SmallVector<SDValue, 2> SubOps;
56564 if (!getTargetShuffleMask(Ops[i].getNode(), OpVT, false, SubOps,
56565 SubMask))
56566 break;
56567 for (int M : SubMask) {
56568 if (0 <= M) {
56569 M += M < NumSrcElts ? 0 : NumSrcElts;
56570 M += i * NumSrcElts;
56571 }
56572 ConcatMask.push_back(M);
56573 }
56574 }
56575 if (ConcatMask.size() == (NumOps * NumSrcElts)) {
56576 SDValue Src0 = concatSubVectors(Ops[0].getOperand(0),
56577 Ops[1].getOperand(0), DAG, DL);
56578 SDValue Src1 = concatSubVectors(Ops[0].getOperand(2),
56579 Ops[1].getOperand(2), DAG, DL);
56580 MVT IntMaskSVT = MVT::getIntegerVT(EltSizeInBits);
56581 MVT IntMaskVT = MVT::getVectorVT(IntMaskSVT, NumOps * NumSrcElts);
56582 SDValue Mask = getConstVector(ConcatMask, IntMaskVT, DAG, DL, true);
56583 return DAG.getNode(X86ISD::VPERMV3, DL, VT, Src0, Mask, Src1);
56584 }
56585 }
56586 break;
56587 case ISD::TRUNCATE:
56588 if (!IsSplat && NumOps == 2 && VT.is256BitVector()) {
56589 EVT SrcVT = Ops[0].getOperand(0).getValueType();
56590 if (SrcVT.is256BitVector() && SrcVT.isSimple() &&
56591 SrcVT == Ops[1].getOperand(0).getValueType() &&
56592 Subtarget.useAVX512Regs() &&
56593 Subtarget.getPreferVectorWidth() >= 512 &&
56594 (SrcVT.getScalarSizeInBits() > 16 || Subtarget.useBWIRegs())) {
56595 EVT NewSrcVT = SrcVT.getDoubleNumVectorElementsVT(*DAG.getContext());
56596 return DAG.getNode(ISD::TRUNCATE, DL, VT,
56597 ConcatSubOperand(NewSrcVT, Ops, 0));
56598 }
56599 }
56600 break;
56601 case X86ISD::VSHLI:
56602 case X86ISD::VSRLI:
56603 // Special case: SHL/SRL AVX1 V4i64 by 32-bits can lower as a shuffle.
56604 // TODO: Move this to LowerShiftByScalarImmediate?
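// (Shifting an i64 element left by 32 moves its low 32 bits into the high
// half and zeroes the low half, which the v8i32 shuffles with a zero vector
// below reproduce; the logical right-shift case is the mirror image.)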
56605 if (VT == MVT::v4i64 && !Subtarget.hasInt256() &&
56606 llvm::all_of(Ops, [](SDValue Op) {
56607 return Op.getConstantOperandAPInt(1) == 32;
56608 })) {
56609 SDValue Res = DAG.getBitcast(MVT::v8i32, ConcatSubOperand(VT, Ops, 0));
56610 SDValue Zero = getZeroVector(MVT::v8i32, Subtarget, DAG, DL);
56611 if (Op0.getOpcode() == X86ISD::VSHLI) {
56612 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
56613 {8, 0, 8, 2, 8, 4, 8, 6});
56614 } else {
56615 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
56616 {1, 8, 3, 8, 5, 8, 7, 8});
56617 }
56618 return DAG.getBitcast(VT, Res);
56619 }
56620 [[fallthrough]];
56621 case X86ISD::VSRAI:
56622 case X86ISD::VSHL:
56623 case X86ISD::VSRL:
56624 case X86ISD::VSRA:
56625 if (((VT.is256BitVector() && Subtarget.hasInt256()) ||
56626 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
56627 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
56628 llvm::all_of(Ops, [Op0](SDValue Op) {
56629 return Op0.getOperand(1) == Op.getOperand(1);
56630 })) {
56631 return DAG.getNode(Op0.getOpcode(), DL, VT,
56632 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
56633 }
56634 break;
56635 case X86ISD::VPERMI:
56636 case X86ISD::VROTLI:
56637 case X86ISD::VROTRI:
56638 if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
56639 llvm::all_of(Ops, [Op0](SDValue Op) {
56640 return Op0.getOperand(1) == Op.getOperand(1);
56641 })) {
56642 return DAG.getNode(Op0.getOpcode(), DL, VT,
56643 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
56644 }
56645 break;
56646 case ISD::AND:
56647 case ISD::OR:
56648 case ISD::XOR:
56649 case X86ISD::ANDNP:
56650 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
56651 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
56652 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
56653 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
56654 NumOps * SrcVT.getVectorNumElements());
56655 return DAG.getNode(Op0.getOpcode(), DL, VT,
56656 ConcatSubOperand(SrcVT, Ops, 0),
56657 ConcatSubOperand(SrcVT, Ops, 1));
56658 }
56659 break;
56660 case X86ISD::GF2P8AFFINEQB:
56661 if (!IsSplat &&
56662 (VT.is256BitVector() ||
56663 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
56664 llvm::all_of(Ops, [Op0](SDValue Op) {
56665 return Op0.getOperand(2) == Op.getOperand(2);
56666 })) {
56667 return DAG.getNode(Op0.getOpcode(), DL, VT,
56668 ConcatSubOperand(VT, Ops, 0),
56669 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
56670 }
56671 break;
56672 case ISD::ADD:
56673 case ISD::SUB:
56674 case ISD::MUL:
56675 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
56676 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
56677 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())))) {
56678 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
56679 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
56680 NumOps * SrcVT.getVectorNumElements());
56681 return DAG.getNode(Op0.getOpcode(), DL, VT,
56682 ConcatSubOperand(SrcVT, Ops, 0),
56683 ConcatSubOperand(SrcVT, Ops, 1));
56684 }
56685 break;
56686 // Because VADD, VSUB and VMUL can execute on more ports than VINSERT and
56687 // their latencies are short, we don't replace them here.
56688 case ISD::FDIV:
56689 if (!IsSplat && (VT.is256BitVector() ||
56690 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
56691 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
56692 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
56693 NumOps * SrcVT.getVectorNumElements());
56694 return DAG.getNode(Op0.getOpcode(), DL, VT,
56695 ConcatSubOperand(SrcVT, Ops, 0),
56696 ConcatSubOperand(SrcVT, Ops, 1));
56697 }
56698 break;
56699 case X86ISD::HADD:
56700 case X86ISD::HSUB:
56701 case X86ISD::FHADD:
56702 case X86ISD::FHSUB:
56703 case X86ISD::PACKSS:
56704 case X86ISD::PACKUS:
56705 if (!IsSplat && VT.is256BitVector() &&
56706 (VT.isFloatingPoint() || Subtarget.hasInt256())) {
56707 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
56708 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
56709 NumOps * SrcVT.getVectorNumElements());
56710 return DAG.getNode(Op0.getOpcode(), DL, VT,
56711 ConcatSubOperand(SrcVT, Ops, 0),
56712 ConcatSubOperand(SrcVT, Ops, 1));
56713 }
56714 break;
56715 case X86ISD::PALIGNR:
56716 if (!IsSplat &&
56717 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
56718 (VT.is512BitVector() && Subtarget.useBWIRegs())) &&
56719 llvm::all_of(Ops, [Op0](SDValue Op) {
56720 return Op0.getOperand(2) == Op.getOperand(2);
56721 })) {
56722 return DAG.getNode(Op0.getOpcode(), DL, VT,
56723 ConcatSubOperand(VT, Ops, 0),
56724 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
56725 }
56726 break;
56727 case ISD::VSELECT:
56728 if (!IsSplat && Subtarget.hasAVX512() &&
56729 (VT.is256BitVector() ||
56730 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
56731 (EltSizeInBits >= 32 || Subtarget.hasBWI())) {
56732 EVT SelVT = Ops[0].getOperand(0).getValueType();
56733 if (SelVT.getVectorElementType() == MVT::i1) {
56734 SelVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
56735 Ops.size() * SelVT.getVectorNumElements());
56736 if (DAG.getTargetLoweringInfo().isTypeLegal(SelVT))
56737 return DAG.getNode(Op0.getOpcode(), DL, VT,
56738 ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0),
56739 ConcatSubOperand(VT, Ops, 1),
56740 ConcatSubOperand(VT, Ops, 2));
56741 }
56742 }
56743 [[fallthrough]];
56744 case X86ISD::BLENDV:
56745 if (!IsSplat && VT.is256BitVector() && Ops.size() == 2 &&
56746 (EltSizeInBits >= 32 || Subtarget.hasInt256()) &&
56747 IsConcatFree(VT, Ops, 1) && IsConcatFree(VT, Ops, 2)) {
56748 EVT SelVT = Ops[0].getOperand(0).getValueType();
56749 SelVT = SelVT.getDoubleNumVectorElementsVT(*DAG.getContext());
56750 if (DAG.getTargetLoweringInfo().isTypeLegal(SelVT))
56751 return DAG.getNode(Op0.getOpcode(), DL, VT,
56752 ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0),
56753 ConcatSubOperand(VT, Ops, 1),
56754 ConcatSubOperand(VT, Ops, 2));
56755 }
56756 break;
56757 }
56758 }
56759
56760 // Fold subvector loads into one.
56761 // If needed, look through bitcasts to get to the load.
56762 if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(Op0))) {
56763 unsigned Fast;
56764 const X86TargetLowering *TLI = Subtarget.getTargetLowering();
56765 if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
56766 *FirstLd->getMemOperand(), &Fast) &&
56767 Fast) {
56768 if (SDValue Ld =
56769 EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false))
56770 return Ld;
56771 }
56772 }
56773
56774 // Attempt to fold target constant loads.
56775 if (all_of(Ops, [](SDValue Op) { return getTargetConstantFromNode(Op); })) {
56776 SmallVector<APInt> EltBits;
56777 APInt UndefElts = APInt::getZero(VT.getVectorNumElements());
56778 for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
56779 APInt OpUndefElts;
56780 SmallVector<APInt> OpEltBits;
56781 if (!getTargetConstantBitsFromNode(Ops[I], EltSizeInBits, OpUndefElts,
56782 OpEltBits, true, false))
56783 break;
56784 EltBits.append(OpEltBits);
56785 UndefElts.insertBits(OpUndefElts, I * OpUndefElts.getBitWidth());
56786 }
56787 if (EltBits.size() == VT.getVectorNumElements())
56788 return getConstVector(EltBits, UndefElts, VT, DAG, DL);
56789 }
56790
56791 return SDValue();
56792}
56793
56794static SDValue combineCONCAT_VECTORS(SDNode *N, SelectionDAG &DAG,
56795 TargetLowering::DAGCombinerInfo &DCI,
56796 const X86Subtarget &Subtarget) {
56797 EVT VT = N->getValueType(0);
56798 EVT SrcVT = N->getOperand(0).getValueType();
56799 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56800 SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());
56801
56802 if (VT.getVectorElementType() == MVT::i1) {
56803 // Attempt to constant fold.
56804 unsigned SubSizeInBits = SrcVT.getSizeInBits();
56805 APInt Constant = APInt::getZero(VT.getSizeInBits());
56806 for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
56807 auto *C = dyn_cast<ConstantSDNode>(peekThroughBitcasts(Ops[I]));
56808 if (!C) break;
56809 Constant.insertBits(C->getAPIntValue(), I * SubSizeInBits);
56810 if (I == (E - 1)) {
56811 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
56812 if (TLI.isTypeLegal(IntVT))
56813 return DAG.getBitcast(VT, DAG.getConstant(Constant, SDLoc(N), IntVT));
56814 }
56815 }
56816
56817 // Don't do anything else for i1 vectors.
56818 return SDValue();
56819 }
56820
56821 if (Subtarget.hasAVX() && TLI.isTypeLegal(VT) && TLI.isTypeLegal(SrcVT)) {
56822 if (SDValue R = combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), Ops, DAG,
56823 DCI, Subtarget))
56824 return R;
56825 }
56826
56827 return SDValue();
56828}
56829
56830static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
56831 TargetLowering::DAGCombinerInfo &DCI,
56832 const X86Subtarget &Subtarget) {
56833 if (DCI.isBeforeLegalizeOps())
56834 return SDValue();
56835
56836 MVT OpVT = N->getSimpleValueType(0);
56837
56838 bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1;
56839
56840 SDLoc dl(N);
56841 SDValue Vec = N->getOperand(0);
56842 SDValue SubVec = N->getOperand(1);
56843
56844 uint64_t IdxVal = N->getConstantOperandVal(2);
56845 MVT SubVecVT = SubVec.getSimpleValueType();
56846
56847 if (Vec.isUndef() && SubVec.isUndef())
56848 return DAG.getUNDEF(OpVT);
56849
56850 // Inserting undefs/zeros into zeros/undefs yields a zero vector.
56851 if ((Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())) &&
56852 (SubVec.isUndef() || ISD::isBuildVectorAllZeros(SubVec.getNode())))
56853 return getZeroVector(OpVT, Subtarget, DAG, dl);
56854
56855 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
56856 // If we're inserting into a zero vector and then into a larger zero vector,
56857 // just insert into the larger zero vector directly.
56858 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
56859 ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
56860 uint64_t Idx2Val = SubVec.getConstantOperandVal(2);
56861 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
56862 getZeroVector(OpVT, Subtarget, DAG, dl),
56863 SubVec.getOperand(1),
56864 DAG.getIntPtrConstant(IdxVal + Idx2Val, dl));
56865 }
56866
56867 // If we're inserting into a zero vector, our input was extracted from an
56868 // insert into a zero vector of the same type, and the extraction was at
56869 // least as large as the original insertion, just insert the original
56870 // subvector into a zero vector.
56871 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
56872 isNullConstant(SubVec.getOperand(1)) &&
56873 SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) {
56874 SDValue Ins = SubVec.getOperand(0);
56875 if (isNullConstant(Ins.getOperand(2)) &&
56876 ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
56877 Ins.getOperand(1).getValueSizeInBits().getFixedValue() <=
56878 SubVecVT.getFixedSizeInBits())
56879 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
56880 getZeroVector(OpVT, Subtarget, DAG, dl),
56881 Ins.getOperand(1), N->getOperand(2));
56882 }
56883 }
56884
56885 // Stop here if this is an i1 vector.
56886 if (IsI1Vector)
56887 return SDValue();
56888
56889 // Eliminate an intermediate vector widening:
56890 // insert_subvector X, (insert_subvector undef, Y, 0), Idx -->
56891 // insert_subvector X, Y, Idx
56892 // TODO: This is a more general version of a DAGCombiner fold, can we move it
56893 // there?
56894 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
56895 SubVec.getOperand(0).isUndef() && isNullConstant(SubVec.getOperand(2)))
56896 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec,
56897 SubVec.getOperand(1), N->getOperand(2));
56898
56899 // If this is an insert of an extract, combine to a shuffle. Don't do this
56900 // if the insert or extract can be represented with a subregister operation.
56901 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
56902 SubVec.getOperand(0).getSimpleValueType() == OpVT &&
56903 (IdxVal != 0 ||
56904 !(Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())))) {
56905 int ExtIdxVal = SubVec.getConstantOperandVal(1);
56906 if (ExtIdxVal != 0) {
56907 int VecNumElts = OpVT.getVectorNumElements();
56908 int SubVecNumElts = SubVecVT.getVectorNumElements();
56909 SmallVector<int, 64> Mask(VecNumElts);
56910 // First create an identity shuffle mask.
56911 for (int i = 0; i != VecNumElts; ++i)
56912 Mask[i] = i;
56913 // Now insert the extracted portion.
56914 for (int i = 0; i != SubVecNumElts; ++i)
56915 Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
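// E.g. inserting the high v4i32 half (ExtIdxVal == 4) of a v8i32 source
// at IdxVal == 0 gives the mask {12,13,14,15,4,5,6,7}.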
56916
56917 return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
56918 }
56919 }
56920
56921 // Match concat_vector style patterns.
56922 SmallVector<SDValue, 2> SubVectorOps;
56923 if (collectConcatOps(N, SubVectorOps, DAG)) {
56924 if (SDValue Fold =
56925 combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, DCI, Subtarget))
56926 return Fold;
56927
56928 // If we're inserting all zeros into the upper half, change this to
56929 // a concat with zero. We will match this to a move
56930 // with implicit upper bit zeroing during isel.
56931 // We do this here because we don't want combineConcatVectorOps to
56932 // create INSERT_SUBVECTOR from CONCAT_VECTORS.
56933 if (SubVectorOps.size() == 2 &&
56934 ISD::isBuildVectorAllZeros(SubVectorOps[1].getNode()))
56935 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
56936 getZeroVector(OpVT, Subtarget, DAG, dl),
56937 SubVectorOps[0], DAG.getIntPtrConstant(0, dl));
56938 }
56939
56940 // If this is a broadcast insert into an upper undef, use a larger broadcast.
56941 if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST)
56942 return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0));
56943
56944 // If this is a broadcast load inserted into an upper undef, use a larger
56945 // broadcast load.
56946 if (Vec.isUndef() && IdxVal != 0 && SubVec.hasOneUse() &&
56947 SubVec.getOpcode() == X86ISD::VBROADCAST_LOAD) {
56948 auto *MemIntr = cast<MemIntrinsicSDNode>(SubVec);
56949 SDVTList Tys = DAG.getVTList(OpVT, MVT::Other);
56950 SDValue Ops[] = { MemIntr->getChain(), MemIntr->getBasePtr() };
56951 SDValue BcastLd =
56952 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
56953 MemIntr->getMemoryVT(),
56954 MemIntr->getMemOperand());
56955 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
56956 return BcastLd;
56957 }
56958
56959 // If we're splatting the lower half subvector of a full vector load into the
56960 // upper half, attempt to create a subvector broadcast.
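// E.g. if Vec is a v4f64 load of address P and SubVec reloads the v2f64 at P
// into the upper half, the whole node is a splat of the low 128 bits at P
// and becomes a SUBV_BROADCAST_LOAD.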
56961 if (IdxVal == (OpVT.getVectorNumElements() / 2) && SubVec.hasOneUse() &&
56962 Vec.getValueSizeInBits() == (2 * SubVec.getValueSizeInBits())) {
56963 auto *VecLd = dyn_cast<LoadSDNode>(Vec);
56964 auto *SubLd = dyn_cast<LoadSDNode>(SubVec);
56965 if (VecLd && SubLd &&
56966 DAG.areNonVolatileConsecutiveLoads(SubLd, VecLd,
56967 SubVec.getValueSizeInBits() / 8, 0))
56968 return getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, dl, OpVT, SubVecVT,
56969 SubLd, 0, DAG);
56970 }
56971
56972 return SDValue();
56973}
56974
56975/// If we are extracting a subvector of a vector select and the select condition
56976/// is composed of concatenated vectors, try to narrow the select width. This
56977/// is a common pattern for AVX1 integer code because 256-bit selects may be
56978/// legal, but there is almost no integer math/logic available for 256-bit.
56979/// This function should only be called with legal types (otherwise, the calls
56980/// to get simple value types will assert).
56981static SDValue narrowExtractedVectorSelect(SDNode *Ext, SelectionDAG &DAG) {
56982 SDValue Sel = Ext->getOperand(0);
56983 SmallVector<SDValue, 4> CatOps;
56984 if (Sel.getOpcode() != ISD::VSELECT ||
56985 !collectConcatOps(Sel.getOperand(0).getNode(), CatOps, DAG))
56986 return SDValue();
56987
56988 // Note: We assume simple value types because this should only be called with
56989 // legal operations/types.
56990 // TODO: This can be extended to handle extraction to 256-bits.
56991 MVT VT = Ext->getSimpleValueType(0);
56992 if (!VT.is128BitVector())
56993 return SDValue();
56994
56995 MVT SelCondVT = Sel.getOperand(0).getSimpleValueType();
56996 if (!SelCondVT.is256BitVector() && !SelCondVT.is512BitVector())
56997 return SDValue();
56998
56999 MVT WideVT = Ext->getOperand(0).getSimpleValueType();
57000 MVT SelVT = Sel.getSimpleValueType();
57001 assert((SelVT.is256BitVector() || SelVT.is512BitVector()) &&
57002 "Unexpected vector type with legal operations");
57003
57004 unsigned SelElts = SelVT.getVectorNumElements();
57005 unsigned CastedElts = WideVT.getVectorNumElements();
57006 unsigned ExtIdx = Ext->getConstantOperandVal(1);
57007 if (SelElts % CastedElts == 0) {
57008 // The select has the same or more (narrower) elements than the extract
57009 // operand. The extraction index gets scaled by that factor.
57010 ExtIdx *= (SelElts / CastedElts);
57011 } else if (CastedElts % SelElts == 0) {
57012 // The select has less (wider) elements than the extract operand. Make sure
57013 // that the extraction index can be divided evenly.
57014 unsigned IndexDivisor = CastedElts / SelElts;
57015 if (ExtIdx % IndexDivisor != 0)
57016 return SDValue();
57017 ExtIdx /= IndexDivisor;
57018 } else {
57019 llvm_unreachable("Element count of simple vector types are not divisible?");
57020 }
57021
57022 unsigned NarrowingFactor = WideVT.getSizeInBits() / VT.getSizeInBits();
57023 unsigned NarrowElts = SelElts / NarrowingFactor;
57024 MVT NarrowSelVT = MVT::getVectorVT(SelVT.getVectorElementType(), NarrowElts);
57025 SDLoc DL(Ext);
57026 SDValue ExtCond = extract128BitVector(Sel.getOperand(0), ExtIdx, DAG, DL);
57027 SDValue ExtT = extract128BitVector(Sel.getOperand(1), ExtIdx, DAG, DL);
57028 SDValue ExtF = extract128BitVector(Sel.getOperand(2), ExtIdx, DAG, DL);
57029 SDValue NarrowSel = DAG.getSelect(DL, NarrowSelVT, ExtCond, ExtT, ExtF);
57030 return DAG.getBitcast(VT, NarrowSel);
57031}
57032
57033static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
57034 TargetLowering::DAGCombinerInfo &DCI,
57035 const X86Subtarget &Subtarget) {
57036 // For AVX1 only, if we are extracting from a 256-bit and+not (which will
57037 // eventually get combined/lowered into ANDNP) with a concatenated operand,
57038 // split the 'and' into 128-bit ops to avoid the concatenate and extract.
57039 // We let generic combining take over from there to simplify the
57040 // insert/extract and 'not'.
57041 // This pattern emerges during AVX1 legalization. We handle it before lowering
57042 // to avoid complications like splitting constant vector loads.
57043
57044 // Capture the original wide type in the likely case that we need to bitcast
57045 // back to this type.
57046 if (!N->getValueType(0).isSimple())
57047 return SDValue();
57048
57049 MVT VT = N->getSimpleValueType(0);
57050 SDValue InVec = N->getOperand(0);
57051 unsigned IdxVal = N->getConstantOperandVal(1);
57052 SDValue InVecBC = peekThroughBitcasts(InVec);
57053 EVT InVecVT = InVec.getValueType();
57054 unsigned SizeInBits = VT.getSizeInBits();
57055 unsigned InSizeInBits = InVecVT.getSizeInBits();
57056 unsigned NumSubElts = VT.getVectorNumElements();
57057 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
57058
57059 if (Subtarget.hasAVX() && !Subtarget.hasAVX2() &&
57060 TLI.isTypeLegal(InVecVT) &&
57061 InSizeInBits == 256 && InVecBC.getOpcode() == ISD::AND) {
57062 auto isConcatenatedNot = [](SDValue V) {
57063 V = peekThroughBitcasts(V);
57064 if (!isBitwiseNot(V))
57065 return false;
57066 SDValue NotOp = V->getOperand(0);
57067 return peekThroughBitcasts(NotOp).getOpcode() == ISD::CONCAT_VECTORS;
57068 };
57069 if (isConcatenatedNot(InVecBC.getOperand(0)) ||
57070 isConcatenatedNot(InVecBC.getOperand(1))) {
57071 // extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1
57072 SDValue Concat = splitVectorIntBinary(InVecBC, DAG);
57073 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT,
57074 DAG.getBitcast(InVecVT, Concat), N->getOperand(1));
57075 }
57076 }
57077
57078 if (DCI.isBeforeLegalizeOps())
57079 return SDValue();
57080
57081 if (SDValue V = narrowExtractedVectorSelect(N, DAG))
57082 return V;
57083
57084 if (ISD::isBuildVectorAllZeros(InVec.getNode()))
57085 return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
57086
57087 if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
57088 if (VT.getScalarType() == MVT::i1)
57089 return DAG.getConstant(1, SDLoc(N), VT);
57090 return getOnesVector(VT, DAG, SDLoc(N));
57091 }
57092
57093 if (InVec.getOpcode() == ISD::BUILD_VECTOR)
57094 return DAG.getBuildVector(VT, SDLoc(N),
57095 InVec->ops().slice(IdxVal, NumSubElts));
57096
57097 // If we are extracting from an insert into a larger vector, replace with a
57098 // smaller insert if we don't access less than the original subvector. Don't
57099 // do this for i1 vectors.
57100 // TODO: Relax the matching indices requirement?
57101 if (VT.getVectorElementType() != MVT::i1 &&
57102 InVec.getOpcode() == ISD::INSERT_SUBVECTOR && InVec.hasOneUse() &&
57103 IdxVal == InVec.getConstantOperandVal(2) &&
57104 InVec.getOperand(1).getValueSizeInBits() <= SizeInBits) {
57105 SDLoc DL(N);
57106 SDValue NewExt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT,
57107 InVec.getOperand(0), N->getOperand(1));
57108 unsigned NewIdxVal = InVec.getConstantOperandVal(2) - IdxVal;
57109 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, NewExt,
57110 InVec.getOperand(1),
57111 DAG.getVectorIdxConstant(NewIdxVal, DL));
57112 }
57113
57114 // If we're extracting an upper subvector from a broadcast, we should just
57115 // extract the lowest subvector instead, which should allow
57116 // SimplifyDemandedVectorElts to do more simplifications.
57117 if (IdxVal != 0 && (InVec.getOpcode() == X86ISD::VBROADCAST ||
57118 InVec.getOpcode() == X86ISD::VBROADCAST_LOAD ||
57119 DAG.isSplatValue(InVec, /*AllowUndefs*/ false)))
57120 return extractSubVector(InVec, 0, DAG, SDLoc(N), SizeInBits);
57121
57122 // If we're extracting a broadcasted subvector, just use the lowest subvector.
57123 if (IdxVal != 0 && InVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
57124 cast<MemIntrinsicSDNode>(InVec)->getMemoryVT() == VT)
57125 return extractSubVector(InVec, 0, DAG, SDLoc(N), SizeInBits);
57126
57127 // Attempt to extract from the source of a shuffle vector.
57128 if ((InSizeInBits % SizeInBits) == 0 && (IdxVal % NumSubElts) == 0) {
57129 SmallVector<int, 32> ShuffleMask;
57130 SmallVector<int, 32> ScaledMask;
57131 SmallVector<SDValue, 2> ShuffleInputs;
57132 unsigned NumSubVecs = InSizeInBits / SizeInBits;
57133 // Decode the shuffle mask and scale it so it's shuffling subvectors.
57134 if (getTargetShuffleInputs(InVecBC, ShuffleInputs, ShuffleMask, DAG) &&
57135 scaleShuffleElements(ShuffleMask, NumSubVecs, ScaledMask)) {
57136 unsigned SubVecIdx = IdxVal / NumSubElts;
57137 if (ScaledMask[SubVecIdx] == SM_SentinelUndef)
57138 return DAG.getUNDEF(VT);
57139 if (ScaledMask[SubVecIdx] == SM_SentinelZero)
57140 return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
57141 SDValue Src = ShuffleInputs[ScaledMask[SubVecIdx] / NumSubVecs];
57142 if (Src.getValueSizeInBits() == InSizeInBits) {
57143 unsigned SrcSubVecIdx = ScaledMask[SubVecIdx] % NumSubVecs;
57144 unsigned SrcEltIdx = SrcSubVecIdx * NumSubElts;
57145 return extractSubVector(DAG.getBitcast(InVecVT, Src), SrcEltIdx, DAG,
57146 SDLoc(N), SizeInBits);
57147 }
57148 }
57149 }
57150
57151 // If we're extracting the lowest subvector and we're the only user,
57152 // we may be able to perform this with a smaller vector width.
57153 unsigned InOpcode = InVec.getOpcode();
57154 if (InVec.hasOneUse()) {
57155 if (IdxVal == 0 && VT == MVT::v2f64 && InVecVT == MVT::v4f64) {
57156 // v2f64 CVTDQ2PD(v4i32).
57157 if (InOpcode == ISD::SINT_TO_FP &&
57158 InVec.getOperand(0).getValueType() == MVT::v4i32) {
57159 return DAG.getNode(X86ISD::CVTSI2P, SDLoc(N), VT, InVec.getOperand(0));
57160 }
57161 // v2f64 CVTUDQ2PD(v4i32).
57162 if (InOpcode == ISD::UINT_TO_FP && Subtarget.hasVLX() &&
57163 InVec.getOperand(0).getValueType() == MVT::v4i32) {
57164 return DAG.getNode(X86ISD::CVTUI2P, SDLoc(N), VT, InVec.getOperand(0));
57165 }
57166 // v2f64 CVTPS2PD(v4f32).
57167 if (InOpcode == ISD::FP_EXTEND &&
57168 InVec.getOperand(0).getValueType() == MVT::v4f32) {
57169 return DAG.getNode(X86ISD::VFPEXT, SDLoc(N), VT, InVec.getOperand(0));
57170 }
57171 }
57172 if (IdxVal == 0 &&
57173 (ISD::isExtOpcode(InOpcode) || ISD::isExtVecInRegOpcode(InOpcode)) &&
57174 (SizeInBits == 128 || SizeInBits == 256) &&
57175 InVec.getOperand(0).getValueSizeInBits() >= SizeInBits) {
57176 SDLoc DL(N);
57177 SDValue Ext = InVec.getOperand(0);
57178 if (Ext.getValueSizeInBits() > SizeInBits)
57179 Ext = extractSubVector(Ext, 0, DAG, DL, SizeInBits);
57180 unsigned ExtOp = DAG.getOpcode_EXTEND_VECTOR_INREG(InOpcode);
57181 return DAG.getNode(ExtOp, DL, VT, Ext);
57182 }
57183 if (IdxVal == 0 && InOpcode == ISD::VSELECT &&
57184 InVec.getOperand(0).getValueType().is256BitVector() &&
57185 InVec.getOperand(1).getValueType().is256BitVector() &&
57186 InVec.getOperand(2).getValueType().is256BitVector()) {
57187 SDLoc DL(N);
57188 SDValue Ext0 = extractSubVector(InVec.getOperand(0), 0, DAG, DL, 128);
57189 SDValue Ext1 = extractSubVector(InVec.getOperand(1), 0, DAG, DL, 128);
57190 SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128);
57191 return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2);
57192 }
57193 if (IdxVal == 0 && InOpcode == ISD::TRUNCATE && Subtarget.hasVLX() &&
57194 (VT.is128BitVector() || VT.is256BitVector())) {
57195 SDLoc DL(N);
57196 SDValue InVecSrc = InVec.getOperand(0);
57197 unsigned Scale = InVecSrc.getValueSizeInBits() / InSizeInBits;
57198 SDValue Ext = extractSubVector(InVecSrc, 0, DAG, DL, Scale * SizeInBits);
57199 return DAG.getNode(InOpcode, DL, VT, Ext);
57200 }
57201 if (InOpcode == X86ISD::MOVDDUP &&
57202 (VT.is128BitVector() || VT.is256BitVector())) {
57203 SDLoc DL(N);
57204 SDValue Ext0 =
57205 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
57206 return DAG.getNode(InOpcode, DL, VT, Ext0);
57207 }
57208 }
57209
57210 // Always split vXi64 logical shifts where we're extracting the upper 32 bits,
57211 // as this is very likely to fold into a shuffle/truncation.
57212 if ((InOpcode == X86ISD::VSHLI || InOpcode == X86ISD::VSRLI) &&
57213 InVecVT.getScalarSizeInBits() == 64 &&
57214 InVec.getConstantOperandAPInt(1) == 32) {
57215 SDLoc DL(N);
57216 SDValue Ext =
57217 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
57218 return DAG.getNode(InOpcode, DL, VT, Ext, InVec.getOperand(1));
57219 }
57220
57221 return SDValue();
57222}
57223
57224static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
57225 EVT VT = N->getValueType(0);
57226 SDValue Src = N->getOperand(0);
57227 SDLoc DL(N);
57228
57229 // If this is a scalar to vector to v1i1 from an AND with 1, bypass the and.
57230 // This occurs frequently in our masked scalar intrinsic code and our
57231 // floating point select lowering with AVX512.
57232 // TODO: SimplifyDemandedBits instead?
57233 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse() &&
57234 isOneConstant(Src.getOperand(1)))
57235 return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Src.getOperand(0));
57236
57237 // Combine scalar_to_vector of an extract_vector_elt into an extract_subvec.
57238 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
57239 Src.hasOneUse() && Src.getOperand(0).getValueType().isVector() &&
57240 Src.getOperand(0).getValueType().getVectorElementType() == MVT::i1)
57241 if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
57242 if (C->isZero())
57243 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src.getOperand(0),
57244 Src.getOperand(1));
57245
57246 // Reduce v2i64 to v4i32 if we don't need the upper bits or are known zero.
57247 // TODO: Move to DAGCombine/SimplifyDemandedBits?
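// E.g. (v2i64 scalar_to_vector (i64 zext (i32 X))) becomes a v4i32
// scalar_to_vector of X wrapped in VZEXT_MOVL and bitcast back, so only the
// low 32 bits of the scalar need to be transferred.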
57248 if ((VT == MVT::v2i64 || VT == MVT::v2f64) && Src.hasOneUse()) {
57249 auto IsExt64 = [&DAG](SDValue Op, bool IsZeroExt) {
57250 if (Op.getValueType() != MVT::i64)
57251 return SDValue();
57252 unsigned Opc = IsZeroExt ? ISD::ZERO_EXTEND : ISD::ANY_EXTEND;
57253 if (Op.getOpcode() == Opc &&
57254 Op.getOperand(0).getScalarValueSizeInBits() <= 32)
57255 return Op.getOperand(0);
57256 unsigned Ext = IsZeroExt ? ISD::ZEXTLOAD : ISD::EXTLOAD;
57257 if (auto *Ld = dyn_cast<LoadSDNode>(Op))
57258 if (Ld->getExtensionType() == Ext &&
57259 Ld->getMemoryVT().getScalarSizeInBits() <= 32)
57260 return Op;
57261 if (IsZeroExt) {
57262 KnownBits Known = DAG.computeKnownBits(Op);
57263 if (!Known.isConstant() && Known.countMinLeadingZeros() >= 32)
57264 return Op;
57265 }
57266 return SDValue();
57267 };
57268
57269 if (SDValue AnyExt = IsExt64(peekThroughOneUseBitcasts(Src), false))
57270 return DAG.getBitcast(
57271 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
57272 DAG.getAnyExtOrTrunc(AnyExt, DL, MVT::i32)));
57273
57274 if (SDValue ZeroExt = IsExt64(peekThroughOneUseBitcasts(Src), true))
57275 return DAG.getBitcast(
57276 VT,
57277 DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v4i32,
57278 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
57279 DAG.getZExtOrTrunc(ZeroExt, DL, MVT::i32))));
57280 }
57281
57282 // Combine (v2i64 (scalar_to_vector (i64 (bitconvert (mmx))))) to MOVQ2DQ.
57283 if (VT == MVT::v2i64 && Src.getOpcode() == ISD::BITCAST &&
57284 Src.getOperand(0).getValueType() == MVT::x86mmx)
57285 return DAG.getNode(X86ISD::MOVQ2DQ, DL, VT, Src.getOperand(0));
57286
57287 // See if we're broadcasting the scalar value, in which case just reuse that.
57289 // Make sure the broadcast uses this exact SDValue, not just the same node.
57289 if (VT.getScalarType() == Src.getValueType())
57290 for (SDNode *User : Src->uses())
57291 if (User->getOpcode() == X86ISD::VBROADCAST &&
57292 Src == User->getOperand(0)) {
57293 unsigned SizeInBits = VT.getFixedSizeInBits();
57294 unsigned BroadcastSizeInBits =
57295 User->getValueSizeInBits(0).getFixedValue();
57296 if (BroadcastSizeInBits == SizeInBits)
57297 return SDValue(User, 0);
57298 if (BroadcastSizeInBits > SizeInBits)
57299 return extractSubVector(SDValue(User, 0), 0, DAG, DL, SizeInBits);
57300 // TODO: Handle BroadcastSizeInBits < SizeInBits when we have test
57301 // coverage.
57302 }
57303
57304 return SDValue();
57305}
57306
57307// Simplify PMULDQ and PMULUDQ operations.
57308static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
57309 TargetLowering::DAGCombinerInfo &DCI,
57310 const X86Subtarget &Subtarget) {
57311 SDValue LHS = N->getOperand(0);
57312 SDValue RHS = N->getOperand(1);
57313
57314 // Canonicalize constant to RHS.
57315 if (DAG.isConstantIntBuildVectorOrConstantInt(LHS) &&
57316 !DAG.isConstantIntBuildVectorOrConstantInt(RHS))
57317 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), RHS, LHS);
57318
57319 // Multiply by zero.
57320 // Don't return RHS as it may contain UNDEFs.
57321 if (ISD::isBuildVectorAllZeros(RHS.getNode()))
57322 return DAG.getConstant(0, SDLoc(N), N->getValueType(0));
57323
57324 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
57325 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
57326 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(64), DCI))
57327 return SDValue(N, 0);
57328
57329 // If the input is an extend_invec and the SimplifyDemandedBits call didn't
57330 // convert it to any_extend_invec, due to the LegalOperations check, do the
57331 // conversion directly to a vector shuffle manually. This exposes combine
57332 // opportunities missed by combineEXTEND_VECTOR_INREG not calling
57333 // combineX86ShufflesRecursively on SSE4.1 targets.
57334 // FIXME: This is basically a hack around several other issues related to
57335 // ANY_EXTEND_VECTOR_INREG.
57336 if (N->getValueType(0) == MVT::v2i64 && LHS.hasOneUse() &&
57337 (LHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
57338 LHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
57339 LHS.getOperand(0).getValueType() == MVT::v4i32) {
57340 SDLoc dl(N);
57341 LHS = DAG.getVectorShuffle(MVT::v4i32, dl, LHS.getOperand(0),
57342 LHS.getOperand(0), { 0, -1, 1, -1 });
57343 LHS = DAG.getBitcast(MVT::v2i64, LHS);
57344 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
57345 }
57346 if (N->getValueType(0) == MVT::v2i64 && RHS.hasOneUse() &&
57347 (RHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
57348 RHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
57349 RHS.getOperand(0).getValueType() == MVT::v4i32) {
57350 SDLoc dl(N);
57351 RHS = DAG.getVectorShuffle(MVT::v4i32, dl, RHS.getOperand(0),
57352 RHS.getOperand(0), { 0, -1, 1, -1 });
57353 RHS = DAG.getBitcast(MVT::v2i64, RHS);
57354 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
57355 }
57356
57357 return SDValue();
57358}
57359
57360// Simplify VPMADDUBSW/VPMADDWD operations.
57361static SDValue combineVPMADD(SDNode *N, SelectionDAG &DAG,
57362 TargetLowering::DAGCombinerInfo &DCI) {
57363 EVT VT = N->getValueType(0);
57364 SDValue LHS = N->getOperand(0);
57365 SDValue RHS = N->getOperand(1);
57366
57367 // Multiply by zero.
57368 // Don't return LHS/RHS as it may contain UNDEFs.
57369 if (ISD::isBuildVectorAllZeros(LHS.getNode()) ||
57370 ISD::isBuildVectorAllZeros(RHS.getNode()))
57371 return DAG.getConstant(0, SDLoc(N), VT);
57372
57373 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
57374 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
57375 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
57376 return SDValue(N, 0);
57377
57378 return SDValue();
57379}
57380
57381static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG,
57382 TargetLowering::DAGCombinerInfo &DCI,
57383 const X86Subtarget &Subtarget) {
57384 EVT VT = N->getValueType(0);
57385 SDValue In = N->getOperand(0);
57386 unsigned Opcode = N->getOpcode();
57387 unsigned InOpcode = In.getOpcode();
57388 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
57389 SDLoc DL(N);
57390
57391 // Try to merge vector loads and extend_inreg to an extload.
57392 if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) &&
57393 In.hasOneUse()) {
57394 auto *Ld = cast<LoadSDNode>(In);
57395 if (Ld->isSimple()) {
57396 MVT SVT = In.getSimpleValueType().getVectorElementType();
57397 ISD::LoadExtType Ext = Opcode == ISD::SIGN_EXTEND_VECTOR_INREG
57398 ? ISD::SEXTLOAD
57399 : ISD::ZEXTLOAD;
57400 EVT MemVT = VT.changeVectorElementType(SVT);
57401 if (TLI.isLoadExtLegal(Ext, VT, MemVT)) {
57402 SDValue Load = DAG.getExtLoad(
57403 Ext, DL, VT, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(),
57404 MemVT, Ld->getOriginalAlign(), Ld->getMemOperand()->getFlags());
57405 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
57406 return Load;
57407 }
57408 }
57409 }
57410
57411 // Fold EXTEND_VECTOR_INREG(EXTEND_VECTOR_INREG(X)) -> EXTEND_VECTOR_INREG(X).
57412 if (Opcode == InOpcode)
57413 return DAG.getNode(Opcode, DL, VT, In.getOperand(0));
57414
57415 // Fold EXTEND_VECTOR_INREG(EXTRACT_SUBVECTOR(EXTEND(X),0))
57416 // -> EXTEND_VECTOR_INREG(X).
57417 // TODO: Handle non-zero subvector indices.
57418 if (InOpcode == ISD::EXTRACT_SUBVECTOR && In.getConstantOperandVal(1) == 0 &&
57419 In.getOperand(0).getOpcode() == DAG.getOpcode_EXTEND(Opcode) &&
57420 In.getOperand(0).getOperand(0).getValueSizeInBits() ==
57421 In.getValueSizeInBits())
57422 return DAG.getNode(Opcode, DL, VT, In.getOperand(0).getOperand(0));
57423
57424 // Fold EXTEND_VECTOR_INREG(BUILD_VECTOR(X,Y,?,?)) -> BUILD_VECTOR(X,0,Y,0).
57425 // TODO: Move to DAGCombine?
57426 if (!DCI.isBeforeLegalizeOps() && Opcode == ISD::ZERO_EXTEND_VECTOR_INREG &&
57427 In.getOpcode() == ISD::BUILD_VECTOR && In.hasOneUse() &&
57428 In.getValueSizeInBits() == VT.getSizeInBits()) {
57429 unsigned NumElts = VT.getVectorNumElements();
57430 unsigned Scale = VT.getScalarSizeInBits() / In.getScalarValueSizeInBits();
57431 EVT EltVT = In.getOperand(0).getValueType();
57432 SmallVector<SDValue> Elts(Scale * NumElts, DAG.getConstant(0, DL, EltVT));
57433 for (unsigned I = 0; I != NumElts; ++I)
57434 Elts[I * Scale] = In.getOperand(I);
57435 return DAG.getBitcast(VT, DAG.getBuildVector(In.getValueType(), DL, Elts));
57436 }
57437
57438 // Attempt to combine as a shuffle on SSE41+ targets.
57439 if ((Opcode == ISD::ANY_EXTEND_VECTOR_INREG ||
57440 Opcode == ISD::ZERO_EXTEND_VECTOR_INREG) &&
57441 Subtarget.hasSSE41()) {
57442 SDValue Op(N, 0);
57443 if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType()))
57444 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
57445 return Res;
57446 }
57447
57448 return SDValue();
57449}
57450
57451static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG,
57452 TargetLowering::DAGCombinerInfo &DCI) {
57453 EVT VT = N->getValueType(0);
57454
57455 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
57456 return DAG.getConstant(0, SDLoc(N), VT);
57457
57458 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
57459 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
57460 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
57461 return SDValue(N, 0);
57462
57463 return SDValue();
57464}
57465
57466// Optimize (fp16_to_fp (fp_to_fp16 X)) to VCVTPS2PH followed by VCVTPH2PS.
57467 // Done as a combine because the lowerings for fp16_to_fp and fp_to_fp16 produce
57468 // extra instructions between the conversions due to going to scalar and back.
57469static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG,
57470 const X86Subtarget &Subtarget) {
57471 if (Subtarget.useSoftFloat() || !Subtarget.hasF16C())
57472 return SDValue();
57473
57474 if (N->getOperand(0).getOpcode() != ISD::FP_TO_FP16)
57475 return SDValue();
57476
57477 if (N->getValueType(0) != MVT::f32 ||
57478 N->getOperand(0).getOperand(0).getValueType() != MVT::f32)
57479 return SDValue();
57480
57481 SDLoc dl(N);
57482 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32,
57483 N->getOperand(0).getOperand(0));
57484 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
57485 DAG.getTargetConstant(4, dl, MVT::i32));
57486 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
57487 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
57488 DAG.getIntPtrConstant(0, dl));
57489}
57490
57491static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG,
57492 const X86Subtarget &Subtarget) {
57493 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
57494 return SDValue();
57495
57496 if (Subtarget.hasFP16())
57497 return SDValue();
57498
57499 bool IsStrict = N->isStrictFPOpcode();
57500 EVT VT = N->getValueType(0);
57501 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
57502 EVT SrcVT = Src.getValueType();
57503
57504 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::f16)
57505 return SDValue();
57506
57507 if (VT.getVectorElementType() != MVT::f32 &&
57508 VT.getVectorElementType() != MVT::f64)
57509 return SDValue();
57510
57511 unsigned NumElts = VT.getVectorNumElements();
57512 if (NumElts == 1 || !isPowerOf2_32(NumElts))
57513 return SDValue();
57514
57515 SDLoc dl(N);
57516
57517 // Convert the input to vXi16.
57518 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
57519 Src = DAG.getBitcast(IntVT, Src);
57520
57521 // Widen to at least 8 input elements.
57522 if (NumElts < 8) {
57523 unsigned NumConcats = 8 / NumElts;
57524 SDValue Fill = NumElts == 4 ? DAG.getUNDEF(IntVT)
57525 : DAG.getConstant(0, dl, IntVT);
57526 SmallVector<SDValue, 4> Ops(NumConcats, Fill);
57527 Ops[0] = Src;
57528 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, Ops);
57529 }
57530
57531 // Destination is vXf32 with at least 4 elements.
57532 EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32,
57533 std::max(4U, NumElts));
57534 SDValue Cvt, Chain;
57535 if (IsStrict) {
57536 Cvt = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {CvtVT, MVT::Other},
57537 {N->getOperand(0), Src});
57538 Chain = Cvt.getValue(1);
57539 } else {
57540 Cvt = DAG.getNode(X86ISD::CVTPH2PS, dl, CvtVT, Src);
57541 }
57542
57543 if (NumElts < 4) {
57544 assert(NumElts == 2 && "Unexpected size");
57545 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Cvt,
57546 DAG.getIntPtrConstant(0, dl));
57547 }
57548
57549 if (IsStrict) {
57550 // Extend to the original VT if necessary.
57551 if (Cvt.getValueType() != VT) {
57552 Cvt = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {VT, MVT::Other},
57553 {Chain, Cvt});
57554 Chain = Cvt.getValue(1);
57555 }
57556 return DAG.getMergeValues({Cvt, Chain}, dl);
57557 }
57558
57559 // Extend to the original VT if necessary.
57560 return DAG.getNode(ISD::FP_EXTEND, dl, VT, Cvt);
57561}
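// Illustrative walk-through (not in the original source): for
// (v2f32 fp_extend (v2f16 Src)) on an F16C-only target the steps above are:
//   bitcast v2f16 -> v2i16, concat with three zero v2i16 vectors -> v8i16,
//   X86ISD::CVTPH2PS v8i16 -> v4f32 (CvtVT uses max(4, NumElts) elements),
//   extract_subvector v4f32 -> v2f32 at index 0.
// A v2f64 result would additionally pass through the trailing FP_EXTEND.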
57562
57563// Try to find a larger VBROADCAST_LOAD/SUBV_BROADCAST_LOAD that we can extract
57564// from. Limit this to cases where the loads have the same input chain and the
57565// output chains are unused. This avoids any memory ordering issues.
57566static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG,
57567 TargetLowering::DAGCombinerInfo &DCI) {
57568 assert((N->getOpcode() == X86ISD::VBROADCAST_LOAD ||
57569         N->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) &&
57570        "Unknown broadcast load type");
57571
57572 // Only do this if the chain result is unused.
57573 if (N->hasAnyUseOfValue(1))
57574 return SDValue();
57575
57576 auto *MemIntrin = cast<MemIntrinsicSDNode>(N);
57577
57578 SDValue Ptr = MemIntrin->getBasePtr();
57579 SDValue Chain = MemIntrin->getChain();
57580 EVT VT = N->getSimpleValueType(0);
57581 EVT MemVT = MemIntrin->getMemoryVT();
57582
57583 // Look at other users of our base pointer and try to find a wider broadcast.
57584 // The input chain and the size of the memory VT must match.
57585 for (SDNode *User : Ptr->uses())
57586 if (User != N && User->getOpcode() == N->getOpcode() &&
57587 cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
57588 cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
57589 cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
57590 MemVT.getSizeInBits() &&
57591 !User->hasAnyUseOfValue(1) &&
57592 User->getValueSizeInBits(0).getFixedValue() > VT.getFixedSizeInBits()) {
57593 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
57594 VT.getSizeInBits());
57595 Extract = DAG.getBitcast(VT, Extract);
57596 return DCI.CombineTo(N, Extract, SDValue(User, 1));
57597 }
57598
57599 return SDValue();
57600}
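// Hypothetical example (not in the original source): if the DAG already holds
// a wider (v8f32 X86ISD::VBROADCAST_LOAD Ptr) whose chain output is unused and
// N is a (v4f32 X86ISD::VBROADCAST_LOAD Ptr) on the same chain, the loop above
// reuses the wider node and returns (v4f32 extract_subvector Wide, 0), leaving
// a single broadcast load.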
57601
57602static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG,
57603 const X86Subtarget &Subtarget) {
57604 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
57605 return SDValue();
57606
57607 bool IsStrict = N->isStrictFPOpcode();
57608 EVT VT = N->getValueType(0);
57609 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
57610 EVT SrcVT = Src.getValueType();
57611
57612 if (!VT.isVector() || VT.getVectorElementType() != MVT::f16 ||
57613 SrcVT.getVectorElementType() != MVT::f32)
57614 return SDValue();
57615
57616 SDLoc dl(N);
57617
57618 SDValue Cvt, Chain;
57619 unsigned NumElts = VT.getVectorNumElements();
57620 if (Subtarget.hasFP16()) {
57621 // Combine (v8f16 fp_round(concat_vectors(v4f32 (xint_to_fp v4i64), ..)))
57622 // into (v8f16 vector_shuffle(v8f16 (CVTXI2P v4i64), ..))
57623 if (NumElts == 8 && Src.getOpcode() == ISD::CONCAT_VECTORS) {
57624 SDValue Cvt0, Cvt1;
57625 SDValue Op0 = Src.getOperand(0);
57626 SDValue Op1 = Src.getOperand(1);
57627 bool IsOp0Strict = Op0->isStrictFPOpcode();
57628 if (Op0.getOpcode() != Op1.getOpcode() ||
57629 Op0.getOperand(IsOp0Strict ? 1 : 0).getValueType() != MVT::v4i64 ||
57630 Op1.getOperand(IsOp0Strict ? 1 : 0).getValueType() != MVT::v4i64) {
57631 return SDValue();
57632 }
57633 int Mask[8] = {0, 1, 2, 3, 8, 9, 10, 11};
57634 if (IsStrict) {
57635 assert(IsOp0Strict && "Op0 must be strict node");
57636 unsigned Opc = Op0.getOpcode() == ISD::STRICT_SINT_TO_FP
57637 ? X86ISD::STRICT_CVTSI2P
57638 : X86ISD::STRICT_CVTUI2P;
57639 Cvt0 = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
57640 {Op0.getOperand(0), Op0.getOperand(1)});
57641 Cvt1 = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
57642 {Op1.getOperand(0), Op1.getOperand(1)});
57643 Cvt = DAG.getVectorShuffle(MVT::v8f16, dl, Cvt0, Cvt1, Mask);
57644 return DAG.getMergeValues({Cvt, Cvt0.getValue(1)}, dl);
57645 }
57646 unsigned Opc = Op0.getOpcode() == ISD::SINT_TO_FP ? X86ISD::CVTSI2P
57647 : X86ISD::CVTUI2P;
57648 Cvt0 = DAG.getNode(Opc, dl, MVT::v8f16, Op0.getOperand(0));
57649 Cvt1 = DAG.getNode(Opc, dl, MVT::v8f16, Op1.getOperand(0));
57650 return DAG.getVectorShuffle(MVT::v8f16, dl, Cvt0, Cvt1, Mask);
57651 }
57652 return SDValue();
57653 }
57654
57655 if (NumElts == 1 || !isPowerOf2_32(NumElts))
57656 return SDValue();
57657
57658 // Widen to at least 4 input elements.
57659 if (NumElts < 4)
57660 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
57661 DAG.getConstantFP(0.0, dl, SrcVT));
57662
57663 // Destination is v8i16 with at least 8 elements.
57664 EVT CvtVT =
57665 EVT::getVectorVT(*DAG.getContext(), MVT::i16, std::max(8U, NumElts));
57666 SDValue Rnd = DAG.getTargetConstant(4, dl, MVT::i32);
57667 if (IsStrict) {
57668 Cvt = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {CvtVT, MVT::Other},
57669 {N->getOperand(0), Src, Rnd});
57670 Chain = Cvt.getValue(1);
57671 } else {
57672 Cvt = DAG.getNode(X86ISD::CVTPS2PH, dl, CvtVT, Src, Rnd);
57673 }
57674
57675 // Extract down to real number of elements.
57676 if (NumElts < 8) {
57677 EVT IntVT = VT.changeVectorElementTypeToInteger();
57678 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, IntVT, Cvt,
57679 DAG.getIntPtrConstant(0, dl));
57680 }
57681
57682 Cvt = DAG.getBitcast(VT, Cvt);
57683
57684 if (IsStrict)
57685 return DAG.getMergeValues({Cvt, Chain}, dl);
57686
57687 return Cvt;
57688}
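// Illustrative example (not in the original source): on an F16C-only target,
// (v4f16 fp_round (v4f32 Src)) takes the path above as
//   X86ISD::CVTPS2PH v4f32 -> v8i16 with rounding immediate 4 (current
//   direction), extract_subvector v8i16 -> v4i16 at index 0, then
//   bitcast v4i16 -> v4f16.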
57689
57690static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG) {
57691 SDValue Src = N->getOperand(0);
57692
57693 // Turn MOVDQ2Q+simple_load into an mmx load.
57694 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
57695 LoadSDNode *LN = cast<LoadSDNode>(Src.getNode());
57696
57697 if (LN->isSimple()) {
57698 SDValue NewLd = DAG.getLoad(MVT::x86mmx, SDLoc(N), LN->getChain(),
57699 LN->getBasePtr(),
57700 LN->getPointerInfo(),
57701 LN->getOriginalAlign(),
57702 LN->getMemOperand()->getFlags());
57703 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), NewLd.getValue(1));
57704 return NewLd;
57705 }
57706 }
57707
57708 return SDValue();
57709}
57710
57711static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG,
57712 TargetLowering::DAGCombinerInfo &DCI) {
57713 unsigned NumBits = N->getSimpleValueType(0).getSizeInBits();
57714 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
57715 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBits), DCI))
57716 return SDValue(N, 0);
57717
57718 return SDValue();
57719}
57720
57721SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
57722 DAGCombinerInfo &DCI) const {
57723 SelectionDAG &DAG = DCI.DAG;
57724 switch (N->getOpcode()) {
57725 default: break;
57726 case ISD::SCALAR_TO_VECTOR:
57727 return combineScalarToVector(N, DAG);
57728 case ISD::EXTRACT_VECTOR_ELT:
57729 case X86ISD::PEXTRW:
57730 case X86ISD::PEXTRB:
57731 return combineExtractVectorElt(N, DAG, DCI, Subtarget);
57732 case ISD::CONCAT_VECTORS:
57733 return combineCONCAT_VECTORS(N, DAG, DCI, Subtarget);
57734 case ISD::INSERT_SUBVECTOR:
57735 return combineINSERT_SUBVECTOR(N, DAG, DCI, Subtarget);
57736 case ISD::EXTRACT_SUBVECTOR:
57737 return combineEXTRACT_SUBVECTOR(N, DAG, DCI, Subtarget);
57738 case ISD::VSELECT:
57739 case ISD::SELECT:
57740 case X86ISD::BLENDV: return combineSelect(N, DAG, DCI, Subtarget);
57741 case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
57742 case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
57743 case X86ISD::CMP: return combineCMP(N, DAG);
57744 case ISD::ADD: return combineAdd(N, DAG, DCI, Subtarget);
57745 case ISD::SUB: return combineSub(N, DAG, DCI, Subtarget);
57746 case X86ISD::ADD:
57747 case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI);
57748 case X86ISD::SBB: return combineSBB(N, DAG);
57749 case X86ISD::ADC: return combineADC(N, DAG, DCI);
57750 case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
57751 case ISD::SHL: return combineShiftLeft(N, DAG);
57752 case ISD::SRA: return combineShiftRightArithmetic(N, DAG, Subtarget);
57753 case ISD::SRL: return combineShiftRightLogical(N, DAG, DCI, Subtarget);
57754 case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
57755 case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
57756 case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
57757 case X86ISD::BEXTR:
57758 case X86ISD::BEXTRI: return combineBEXTR(N, DAG, DCI, Subtarget);
57759 case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
57760 case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
57761 case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget);
57762 case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget);
57763 case X86ISD::VEXTRACT_STORE:
57764 return combineVEXTRACT_STORE(N, DAG, DCI, Subtarget);
57765 case ISD::SINT_TO_FP:
57766 case ISD::STRICT_SINT_TO_FP:
57767 return combineSIntToFP(N, DAG, DCI, Subtarget);
57768 case ISD::UINT_TO_FP:
57769 case ISD::STRICT_UINT_TO_FP:
57770 return combineUIntToFP(N, DAG, Subtarget);
57771 case ISD::FADD:
57772 case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
57773 case X86ISD::VFCMULC:
57774 case X86ISD::VFMULC: return combineFMulcFCMulc(N, DAG, Subtarget);
57775 case ISD::FNEG: return combineFneg(N, DAG, DCI, Subtarget);
57776 case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
57777 case X86ISD::VTRUNC: return combineVTRUNC(N, DAG, DCI);
57778 case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
57779 case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
57780 case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
57781 case X86ISD::FXOR:
57782 case X86ISD::FOR: return combineFOr(N, DAG, DCI, Subtarget);
57783 case X86ISD::FMIN:
57784 case X86ISD::FMAX: return combineFMinFMax(N, DAG);
57785 case ISD::FMINNUM:
57786 case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
57787 case X86ISD::CVTSI2P:
57788 case X86ISD::CVTUI2P: return combineX86INT_TO_FP(N, DAG, DCI);
57789 case X86ISD::CVTP2SI:
57790 case X86ISD::CVTP2UI:
57791 case X86ISD::STRICT_CVTTP2SI:
57792 case X86ISD::CVTTP2SI:
57793 case X86ISD::STRICT_CVTTP2UI:
57794 case X86ISD::CVTTP2UI:
57795 return combineCVTP2I_CVTTP2I(N, DAG, DCI);
57796 case X86ISD::STRICT_CVTPH2PS:
57797 case X86ISD::CVTPH2PS: return combineCVTPH2PS(N, DAG, DCI);
57798 case X86ISD::BT: return combineBT(N, DAG, DCI);
57799 case ISD::ANY_EXTEND:
57800 case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
57801 case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
57802 case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
57803 case ISD::ANY_EXTEND_VECTOR_INREG:
57804 case ISD::SIGN_EXTEND_VECTOR_INREG:
57805 case ISD::ZERO_EXTEND_VECTOR_INREG:
57806 return combineEXTEND_VECTOR_INREG(N, DAG, DCI, Subtarget);
57807 case ISD::SETCC: return combineSetCC(N, DAG, DCI, Subtarget);
57808 case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
57809 case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
57810 case X86ISD::PACKSS:
57811 case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget);
57812 case X86ISD::HADD:
57813 case X86ISD::HSUB:
57814 case X86ISD::FHADD:
57815 case X86ISD::FHSUB: return combineVectorHADDSUB(N, DAG, DCI, Subtarget);
57816 case X86ISD::VSHL:
57817 case X86ISD::VSRA:
57818 case X86ISD::VSRL:
57819 return combineVectorShiftVar(N, DAG, DCI, Subtarget);
57820 case X86ISD::VSHLI:
57821 case X86ISD::VSRAI:
57822 case X86ISD::VSRLI:
57823 return combineVectorShiftImm(N, DAG, DCI, Subtarget);
57824 case ISD::INSERT_VECTOR_ELT:
57825 case X86ISD::PINSRB:
57826 case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
57827 case X86ISD::SHUFP: // Handle all target specific shuffles
57828 case X86ISD::INSERTPS:
57829 case X86ISD::EXTRQI:
57830 case X86ISD::INSERTQI:
57831 case X86ISD::VALIGN:
57832 case X86ISD::PALIGNR:
57833 case X86ISD::VSHLDQ:
57834 case X86ISD::VSRLDQ:
57835 case X86ISD::BLENDI:
57836 case X86ISD::UNPCKH:
57837 case X86ISD::UNPCKL:
57838 case X86ISD::MOVHLPS:
57839 case X86ISD::MOVLHPS:
57840 case X86ISD::PSHUFB:
57841 case X86ISD::PSHUFD:
57842 case X86ISD::PSHUFHW:
57843 case X86ISD::PSHUFLW:
57844 case X86ISD::MOVSHDUP:
57845 case X86ISD::MOVSLDUP:
57846 case X86ISD::MOVDDUP:
57847 case X86ISD::MOVSS:
57848 case X86ISD::MOVSD:
57849 case X86ISD::MOVSH:
57850 case X86ISD::VBROADCAST:
57851 case X86ISD::VPPERM:
57852 case X86ISD::VPERMI:
57853 case X86ISD::VPERMV:
57854 case X86ISD::VPERMV3:
57855 case X86ISD::VPERMIL2:
57856 case X86ISD::VPERMILPI:
57857 case X86ISD::VPERMILPV:
57858 case X86ISD::VPERM2X128:
57859 case X86ISD::SHUF128:
57860 case X86ISD::VZEXT_MOVL:
57861 case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI, Subtarget);
57862 case X86ISD::FMADD_RND:
57863 case X86ISD::FMSUB:
57864 case X86ISD::STRICT_FMSUB:
57865 case X86ISD::FMSUB_RND:
57866 case X86ISD::FNMADD:
57867 case X86ISD::STRICT_FNMADD:
57868 case X86ISD::FNMADD_RND:
57869 case X86ISD::FNMSUB:
57870 case X86ISD::STRICT_FNMSUB:
57871 case X86ISD::FNMSUB_RND:
57872 case ISD::FMA:
57873 case ISD::STRICT_FMA: return combineFMA(N, DAG, DCI, Subtarget);
57874 case X86ISD::FMADDSUB_RND:
57875 case X86ISD::FMSUBADD_RND:
57876 case X86ISD::FMADDSUB:
57877 case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, DCI);
57878 case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI, Subtarget);
57879 case X86ISD::TESTP: return combineTESTP(N, DAG, DCI, Subtarget);
57880 case X86ISD::MGATHER:
57881 case X86ISD::MSCATTER:
57882 return combineX86GatherScatter(N, DAG, DCI, Subtarget);
57883 case ISD::MGATHER:
57884 case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI);
57885 case X86ISD::PCMPEQ:
57886 case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
57887 case X86ISD::PMULDQ:
57888 case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI, Subtarget);
57889 case X86ISD::VPMADDUBSW:
57890 case X86ISD::VPMADDWD: return combineVPMADD(N, DAG, DCI);
57891 case X86ISD::KSHIFTL:
57892 case X86ISD::KSHIFTR: return combineKSHIFT(N, DAG, DCI);
57893 case ISD::FP16_TO_FP: return combineFP16_TO_FP(N, DAG, Subtarget);
57894 case ISD::STRICT_FP_EXTEND:
57895 case ISD::FP_EXTEND: return combineFP_EXTEND(N, DAG, Subtarget);
57896 case ISD::STRICT_FP_ROUND:
57897 case ISD::FP_ROUND: return combineFP_ROUND(N, DAG, Subtarget);
57898 case X86ISD::VBROADCAST_LOAD:
57899 case X86ISD::SUBV_BROADCAST_LOAD: return combineBROADCAST_LOAD(N, DAG, DCI);
57900 case X86ISD::MOVDQ2Q: return combineMOVDQ2Q(N, DAG);
57901 case X86ISD::PDEP: return combinePDEP(N, DAG, DCI);
57902 }
57903
57904 return SDValue();
57905}
57906
57907bool X86TargetLowering::preferABDSToABSWithNSW(EVT VT) const {
57908 return false;
57909}
57910
57911bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
57912 if (!isTypeLegal(VT))
57913 return false;
57914
57915 // There are no vXi8 shifts.
57916 if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
57917 return false;
57918
57919 // TODO: Almost no 8-bit ops are desirable because they have no actual
57920 // size/speed advantages vs. 32-bit ops, but they do have a major
57921 // potential disadvantage by causing partial register stalls.
57922 //
57923 // 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl, and
57924 // we have specializations to turn 32-bit multiply/shl into LEA or other ops.
57925 // Also, see the comment in "IsDesirableToPromoteOp" - where we additionally
57926 // check for a constant operand to the multiply.
57927 if ((Opc == ISD::MUL || Opc == ISD::SHL) && VT == MVT::i8)
57928 return false;
57929
57930 // i16 instruction encodings are longer and some i16 instructions are slow,
57931 // so those are not desirable.
57932 if (VT == MVT::i16) {
57933 switch (Opc) {
57934 default:
57935 break;
57936 case ISD::LOAD:
57937 case ISD::SIGN_EXTEND:
57938 case ISD::ZERO_EXTEND:
57939 case ISD::ANY_EXTEND:
57940 case ISD::SHL:
57941 case ISD::SRA:
57942 case ISD::SRL:
57943 case ISD::SUB:
57944 case ISD::ADD:
57945 case ISD::MUL:
57946 case ISD::AND:
57947 case ISD::OR:
57948 case ISD::XOR:
57949 return false;
57950 }
57951 }
57952
57953 // Any legal type not explicitly accounted for above here is desirable.
57954 return true;
57955}
57956
57957SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc& dl,
57958 SDValue Value, SDValue Addr,
57959 SelectionDAG &DAG) const {
57960 const Module *M = DAG.getMachineFunction().getMMI().getModule();
57961 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
57962 if (IsCFProtectionSupported) {
57963 // When control-flow branch protection is enabled, we need to add a
57964 // notrack prefix to the indirect branch.
57965 // To do that we create an NT_BRIND SDNode.
57966 // Upon ISel, the pattern will convert it to a jmp with the NoTrack prefix.
57967 return DAG.getNode(X86ISD::NT_BRIND, dl, MVT::Other, Value, Addr);
57968 }
57969
57970 return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, DAG);
57971}
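// Editorial assumption about the selected assembly (not in the original
// source): with the "cf-protection-branch" module flag set, the NT_BRIND node
// created above is expected to select to an indirect jump carrying the notrack
// prefix, e.g. "notrack jmpq *%rax", instead of a plain "jmpq *%rax" through
// the jump table.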
57972
57973TargetLowering::AndOrSETCCFoldKind
57974X86TargetLowering::isDesirableToCombineLogicOpOfSETCC(
57975 const SDNode *LogicOp, const SDNode *SETCC0, const SDNode *SETCC1) const {
57976 using AndOrSETCCFoldKind = TargetLowering::AndOrSETCCFoldKind;
57977 EVT VT = LogicOp->getValueType(0);
57978 EVT OpVT = SETCC0->getOperand(0).getValueType();
57979 if (!VT.isInteger())
57980 return AndOrSETCCFoldKind::None;
57981
57982 if (VT.isVector())
57983 return AndOrSETCCFoldKind(AndOrSETCCFoldKind::NotAnd |
57984 (isOperationLegal(ISD::ABS, OpVT)
57985 ? AndOrSETCCFoldKind::ABS
57986 : AndOrSETCCFoldKind::None));
57987
57988 // Don't use `NotAnd`: even though `not` is generally shorter code size than
57989 // `add`, `add` can lower to LEA, which can save moves / spills. In any case
57990 // where `NotAnd` applies, `AddAnd` does as well.
57991 // TODO: Currently we lower (icmp eq/ne (and ~X, Y), 0) -> `test (not X), Y`;
57992 // if we change that to `andn Y, X` it may be worth preferring `NotAnd` here.
57993 return AndOrSETCCFoldKind::AddAnd;
57994}
57995
57996bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
57997 EVT VT = Op.getValueType();
57998 bool Is8BitMulByConstant = VT == MVT::i8 && Op.getOpcode() == ISD::MUL &&
57999 isa<ConstantSDNode>(Op.getOperand(1));
58000
58001 // i16 is legal, but undesirable since i16 instruction encodings are longer
58002 // and some i16 instructions are slow.
58003 // 8-bit multiply-by-constant can usually be expanded to something cheaper
58004 // using LEA and/or other ALU ops.
58005 if (VT != MVT::i16 && !Is8BitMulByConstant)
58006 return false;
58007
58008 auto IsFoldableRMW = [](SDValue Load, SDValue Op) {
58009 if (!Op.hasOneUse())
58010 return false;
58011 SDNode *User = *Op->use_begin();
58012 if (!ISD::isNormalStore(User))
58013 return false;
58014 auto *Ld = cast<LoadSDNode>(Load);
58015 auto *St = cast<StoreSDNode>(User);
58016 return Ld->getBasePtr() == St->getBasePtr();
58017 };
58018
58019 auto IsFoldableAtomicRMW = [](SDValue Load, SDValue Op) {
58020 if (!Load.hasOneUse() || Load.getOpcode() != ISD::ATOMIC_LOAD)
58021 return false;
58022 if (!Op.hasOneUse())
58023 return false;
58024 SDNode *User = *Op->use_begin();
58025 if (User->getOpcode() != ISD::ATOMIC_STORE)
58026 return false;
58027 auto *Ld = cast<AtomicSDNode>(Load);
58028 auto *St = cast<AtomicSDNode>(User);
58029 return Ld->getBasePtr() == St->getBasePtr();
58030 };
58031
58032 bool Commute = false;
58033 switch (Op.getOpcode()) {
58034 default: return false;
58035 case ISD::SIGN_EXTEND:
58036 case ISD::ZERO_EXTEND:
58037 case ISD::ANY_EXTEND:
58038 break;
58039 case ISD::SHL:
58040 case ISD::SRA:
58041 case ISD::SRL: {
58042 SDValue N0 = Op.getOperand(0);
58043 // Look out for (store (shl (load), x)).
58044 if (X86::mayFoldLoad(N0, Subtarget) && IsFoldableRMW(N0, Op))
58045 return false;
58046 break;
58047 }
58048 case ISD::ADD:
58049 case ISD::MUL:
58050 case ISD::AND:
58051 case ISD::OR:
58052 case ISD::XOR:
58053 Commute = true;
58054 [[fallthrough]];
58055 case ISD::SUB: {
58056 SDValue N0 = Op.getOperand(0);
58057 SDValue N1 = Op.getOperand(1);
58058 // Avoid disabling potential load folding opportunities.
58059 if (X86::mayFoldLoad(N1, Subtarget) &&
58060 (!Commute || !isa<ConstantSDNode>(N0) ||
58061 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
58062 return false;
58063 if (X86::mayFoldLoad(N0, Subtarget) &&
58064 ((Commute && !isa<ConstantSDNode>(N1)) ||
58065 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
58066 return false;
58067 if (IsFoldableAtomicRMW(N0, Op) ||
58068 (Commute && IsFoldableAtomicRMW(N1, Op)))
58069 return false;
58070 }
58071 }
58072
58073 PVT = MVT::i32;
58074 return true;
58075}
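// Illustrative consequence of the checks above (not in the original source): a
// register-register i16 add such as (add i16 %a, %b) reports PVT = i32 and is
// promoted, while (store p, (shl (load p), c)) is left as i16 so the load and
// store can fold into a single read-modify-write shift.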
58076
58077//===----------------------------------------------------------------------===//
58078// X86 Inline Assembly Support
58079//===----------------------------------------------------------------------===//
58080
58081// Helper to match an asm string against whitespace-separated pieces.
58082static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
58083 S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
58084
58085 for (StringRef Piece : Pieces) {
58086 if (!S.startswith(Piece)) // Check if the piece matches.
58087 return false;
58088
58089 S = S.substr(Piece.size());
58090 StringRef::size_type Pos = S.find_first_not_of(" \t");
58091 if (Pos == 0) // We matched a prefix.
58092 return false;
58093
58094 S = S.substr(Pos);
58095 }
58096
58097 return S.empty();
58098}
58099
58100static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
58101
58102 if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
58103 if (llvm::is_contained(AsmPieces, "~{cc}") &&
58104 llvm::is_contained(AsmPieces, "~{flags}") &&
58105 llvm::is_contained(AsmPieces, "~{fpsr}")) {
58106
58107 if (AsmPieces.size() == 3)
58108 return true;
58109 else if (llvm::is_contained(AsmPieces, "~{dirflag}"))
58110 return true;
58111 }
58112 }
58113 return false;
58114}
58115
58116bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
58117 InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand());
58118
58119 const std::string &AsmStr = IA->getAsmString();
58120
58121 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
58122 if (!Ty || Ty->getBitWidth() % 16 != 0)
58123 return false;
58124
58125 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
58126 SmallVector<StringRef, 4> AsmPieces;
58127 SplitString(AsmStr, AsmPieces, ";\n");
58128
58129 switch (AsmPieces.size()) {
58130 default: return false;
58131 case 1:
58132 // FIXME: this should verify that we are targeting a 486 or better. If not,
58133 // we will turn this bswap into something that will be lowered to logical
58134 // ops instead of emitting the bswap asm. For now, we don't support 486 or
58135 // lower so don't worry about this.
58136 // bswap $0
58137 if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
58138 matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
58139 matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
58140 matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
58141 matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
58142 matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
58143 // No need to check constraints, nothing other than the equivalent of
58144 // "=r,0" would be valid here.
58145 return IntrinsicLowering::LowerToByteSwap(CI);
58146 }
58147
58148 // rorw $$8, ${0:w} --> llvm.bswap.i16
58149 if (CI->getType()->isIntegerTy(16) &&
58150 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
58151 (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
58152 matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
58153 AsmPieces.clear();
58154 StringRef ConstraintsStr = IA->getConstraintString();
58155 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
58156 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
58157 if (clobbersFlagRegisters(AsmPieces))
58158 return IntrinsicLowering::LowerToByteSwap(CI);
58159 }
58160 break;
58161 case 3:
58162 if (CI->getType()->isIntegerTy(32) &&
58163 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
58164 matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
58165 matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
58166 matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
58167 AsmPieces.clear();
58168 StringRef ConstraintsStr = IA->getConstraintString();
58169 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
58170 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
58171 if (clobbersFlagRegisters(AsmPieces))
58172 return IntrinsicLowering::LowerToByteSwap(CI);
58173 }
58174
58175 if (CI->getType()->isIntegerTy(64)) {
58176 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
58177 if (Constraints.size() >= 2 &&
58178 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
58179 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
58180 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
58181 if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
58182 matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
58183 matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
58184 return IntrinsicLowering::LowerToByteSwap(CI);
58185 }
58186 }
58187 break;
58188 }
58189 return false;
58190}
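// Hypothetical user code matching the patterns above (not part of this file):
//   asm("bswap %0" : "=r"(x) : "0"(x));
// reaches this function as the single piece {"bswap", "$0"} and is replaced by
// a call to llvm.bswap.*; the rorw/rolw form and the three-instruction
// bswap/bswap/xchgl form above are handled the same way once the constraint
// string and flag-clobber checks succeed.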
58191
58192static X86::CondCode parseConstraintCode(llvm::StringRef Constraint) {
58193 X86::CondCode Cond = StringSwitch<X86::CondCode>(Constraint)
58194 .Case("{@cca}", X86::COND_A)
58195 .Case("{@ccae}", X86::COND_AE)
58196 .Case("{@ccb}", X86::COND_B)
58197 .Case("{@ccbe}", X86::COND_BE)
58198 .Case("{@ccc}", X86::COND_B)
58199 .Case("{@cce}", X86::COND_E)
58200 .Case("{@ccz}", X86::COND_E)
58201 .Case("{@ccg}", X86::COND_G)
58202 .Case("{@ccge}", X86::COND_GE)
58203 .Case("{@ccl}", X86::COND_L)
58204 .Case("{@ccle}", X86::COND_LE)
58205 .Case("{@ccna}", X86::COND_BE)
58206 .Case("{@ccnae}", X86::COND_B)
58207 .Case("{@ccnb}", X86::COND_AE)
58208 .Case("{@ccnbe}", X86::COND_A)
58209 .Case("{@ccnc}", X86::COND_AE)
58210 .Case("{@ccne}", X86::COND_NE)
58211 .Case("{@ccnz}", X86::COND_NE)
58212 .Case("{@ccng}", X86::COND_LE)
58213 .Case("{@ccnge}", X86::COND_L)
58214 .Case("{@ccnl}", X86::COND_GE)
58215 .Case("{@ccnle}", X86::COND_G)
58216 .Case("{@ccno}", X86::COND_NO)
58217 .Case("{@ccnp}", X86::COND_NP)
58218 .Case("{@ccns}", X86::COND_NS)
58219 .Case("{@cco}", X86::COND_O)
58220 .Case("{@ccp}", X86::COND_P)
58221 .Case("{@ccs}", X86::COND_S)
58222 .Default(X86::COND_INVALID);
58223 return Cond;
58224}
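// Hypothetical user code (not part of this file): the "{@cc*}" strings above
// come from flag-output inline asm constraints, e.g.
//   bool Eq;
//   asm("cmpq %2, %1" : "=@ccz"(Eq) : "r"(A), "r"(B));
// where the "=@ccz" output reaches the backend as "{@ccz}" and maps to
// X86::COND_E.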
58225
58226/// Given a constraint letter, return the type of constraint for this target.
58227X86TargetLowering::ConstraintType
58228X86TargetLowering::getConstraintType(StringRef Constraint) const {
58229 if (Constraint.size() == 1) {
58230 switch (Constraint[0]) {
58231 case 'R':
58232 case 'q':
58233 case 'Q':
58234 case 'f':
58235 case 't':
58236 case 'u':
58237 case 'y':
58238 case 'x':
58239 case 'v':
58240 case 'l':
58241 case 'k': // AVX512 masking registers.
58242 return C_RegisterClass;
58243 case 'a':
58244 case 'b':
58245 case 'c':
58246 case 'd':
58247 case 'S':
58248 case 'D':
58249 case 'A':
58250 return C_Register;
58251 case 'I':
58252 case 'J':
58253 case 'K':
58254 case 'N':
58255 case 'G':
58256 case 'L':
58257 case 'M':
58258 return C_Immediate;
58259 case 'C':
58260 case 'e':
58261 case 'Z':
58262 return C_Other;
58263 default:
58264 break;
58265 }
58266 }
58267 else if (Constraint.size() == 2) {
58268 switch (Constraint[0]) {
58269 default:
58270 break;
58271 case 'Y':
58272 switch (Constraint[1]) {
58273 default:
58274 break;
58275 case 'z':
58276 return C_Register;
58277 case 'i':
58278 case 'm':
58279 case 'k':
58280 case 't':
58281 case '2':
58282 return C_RegisterClass;
58283 }
58284 }
58285 } else if (parseConstraintCode(Constraint) != X86::COND_INVALID)
58286 return C_Other;
58287 return TargetLowering::getConstraintType(Constraint);
58288}
58289
58290/// Examine constraint type and operand type and determine a weight value.
58291/// This object must already have been set up with the operand type
58292/// and the current alternative constraint selected.
58293TargetLowering::ConstraintWeight
58294 X86TargetLowering::getSingleConstraintMatchWeight(
58295 AsmOperandInfo &info, const char *constraint) const {
58296 ConstraintWeight weight = CW_Invalid;
58297 Value *CallOperandVal = info.CallOperandVal;
58298 // If we don't have a value, we can't do a match,
58299 // but allow it at the lowest weight.
58300 if (!CallOperandVal)
58301 return CW_Default;
58302 Type *type = CallOperandVal->getType();
58303 // Look at the constraint type.
58304 switch (*constraint) {
58305 default:
58306 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
58307 [[fallthrough]];
58308 case 'R':
58309 case 'q':
58310 case 'Q':
58311 case 'a':
58312 case 'b':
58313 case 'c':
58314 case 'd':
58315 case 'S':
58316 case 'D':
58317 case 'A':
58318 if (CallOperandVal->getType()->isIntegerTy())
58319 weight = CW_SpecificReg;
58320 break;
58321 case 'f':
58322 case 't':
58323 case 'u':
58324 if (type->isFloatingPointTy())
58325 weight = CW_SpecificReg;
58326 break;
58327 case 'y':
58328 if (type->isX86_MMXTy() && Subtarget.hasMMX())
58329 weight = CW_SpecificReg;
58330 break;
58331 case 'Y':
58332 if (StringRef(constraint).size() != 2)
58333 break;
58334 switch (constraint[1]) {
58335 default:
58336 return CW_Invalid;
58337 // XMM0
58338 case 'z':
58339 if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
58340 ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()) ||
58341 ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512()))
58342 return CW_SpecificReg;
58343 return CW_Invalid;
58344 // Conditional OpMask regs (AVX512)
58345 case 'k':
58346 if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
58347 return CW_Register;
58348 return CW_Invalid;
58349 // Any MMX reg
58350 case 'm':
58351 if (type->isX86_MMXTy() && Subtarget.hasMMX())
58352 return weight;
58353 return CW_Invalid;
58354 // Any SSE reg when ISA >= SSE2, same as 'x'
58355 case 'i':
58356 case 't':
58357 case '2':
58358 if (!Subtarget.hasSSE2())
58359 return CW_Invalid;
58360 break;
58361 }
58362 break;
58363 case 'v':
58364 if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
58365 weight = CW_Register;
58366 [[fallthrough]];
58367 case 'x':
58368 if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
58369 ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()))
58370 weight = CW_Register;
58371 break;
58372 case 'k':
58373 // Enable conditional vector operations using %k<#> registers.
58374 if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
58375 weight = CW_Register;
58376 break;
58377 case 'I':
58378 if (auto *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
58379 if (C->getZExtValue() <= 31)
58380 weight = CW_Constant;
58381 }
58382 break;
58383 case 'J':
58384 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
58385 if (C->getZExtValue() <= 63)
58386 weight = CW_Constant;
58387 }
58388 break;
58389 case 'K':
58390 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
58391 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
58392 weight = CW_Constant;
58393 }
58394 break;
58395 case 'L':
58396 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
58397 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
58398 weight = CW_Constant;
58399 }
58400 break;
58401 case 'M':
58402 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
58403 if (C->getZExtValue() <= 3)
58404 weight = CW_Constant;
58405 }
58406 break;
58407 case 'N':
58408 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
58409 if (C->getZExtValue() <= 0xff)
58410 weight = CW_Constant;
58411 }
58412 break;
58413 case 'G':
58414 case 'C':
58415 if (isa<ConstantFP>(CallOperandVal)) {
58416 weight = CW_Constant;
58417 }
58418 break;
58419 case 'e':
58420 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
58421 if ((C->getSExtValue() >= -0x80000000LL) &&
58422 (C->getSExtValue() <= 0x7fffffffLL))
58423 weight = CW_Constant;
58424 }
58425 break;
58426 case 'Z':
58427 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
58428 if (C->getZExtValue() <= 0xffffffff)
58429 weight = CW_Constant;
58430 }
58431 break;
58432 }
58433 return weight;
58434}
58435
58436/// Try to replace an X constraint, which matches anything, with another that
58437/// has more specific requirements based on the type of the corresponding
58438/// operand.
58439const char *X86TargetLowering::
58440LowerXConstraint(EVT ConstraintVT) const {
58441 // FP X constraints get lowered to SSE1/2 registers if available, otherwise
58442 // 'f' like normal targets.
58443 if (ConstraintVT.isFloatingPoint()) {
58444 if (Subtarget.hasSSE1())
58445 return "x";
58446 }
58447
58448 return TargetLowering::LowerXConstraint(ConstraintVT);
58449}
58450
58451// Lower @cc targets via setcc.
58452SDValue X86TargetLowering::LowerAsmOutputForConstraint(
58453 SDValue &Chain, SDValue &Glue, const SDLoc &DL,
58454 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
58455 X86::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
58456 if (Cond == X86::COND_INVALID)
58457 return SDValue();
58458 // Check that return type is valid.
58459 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
58460 OpInfo.ConstraintVT.getSizeInBits() < 8)
58461 report_fatal_error("Glue output operand is of invalid type");
58462
58463 // Get EFLAGS register. Only update chain when copyfrom is glued.
58464 if (Glue.getNode()) {
58465 Glue = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32, Glue);
58466 Chain = Glue.getValue(1);
58467 } else
58468 Glue = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32);
58469 // Extract CC code.
58470 SDValue CC = getSETCC(Cond, Glue, DL, DAG);
58471 // Zero-extend to the constraint's integer type.
58472 SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
58473
58474 return Result;
58475}
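// Illustrative shape of the result (not in the original source): for an i32
// "=@ccz" output the sequence built above is roughly
//   Glue   = CopyFromReg(Chain, EFLAGS)          // read EFLAGS
//   CC     = X86ISD::SETCC(COND_E, Glue)         // i8 flag materialization
//   Result = zero_extend CC to i32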
58476
58477/// Lower the specified operand into the Ops vector.
58478/// If it is invalid, don't add anything to Ops.
58479void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
58480 std::string &Constraint,
58481 std::vector<SDValue>&Ops,
58482 SelectionDAG &DAG) const {
58483 SDValue Result;
58484
58485 // Only support length 1 constraints for now.
58486 if (Constraint.length() > 1) return;
58487
58488 char ConstraintLetter = Constraint[0];
58489 switch (ConstraintLetter) {
58490 default: break;
58491 case 'I':
58492 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58493 if (C->getZExtValue() <= 31) {
58494 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58495 Op.getValueType());
58496 break;
58497 }
58498 }
58499 return;
58500 case 'J':
58501 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58502 if (C->getZExtValue() <= 63) {
58503 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58504 Op.getValueType());
58505 break;
58506 }
58507 }
58508 return;
58509 case 'K':
58510 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58511 if (isInt<8>(C->getSExtValue())) {
58512 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58513 Op.getValueType());
58514 break;
58515 }
58516 }
58517 return;
58518 case 'L':
58519 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58520 if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
58521 (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
58522 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
58523 Op.getValueType());
58524 break;
58525 }
58526 }
58527 return;
58528 case 'M':
58529 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58530 if (C->getZExtValue() <= 3) {
58531 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58532 Op.getValueType());
58533 break;
58534 }
58535 }
58536 return;
58537 case 'N':
58538 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58539 if (C->getZExtValue() <= 255) {
58540 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58541 Op.getValueType());
58542 break;
58543 }
58544 }
58545 return;
58546 case 'O':
58547 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58548 if (C->getZExtValue() <= 127) {
58549 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58550 Op.getValueType());
58551 break;
58552 }
58553 }
58554 return;
58555 case 'e': {
58556 // 32-bit signed value
58557 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58558 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
58559 C->getSExtValue())) {
58560 // Widen to 64 bits here to get it sign extended.
58561 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
58562 break;
58563 }
58564 // FIXME gcc accepts some relocatable values here too, but only in certain
58565 // memory models; it's complicated.
58566 }
58567 return;
58568 }
58569 case 'Z': {
58570 // 32-bit unsigned value
58571 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58572 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
58573 C->getZExtValue())) {
58574 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58575 Op.getValueType());
58576 break;
58577 }
58578 }
58579 // FIXME gcc accepts some relocatable values here too, but only in certain
58580 // memory models; it's complicated.
58581 return;
58582 }
58583 case 'i': {
58584 // Literal immediates are always ok.
58585 if (auto *CST = dyn_cast<ConstantSDNode>(Op)) {
58586 bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1;
58587 BooleanContent BCont = getBooleanContents(MVT::i64);
58588 ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont)
58589 : ISD::SIGN_EXTEND;
58590 int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue()
58591 : CST->getSExtValue();
58592 Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64);
58593 break;
58594 }
58595
58596 // In any sort of PIC mode, addresses need to be computed at runtime by
58597 // adding in a register or via some sort of table lookup, so they can't
58598 // be used as immediates. BlockAddresses and BasicBlocks are fine though.
58599 if ((Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC()) &&
58600 !(isa<BlockAddressSDNode>(Op) || isa<BasicBlockSDNode>(Op)))
58601 return;
58602
58603 // If we are in non-pic codegen mode, we allow the address of a global (with
58604 // an optional displacement) to be used with 'i'.
58605 if (auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
58606 // If we require an extra load to get this address, as in PIC mode, we
58607 // can't accept it.
58608 if (isGlobalStubReference(
58609 Subtarget.classifyGlobalReference(GA->getGlobal())))
58610 return;
58611 break;
58612 }
58613 }
58614
58615 if (Result.getNode()) {
58616 Ops.push_back(Result);
58617 return;
58618 }
58619 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
58620}
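// Hypothetical user code (not part of this file): for
//   asm("shll %1, %0" : "+r"(x) : "I"(5));
// the 'I' case above accepts the constant (5 <= 31) and emits it as a target
// constant, while a value such as 40 falls through the early return and is
// rejected later as an invalid operand for the constraint.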
58621
58622/// Check if \p RC is a general purpose register class.
58623/// I.e., GR* or one of their variant.
58624static bool isGRClass(const TargetRegisterClass &RC) {
58625 return RC.hasSuperClassEq(&X86::GR8RegClass) ||
58626 RC.hasSuperClassEq(&X86::GR16RegClass) ||
58627 RC.hasSuperClassEq(&X86::GR32RegClass) ||
58628 RC.hasSuperClassEq(&X86::GR64RegClass) ||
58629 RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
58630}
58631
58632/// Check if \p RC is a vector register class.
58633/// I.e., FR* / VR* or one of their variant.
58634static bool isFRClass(const TargetRegisterClass &RC) {
58635 return RC.hasSuperClassEq(&X86::FR16XRegClass) ||
58636 RC.hasSuperClassEq(&X86::FR32XRegClass) ||
58637 RC.hasSuperClassEq(&X86::FR64XRegClass) ||
58638 RC.hasSuperClassEq(&X86::VR128XRegClass) ||
58639 RC.hasSuperClassEq(&X86::VR256XRegClass) ||
58640 RC.hasSuperClassEq(&X86::VR512RegClass);
58641}
58642
58643/// Check if \p RC is a mask register class.
58644/// I.e., VK* or one of their variant.
58645static bool isVKClass(const TargetRegisterClass &RC) {
58646 return RC.hasSuperClassEq(&X86::VK1RegClass) ||
58647 RC.hasSuperClassEq(&X86::VK2RegClass) ||
58648 RC.hasSuperClassEq(&X86::VK4RegClass) ||
58649 RC.hasSuperClassEq(&X86::VK8RegClass) ||
58650 RC.hasSuperClassEq(&X86::VK16RegClass) ||
58651 RC.hasSuperClassEq(&X86::VK32RegClass) ||
58652 RC.hasSuperClassEq(&X86::VK64RegClass);
58653}
58654
58655std::pair<unsigned, const TargetRegisterClass *>
58656X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
58657 StringRef Constraint,
58658 MVT VT) const {
58659 // First, see if this is a constraint that directly corresponds to an LLVM
58660 // register class.
58661 if (Constraint.size() == 1) {
58662 // GCC Constraint Letters
58663 switch (Constraint[0]) {
58664 default: break;
58665 // 'A' means [ER]AX + [ER]DX.
58666 case 'A':
58667 if (Subtarget.is64Bit())
58668 return std::make_pair(X86::RAX, &X86::GR64_ADRegClass);
58669 assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
58670        "Expecting 64, 32 or 16 bit subtarget");
58671 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
58672
58673 // TODO: Slight differences here in allocation order and leaving
58674 // RIP in the class. Do they matter any more here than they do
58675 // in the normal allocation?
58676 case 'k':
58677 if (Subtarget.hasAVX512()) {
58678 if (VT == MVT::i1)
58679 return std::make_pair(0U, &X86::VK1RegClass);
58680 if (VT == MVT::i8)
58681 return std::make_pair(0U, &X86::VK8RegClass);
58682 if (VT == MVT::i16)
58683 return std::make_pair(0U, &X86::VK16RegClass);
58684 }
58685 if (Subtarget.hasBWI()) {
58686 if (VT == MVT::i32)
58687 return std::make_pair(0U, &X86::VK32RegClass);
58688 if (VT == MVT::i64)
58689 return std::make_pair(0U, &X86::VK64RegClass);
58690 }
58691 break;
58692 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
58693 if (Subtarget.is64Bit()) {
58694 if (VT == MVT::i8 || VT == MVT::i1)
58695 return std::make_pair(0U, &X86::GR8RegClass);
58696 if (VT == MVT::i16)
58697 return std::make_pair(0U, &X86::GR16RegClass);
58698 if (VT == MVT::i32 || VT == MVT::f32)
58699 return std::make_pair(0U, &X86::GR32RegClass);
58700 if (VT != MVT::f80 && !VT.isVector())
58701 return std::make_pair(0U, &X86::GR64RegClass);
58702 break;
58703 }
58704 [[fallthrough]];
58705 // 32-bit fallthrough
58706 case 'Q': // Q_REGS
58707 if (VT == MVT::i8 || VT == MVT::i1)
58708 return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
58709 if (VT == MVT::i16)
58710 return std::make_pair(0U, &X86::GR16_ABCDRegClass);
58711 if (VT == MVT::i32 || VT == MVT::f32 ||
58712 (!VT.isVector() && !Subtarget.is64Bit()))
58713 return std::make_pair(0U, &X86::GR32_ABCDRegClass);
58714 if (VT != MVT::f80 && !VT.isVector())
58715 return std::make_pair(0U, &X86::GR64_ABCDRegClass);
58716 break;
58717 case 'r': // GENERAL_REGS
58718 case 'l': // INDEX_REGS
58719 if (VT == MVT::i8 || VT == MVT::i1)
58720 return std::make_pair(0U, &X86::GR8RegClass);
58721 if (VT == MVT::i16)
58722 return std::make_pair(0U, &X86::GR16RegClass);
58723 if (VT == MVT::i32 || VT == MVT::f32 ||
58724 (!VT.isVector() && !Subtarget.is64Bit()))
58725 return std::make_pair(0U, &X86::GR32RegClass);
58726 if (VT != MVT::f80 && !VT.isVector())
58727 return std::make_pair(0U, &X86::GR64RegClass);
58728 break;
58729 case 'R': // LEGACY_REGS
58730 if (VT == MVT::i8 || VT == MVT::i1)
58731 return std::make_pair(0U, &X86::GR8_NOREXRegClass);
58732 if (VT == MVT::i16)
58733 return std::make_pair(0U, &X86::GR16_NOREXRegClass);
58734 if (VT == MVT::i32 || VT == MVT::f32 ||
58735 (!VT.isVector() && !Subtarget.is64Bit()))
58736 return std::make_pair(0U, &X86::GR32_NOREXRegClass);
58737 if (VT != MVT::f80 && !VT.isVector())
58738 return std::make_pair(0U, &X86::GR64_NOREXRegClass);
58739 break;
58740 case 'f': // FP Stack registers.
58741 // If SSE is enabled for this VT, use f80 to ensure the isel moves the
58742 // value to the correct fpstack register class.
58743 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
58744 return std::make_pair(0U, &X86::RFP32RegClass);
58745 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
58746 return std::make_pair(0U, &X86::RFP64RegClass);
58747 if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80)
58748 return std::make_pair(0U, &X86::RFP80RegClass);
58749 break;
58750 case 'y': // MMX_REGS if MMX allowed.
58751 if (!Subtarget.hasMMX()) break;
58752 return std::make_pair(0U, &X86::VR64RegClass);
58753 case 'v':
58754 case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
58755 if (!Subtarget.hasSSE1()) break;
58756 bool VConstraint = (Constraint[0] == 'v');
58757
58758 switch (VT.SimpleTy) {
58759 default: break;
58760 // Scalar SSE types.
58761 case MVT::f16:
58762 if (VConstraint && Subtarget.hasFP16())
58763 return std::make_pair(0U, &X86::FR16XRegClass);
58764 break;
58765 case MVT::f32:
58766 case MVT::i32:
58767 if (VConstraint && Subtarget.hasVLX())
58768 return std::make_pair(0U, &X86::FR32XRegClass);
58769 return std::make_pair(0U, &X86::FR32RegClass);
58770 case MVT::f64:
58771 case MVT::i64:
58772 if (VConstraint && Subtarget.hasVLX())
58773 return std::make_pair(0U, &X86::FR64XRegClass);
58774 return std::make_pair(0U, &X86::FR64RegClass);
58775 case MVT::i128:
58776 if (Subtarget.is64Bit()) {
58777 if (VConstraint && Subtarget.hasVLX())
58778 return std::make_pair(0U, &X86::VR128XRegClass);
58779 return std::make_pair(0U, &X86::VR128RegClass);
58780 }
58781 break;
58782 // Vector types and fp128.
58783 case MVT::v8f16:
58784 if (!Subtarget.hasFP16())
58785 break;
58786 [[fallthrough]];
58787 case MVT::f128:
58788 case MVT::v16i8:
58789 case MVT::v8i16:
58790 case MVT::v4i32:
58791 case MVT::v2i64:
58792 case MVT::v4f32:
58793 case MVT::v2f64:
58794 if (VConstraint && Subtarget.hasVLX())
58795 return std::make_pair(0U, &X86::VR128XRegClass);
58796 return std::make_pair(0U, &X86::VR128RegClass);
58797 // AVX types.
58798 case MVT::v16f16:
58799 if (!Subtarget.hasFP16())
58800 break;
58801 [[fallthrough]];
58802 case MVT::v32i8:
58803 case MVT::v16i16:
58804 case MVT::v8i32:
58805 case MVT::v4i64:
58806 case MVT::v8f32:
58807 case MVT::v4f64:
58808 if (VConstraint && Subtarget.hasVLX())
58809 return std::make_pair(0U, &X86::VR256XRegClass);
58810 if (Subtarget.hasAVX())
58811 return std::make_pair(0U, &X86::VR256RegClass);
58812 break;
58813 case MVT::v32f16:
58814 if (!Subtarget.hasFP16())
58815 break;
58816 [[fallthrough]];
58817 case MVT::v64i8:
58818 case MVT::v32i16:
58819 case MVT::v8f64:
58820 case MVT::v16f32:
58821 case MVT::v16i32:
58822 case MVT::v8i64:
58823 if (!Subtarget.hasAVX512()) break;
58824 if (VConstraint)
58825 return std::make_pair(0U, &X86::VR512RegClass);
58826 return std::make_pair(0U, &X86::VR512_0_15RegClass);
58827 }
58828 break;
58829 }
58830 } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
58831 switch (Constraint[1]) {
58832 default:
58833 break;
58834 case 'i':
58835 case 't':
58836 case '2':
58837 return getRegForInlineAsmConstraint(TRI, "x", VT);
58838 case 'm':
58839 if (!Subtarget.hasMMX()) break;
58840 return std::make_pair(0U, &X86::VR64RegClass);
58841 case 'z':
58842 if (!Subtarget.hasSSE1()) break;
58843 switch (VT.SimpleTy) {
58844 default: break;
58845 // Scalar SSE types.
58846 case MVT::f16:
58847 if (!Subtarget.hasFP16())
58848 break;
58849 return std::make_pair(X86::XMM0, &X86::FR16XRegClass);
58850 case MVT::f32:
58851 case MVT::i32:
58852 return std::make_pair(X86::XMM0, &X86::FR32RegClass);
58853 case MVT::f64:
58854 case MVT::i64:
58855 return std::make_pair(X86::XMM0, &X86::FR64RegClass);
58856 case MVT::v8f16:
58857 if (!Subtarget.hasFP16())
58858 break;
58859 [[fallthrough]];
58860 case MVT::f128:
58861 case MVT::v16i8:
58862 case MVT::v8i16:
58863 case MVT::v4i32:
58864 case MVT::v2i64:
58865 case MVT::v4f32:
58866 case MVT::v2f64:
58867 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
58868 // AVX types.
58869 case MVT::v16f16:
58870 if (!Subtarget.hasFP16())
58871 break;
58872 [[fallthrough]];
58873 case MVT::v32i8:
58874 case MVT::v16i16:
58875 case MVT::v8i32:
58876 case MVT::v4i64:
58877 case MVT::v8f32:
58878 case MVT::v4f64:
58879 if (Subtarget.hasAVX())
58880 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
58881 break;
58882 case MVT::v32f16:
58883 if (!Subtarget.hasFP16())
58884 break;
58885 [[fallthrough]];
58886 case MVT::v64i8:
58887 case MVT::v32i16:
58888 case MVT::v8f64:
58889 case MVT::v16f32:
58890 case MVT::v16i32:
58891 case MVT::v8i64:
58892 if (Subtarget.hasAVX512())
58893 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
58894 break;
58895 }
58896 break;
58897 case 'k':
58898 // This register class doesn't allocate k0 for masked vector operations.
58899 if (Subtarget.hasAVX512()) {
58900 if (VT == MVT::i1)
58901 return std::make_pair(0U, &X86::VK1WMRegClass);
58902 if (VT == MVT::i8)
58903 return std::make_pair(0U, &X86::VK8WMRegClass);
58904 if (VT == MVT::i16)
58905 return std::make_pair(0U, &X86::VK16WMRegClass);
58906 }
58907 if (Subtarget.hasBWI()) {
58908 if (VT == MVT::i32)
58909 return std::make_pair(0U, &X86::VK32WMRegClass);
58910 if (VT == MVT::i64)
58911 return std::make_pair(0U, &X86::VK64WMRegClass);
58912 }
58913 break;
58914 }
58915 }
58916
58917 if (parseConstraintCode(Constraint) != X86::COND_INVALID)
58918 return std::make_pair(0U, &X86::GR32RegClass);
58919
58920 // Use the default implementation in TargetLowering to convert the register
58921 // constraint into a member of a register class.
58922 std::pair<Register, const TargetRegisterClass*> Res;
58923 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
58924
58925 // Not found as a standard register?
58926 if (!Res.second) {
58927 // Only match x87 registers if the VT is one SelectionDAGBuilder can convert
58928 // to/from f80.
58929 if (VT == MVT::Other || VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80) {
58930 // Map "st(0)" .. "st(7)" to the corresponding FP0 .. FP7 registers.
58931 if (Constraint.size() == 7 && Constraint[0] == '{' &&
58932 tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' &&
58933 Constraint[3] == '(' &&
58934 (Constraint[4] >= '0' && Constraint[4] <= '7') &&
58935 Constraint[5] == ')' && Constraint[6] == '}') {
58936 // st(7) is not allocatable and thus not a member of RFP80. Return
58937 // singleton class in cases where we have a reference to it.
58938 if (Constraint[4] == '7')
58939 return std::make_pair(X86::FP7, &X86::RFP80_7RegClass);
58940 return std::make_pair(X86::FP0 + Constraint[4] - '0',
58941 &X86::RFP80RegClass);
58942 }
58943
58944 // GCC allows "st(0)" to be called just plain "st".
58945 if (StringRef("{st}").equals_insensitive(Constraint))
58946 return std::make_pair(X86::FP0, &X86::RFP80RegClass);
58947 }
58948
58949 // flags -> EFLAGS
58950 if (StringRef("{flags}").equals_insensitive(Constraint))
58951 return std::make_pair(X86::EFLAGS, &X86::CCRRegClass);
58952
58953 // dirflag -> DF
58954 // Only allow for clobber.
58955 if (StringRef("{dirflag}").equals_insensitive(Constraint) &&
58956 VT == MVT::Other)
58957 return std::make_pair(X86::DF, &X86::DFCCRRegClass);
58958
58959 // fpsr -> FPSW
58960 if (StringRef("{fpsr}").equals_insensitive(Constraint))
58961 return std::make_pair(X86::FPSW, &X86::FPCCRRegClass);
58962
58963 return Res;
58964 }
58965
58966 // Make sure it isn't a register that requires 64-bit mode.
58967 if (!Subtarget.is64Bit() &&
58968 (isFRClass(*Res.second) || isGRClass(*Res.second)) &&
58969 TRI->getEncodingValue(Res.first) >= 8) {
58970 // Register requires REX prefix, but we're in 32-bit mode.
58971 return std::make_pair(0, nullptr);
58972 }
58973
58974 // Make sure it isn't a register that requires AVX512.
58975 if (!Subtarget.hasAVX512() && isFRClass(*Res.second) &&
58976 TRI->getEncodingValue(Res.first) & 0x10) {
58977 // Register requires EVEX prefix.
58978 return std::make_pair(0, nullptr);
58979 }
58980
58981 // Otherwise, check to see if this is a register class of the wrong value
58982 // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
58983 // turn into {ax},{dx}.
58984 // MVT::Other is used to specify clobber names.
58985 if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
58986 return Res; // Correct type already, nothing to do.
58987
58988 // Get a matching integer register of the correct size, i.e. "ax" with
58989 // MVT::i32 should return "eax". This should even work for things like
58990 // getting 64-bit integer registers when given an f64 type.
58991 const TargetRegisterClass *Class = Res.second;
58992 // The generic code will match the first register class that contains the
58993 // given register. Thus, based on the ordering of the tablegened file,
58994 // the "plain" GR classes might not come first.
58995 // Therefore, use a helper method.
58996 if (isGRClass(*Class)) {
58997 unsigned Size = VT.getSizeInBits();
58998 if (Size == 1) Size = 8;
58999 if (Size != 8 && Size != 16 && Size != 32 && Size != 64)
59000 return std::make_pair(0, nullptr);
59001 Register DestReg = getX86SubSuperRegister(Res.first, Size);
59002 if (DestReg.isValid()) {
59003 bool is64Bit = Subtarget.is64Bit();
59004 const TargetRegisterClass *RC =
59005 Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
59006 : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
59007 : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
59008 : /*Size == 64*/ (is64Bit ? &X86::GR64RegClass : nullptr);
59009 if (Size == 64 && !is64Bit) {
59010 // Model GCC's behavior here and select a fixed pair of 32-bit
59011 // registers.
59012 switch (DestReg) {
59013 case X86::RAX:
59014 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
59015 case X86::RDX:
59016 return std::make_pair(X86::EDX, &X86::GR32_DCRegClass);
59017 case X86::RCX:
59018 return std::make_pair(X86::ECX, &X86::GR32_CBRegClass);
59019 case X86::RBX:
59020 return std::make_pair(X86::EBX, &X86::GR32_BSIRegClass);
59021 case X86::RSI:
59022 return std::make_pair(X86::ESI, &X86::GR32_SIDIRegClass);
59023 case X86::RDI:
59024 return std::make_pair(X86::EDI, &X86::GR32_DIBPRegClass);
59025 case X86::RBP:
59026 return std::make_pair(X86::EBP, &X86::GR32_BPSPRegClass);
59027 default:
59028 return std::make_pair(0, nullptr);
59029 }
59030 }
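// For example, "{rax}" with an i64 operand in 32-bit mode resolves to EAX in
// GR32_ADRegClass; as the comment above says, the class encodes the fixed
// 32-bit register pair (EAX/EDX in this case) that GCC would use for the
// 64-bit value.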
59031 if (RC && RC->contains(DestReg))
59032 return std::make_pair(DestReg, RC);
59033 return Res;
59034 }
59035 // No register found/type mismatch.
59036 return std::make_pair(0, nullptr);
59037 } else if (isFRClass(*Class)) {
59038 // Handle references to XMM physical registers that got mapped into the
59039 // wrong class. This can happen with constraints like {xmm0} where the
59040 // target independent register mapper will just pick the first match it can
59041 // find, ignoring the required type.
59042
59043 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
59044 if (VT == MVT::f16)
59045 Res.second = &X86::FR16XRegClass;
59046 else if (VT == MVT::f32 || VT == MVT::i32)
59047 Res.second = &X86::FR32XRegClass;
59048 else if (VT == MVT::f64 || VT == MVT::i64)
59049 Res.second = &X86::FR64XRegClass;
59050 else if (TRI->isTypeLegalForClass(X86::VR128XRegClass, VT))
59051 Res.second = &X86::VR128XRegClass;
59052 else if (TRI->isTypeLegalForClass(X86::VR256XRegClass, VT))
59053 Res.second = &X86::VR256XRegClass;
59054 else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
59055 Res.second = &X86::VR512RegClass;
59056 else {
59057 // Type mismatch and not a clobber: return an error.
59058 Res.first = 0;
59059 Res.second = nullptr;
59060 }
59061 } else if (isVKClass(*Class)) {
59062 if (VT == MVT::i1)
59063 Res.second = &X86::VK1RegClass;
59064 else if (VT == MVT::i8)
59065 Res.second = &X86::VK8RegClass;
59066 else if (VT == MVT::i16)
59067 Res.second = &X86::VK16RegClass;
59068 else if (VT == MVT::i32)
59069 Res.second = &X86::VK32RegClass;
59070 else if (VT == MVT::i64)
59071 Res.second = &X86::VK64RegClass;
59072 else {
59073 // Type mismatch and not a clobber: return an error.
59074 Res.first = 0;
59075 Res.second = nullptr;
59076 }
59077 }
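// As with the FR classes above, only Res.second is adjusted to a mask-register
// class that is legal for VT (e.g. "{k1}" used as i16 ends up in VK16RegClass);
// the physical register picked by the generic matcher stays in Res.first.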
59078
59079 return Res;
59080}
59081
59082bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
59083 // Integer division on x86 is expensive. However, when aggressively optimizing
59084 // for code size, we prefer to use a div instruction, as it is usually smaller
59085 // than the alternative sequence.
59086 // The exception to this is vector division. Since x86 doesn't have vector
59087 // integer division, leaving the division as-is is a loss even in terms of
59088 // size, because it will have to be scalarized, while the alternative code
59089 // sequence can be performed in vector form.
59090 bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
59091 return OptSize && !VT.isVector();
59092}
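// Note (summary, not an exhaustive list of callers): the generic DAG combines
// consult this hook before expanding a division by a constant, so returning
// true under minsize keeps the small div/idiv instruction instead of the
// larger multiply-by-magic-constant sequence.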
59093
59094void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
59095 if (!Subtarget.is64Bit())
59096 return;
59097
59098 // Update IsSplitCSR in X86MachineFunctionInfo.
59099 X86MachineFunctionInfo *AFI =
59100 Entry->getParent()->getInfo<X86MachineFunctionInfo>();
59101 AFI->setIsSplitCSR(true);
59102}
59103
59104void X86TargetLowering::insertCopiesSplitCSR(
59105 MachineBasicBlock *Entry,
59106 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
59107 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
59108 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
59109 if (!IStart)
59110 return;
59111
59112 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
59113 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
59114 MachineBasicBlock::iterator MBBI = Entry->begin();
59115 for (const MCPhysReg *I = IStart; *I; ++I) {
59116 const TargetRegisterClass *RC = nullptr;
59117 if (X86::GR64RegClass.contains(*I))
59118 RC = &X86::GR64RegClass;
59119 else
59120 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
59121
59122 Register NewVR = MRI->createVirtualRegister(RC);
59123 // Create copy from CSR to a virtual register.
59124 // FIXME: this currently does not emit CFI pseudo-instructions, it works
59125 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
59126 // nounwind. If we want to generalize this later, we may need to emit
59127 // CFI pseudo-instructions.
59128 assert(
59129 Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) &&
59130 "Function should be nounwind in insertCopiesSplitCSR!");
59131 Entry->addLiveIn(*I);
59132 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
59133 .addReg(*I);
59134
59135 // Insert the copy-back instructions right before the terminator.
59136 for (auto *Exit : Exits)
59137 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
59138 TII->get(TargetOpcode::COPY), *I)
59139 .addReg(NewVR);
59140 }
59141}
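// Taken together, initializeSplitCSR and insertCopiesSplitCSR implement the
// split-CSR scheme used for CXX_FAST_TLS functions: each callee-saved GR64 is
// copied into a virtual register at the entry block and copied back before
// every exit, so the register allocator, rather than the prologue/epilogue,
// decides where (and whether) the register is actually spilled.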
59142
59143bool X86TargetLowering::supportSwiftError() const {
59144 return Subtarget.is64Bit();
59145}
59146
59147MachineInstr *
59148X86TargetLowering::EmitKCFICheck(MachineBasicBlock &MBB,
59149 MachineBasicBlock::instr_iterator &MBBI,
59150 const TargetInstrInfo *TII) const {
59151 assert(MBBI->isCall() && MBBI->getCFIType() &&
59152 "Invalid call instruction for a KCFI check");
59153
59154 MachineFunction &MF = *MBB.getParent();
59155 // If the call target is a memory operand, unfold it and use R11 for the
59156 // call, so KCFI_CHECK won't have to recompute the address.
59157 switch (MBBI->getOpcode()) {
59158 case X86::CALL64m:
59159 case X86::CALL64m_NT:
59160 case X86::TAILJMPm64:
59161 case X86::TAILJMPm64_REX: {
59162 MachineBasicBlock::instr_iterator OrigCall = MBBI;
59163 SmallVector<MachineInstr *, 2> NewMIs;
59164 if (!TII->unfoldMemoryOperand(MF, *OrigCall, X86::R11, /*UnfoldLoad=*/true,
59165 /*UnfoldStore=*/false, NewMIs))
59166 report_fatal_error("Failed to unfold memory operand for a KCFI check");
59167 for (auto *NewMI : NewMIs)
59168 MBBI = MBB.insert(OrigCall, NewMI);
59169 assert(MBBI->isCall() &&
59170 "Unexpected instruction after memory operand unfolding");
59171 if (OrigCall->shouldUpdateCallSiteInfo())
59172 MF.moveCallSiteInfo(&*OrigCall, &*MBBI);
59173 MBBI->setCFIType(MF, OrigCall->getCFIType());
59174 OrigCall->eraseFromParent();
59175 break;
59176 }
59177 default:
59178 break;
59179 }
59180
59181 MachineOperand &Target = MBBI->getOperand(0);
59182 Register TargetReg;
59183 switch (MBBI->getOpcode()) {
59184 case X86::CALL64r:
59185 case X86::CALL64r_NT:
59186 case X86::TAILJMPr64:
59187 case X86::TAILJMPr64_REX:
59188 assert(Target.isReg() && "Unexpected target operand for an indirect call");
59189 Target.setIsRenamable(false);
59190 TargetReg = Target.getReg();
59191 break;
59192 case X86::CALL64pcrel32:
59193 case X86::TAILJMPd64:
59194 assert(Target.isSymbol() && "Unexpected target operand for a direct call");
59195 // X86TargetLowering::EmitLoweredIndirectThunk always uses r11 for
59196 // 64-bit indirect thunk calls.
59197 assert(StringRef(Target.getSymbolName()).endswith("_r11") &&
59198 "Unexpected register for an indirect thunk call");
59199 TargetReg = X86::R11;
59200 break;
59201 default:
59202 llvm_unreachable("Unexpected CFI call opcode");
59203 break;
59204 }
59205
59206 return BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(X86::KCFI_CHECK))
59207 .addReg(TargetReg)
59208 .addImm(MBBI->getCFIType())
59209 .getInstr();
59210}
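// The KCFI_CHECK pseudo built here carries only the call-target register and
// the expected KCFI type id; it is expanded later (outside this function) into
// the actual compare-and-trap against the type hash emitted in front of the
// callee. This hook therefore just normalizes the call so its target lives in
// a register (R11 for unfolded memory calls and for indirect thunks).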
59211
59212/// Returns true if stack probing through a function call is requested.
59213bool X86TargetLowering::hasStackProbeSymbol(const MachineFunction &MF) const {
59214 return !getStackProbeSymbolName(MF).empty();
59215}
59216
59217/// Returns true if stack probing through inline assembly is requested.
59218bool X86TargetLowering::hasInlineStackProbe(const MachineFunction &MF) const {
59219
59220 // No inline stack probes on Windows; it has its own mechanism.
59221 if (Subtarget.isOSWindows() ||
59222 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
59223 return false;
59224
59225 // If the function specifically requests inline stack probes, emit them.
59226 if (MF.getFunction().hasFnAttribute("probe-stack"))
59227 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
59228 "inline-asm";
59229
59230 return false;
59231}
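// For reference, a function requests inline probes via the IR attribute
// "probe-stack"="inline-asm", e.g. (hypothetical IR):
//   define void @f() #0 { ... }
//   attributes #0 = { "probe-stack"="inline-asm" }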
59232
59233/// Returns the name of the symbol used to emit stack probes or the empty
59234/// string if not applicable.
59235StringRef
59236X86TargetLowering::getStackProbeSymbolName(const MachineFunction &MF) const {
59237 // Inline stack probes disable the stack probe call.
59238 if (hasInlineStackProbe(MF))
59239 return "";
59240
59241 // If the function specifically requests stack probes, emit them.
59242 if (MF.getFunction().hasFnAttribute("probe-stack"))
59243 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();
59244
59245 // Generally, if we aren't on Windows, the platform ABI does not include
59246 // support for stack probes, so don't emit them.
59247 if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO() ||
59248 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
59249 return "";
59250
59251 // We need a stack probe to conform to the Windows ABI. Choose the right
59252 // symbol.
59253 if (Subtarget.is64Bit())
59254 return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
59255 return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
59256}
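// Summary of the probe symbols chosen above:
//   64-bit Cygwin/MinGW  -> "___chkstk_ms"
//   64-bit other Windows -> "__chkstk"
//   32-bit Cygwin/MinGW  -> "_alloca"
//   32-bit other Windows -> "_chkstk"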
59257
59258unsigned
59259X86TargetLowering::getStackProbeSize(const MachineFunction &MF) const {
59260 // The default stack probe size is 4096 if the function has no stackprobesize
59261 // attribute.
59262 return MF.getFunction().getFnAttributeAsParsedInteger("stack-probe-size",
59263 4096);
59264}
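// For example, a function carrying "stack-probe-size"="8192" is probed every
// 8192 bytes of stack growth, while functions without the attribute fall back
// to the 4096-byte default above.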
59265
59266Align X86TargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
59267 if (ML->isInnermost() &&
59268 ExperimentalPrefInnermostLoopAlignment.getNumOccurrences())
59269 return Align(1ULL << ExperimentalPrefInnermostLoopAlignment);
59270 return TargetLowering::getPrefLoopAlignment();
59271}