Bug Summary

File: build/source/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
Warning: line 1178, column 10
Called C++ object pointer is null
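For context: "Called C++ object pointer is null" is the message typically emitted by the analyzer's core.CallAndMessage checker when a member function is called through a pointer that can be null on at least one feasible path. The sketch below is a minimal, hypothetical illustration of that defect class only; the names (Node, findNode, opcodeOf) are invented for the example and do not reproduce the actual path the analyzer traced through SelectionDAGNodes.h line 1178.

// Minimal sketch of the "Called C++ object pointer is null" defect class.
// All names are hypothetical; this is not the traced LLVM code path.
struct Node {
  int getOpcode() const { return Opcode; }
  int Opcode = 0;
};

// May legitimately return nullptr when nothing matches.
Node *findNode(bool Present) {
  static Node N;
  return Present ? &N : nullptr;
}

int opcodeOf(bool Present) {
  Node *N = findNode(Present);
  // On the Present == false path N is null, so the call below dereferences
  // a null pointer; the analyzer reports the call site, not the origin.
  return N->getOpcode(); // warning: Called C++ object pointer is null
}

// Guarded variant the analyzer accepts.
int opcodeOrDefault(bool Present) {
  if (Node *N = findNode(Present))
    return N->getOpcode();
  return -1;
}

In reports of this kind, the usual remedy is either an explicit null check before the call or an assertion documenting why the pointer cannot be null on that path.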

Annotated Source Code


clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name X86ISelLowering.cpp -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/source/build-llvm/tools/clang/stage2-bins -resource-dir /usr/lib/llvm-17/lib/clang/17 -D _DEBUG -D _GLIBCXX_ASSERTIONS -D _GNU_SOURCE -D _LIBCPP_ENABLE_ASSERTIONS -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I lib/Target/X86 -I /build/source/llvm/lib/Target/X86 -I include -I /build/source/llvm/include -D _FORTIFY_SOURCE=2 -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-17/lib/clang/17/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -fmacro-prefix-map=/build/source/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fmacro-prefix-map=/build/source/= -fcoverage-prefix-map=/build/source/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fcoverage-prefix-map=/build/source/= -source-date-epoch 1683717183 -O2 -Wno-unused-command-line-argument -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -Wno-misleading-indentation -std=c++17 -fdeprecated-macro -fdebug-compilation-dir=/build/source/build-llvm/tools/clang/stage2-bins -fdebug-prefix-map=/build/source/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fdebug-prefix-map=/build/source/= -ferror-limit 19 -fvisibility=hidden -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fcolor-diagnostics -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2023-05-10-133810-16478-1 -x c++ /build/source/llvm/lib/Target/X86/X86ISelLowering.cpp

/build/source/llvm/lib/Target/X86/X86ISelLowering.cpp

1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that X86 uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "X86ISelLowering.h"
15#include "MCTargetDesc/X86ShuffleDecode.h"
16#include "X86.h"
17#include "X86CallingConv.h"
18#include "X86FrameLowering.h"
19#include "X86InstrBuilder.h"
20#include "X86IntrinsicsInfo.h"
21#include "X86MachineFunctionInfo.h"
22#include "X86TargetMachine.h"
23#include "X86TargetObjectFile.h"
24#include "llvm/ADT/SmallBitVector.h"
25#include "llvm/ADT/SmallSet.h"
26#include "llvm/ADT/Statistic.h"
27#include "llvm/ADT/StringExtras.h"
28#include "llvm/ADT/StringSwitch.h"
29#include "llvm/Analysis/BlockFrequencyInfo.h"
30#include "llvm/Analysis/ObjCARCUtil.h"
31#include "llvm/Analysis/ProfileSummaryInfo.h"
32#include "llvm/Analysis/VectorUtils.h"
33#include "llvm/CodeGen/IntrinsicLowering.h"
34#include "llvm/CodeGen/MachineFrameInfo.h"
35#include "llvm/CodeGen/MachineFunction.h"
36#include "llvm/CodeGen/MachineInstrBuilder.h"
37#include "llvm/CodeGen/MachineJumpTableInfo.h"
38#include "llvm/CodeGen/MachineLoopInfo.h"
39#include "llvm/CodeGen/MachineModuleInfo.h"
40#include "llvm/CodeGen/MachineRegisterInfo.h"
41#include "llvm/CodeGen/TargetLowering.h"
42#include "llvm/CodeGen/WinEHFuncInfo.h"
43#include "llvm/IR/CallingConv.h"
44#include "llvm/IR/Constants.h"
45#include "llvm/IR/DerivedTypes.h"
46#include "llvm/IR/DiagnosticInfo.h"
47#include "llvm/IR/EHPersonalities.h"
48#include "llvm/IR/Function.h"
49#include "llvm/IR/GlobalAlias.h"
50#include "llvm/IR/GlobalVariable.h"
51#include "llvm/IR/IRBuilder.h"
52#include "llvm/IR/Instructions.h"
53#include "llvm/IR/Intrinsics.h"
54#include "llvm/IR/PatternMatch.h"
55#include "llvm/MC/MCAsmInfo.h"
56#include "llvm/MC/MCContext.h"
57#include "llvm/MC/MCExpr.h"
58#include "llvm/MC/MCSymbol.h"
59#include "llvm/Support/CommandLine.h"
60#include "llvm/Support/Debug.h"
61#include "llvm/Support/ErrorHandling.h"
62#include "llvm/Support/KnownBits.h"
63#include "llvm/Support/MathExtras.h"
64#include "llvm/Target/TargetOptions.h"
65#include <algorithm>
66#include <bitset>
67#include <cctype>
68#include <numeric>
69using namespace llvm;
70
71#define DEBUG_TYPE "x86-isel"
72
73STATISTIC(NumTailCalls, "Number of tail calls");
74
75static cl::opt<int> ExperimentalPrefInnermostLoopAlignment(
76 "x86-experimental-pref-innermost-loop-alignment", cl::init(4),
77 cl::desc(
78 "Sets the preferable loop alignment for experiments (as log2 bytes) "
79 "for innermost loops only. If specified, this option overrides "
80 "alignment set by x86-experimental-pref-loop-alignment."),
81 cl::Hidden);
82
83static cl::opt<bool> MulConstantOptimization(
84 "mul-constant-optimization", cl::init(true),
85 cl::desc("Replace 'mul x, Const' with more effective instructions like "
86 "SHIFT, LEA, etc."),
87 cl::Hidden);
88
89static cl::opt<bool> ExperimentalUnorderedISEL(
90 "x86-experimental-unordered-atomic-isel", cl::init(false),
91 cl::desc("Use LoadSDNode and StoreSDNode instead of "
92 "AtomicSDNode for unordered atomic loads and "
93 "stores respectively."),
94 cl::Hidden);
95
96/// Call this when the user attempts to do something unsupported, like
97/// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
98/// report_fatal_error, so calling code should attempt to recover without
99/// crashing.
100static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
101 const char *Msg) {
102 MachineFunction &MF = DAG.getMachineFunction();
103 DAG.getContext()->diagnose(
104 DiagnosticInfoUnsupported(MF.getFunction(), Msg, dl.getDebugLoc()));
105}
106
107/// Returns true if a CC can dynamically exclude a register from the list of
108/// callee-saved-registers (TargetRegisterInfo::getCalleeSavedRegs()) based on
109/// the return registers.
110static bool shouldDisableRetRegFromCSR(CallingConv::ID CC) {
111 switch (CC) {
112 default:
113 return false;
114 case CallingConv::X86_RegCall:
115 case CallingConv::PreserveMost:
116 case CallingConv::PreserveAll:
117 return true;
118 }
119}
120
121/// Returns true if a CC can dynamically exclude a register from the list of
122/// callee-saved-registers (TargetRegisterInfo::getCalleeSavedRegs()) based on
123/// the parameters.
124static bool shouldDisableArgRegFromCSR(CallingConv::ID CC) {
125 return CC == CallingConv::X86_RegCall;
126}
127
128X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
129 const X86Subtarget &STI)
130 : TargetLowering(TM), Subtarget(STI) {
131 bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
132 MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
133
134 // Set up the TargetLowering object.
135
136 // X86 is weird. It always uses i8 for shift amounts and setcc results.
137 setBooleanContents(ZeroOrOneBooleanContent);
138 // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
139 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
140
141 // For 64-bit, since we have so many registers, use the ILP scheduler.
142 // For 32-bit, use the register pressure specific scheduling.
143 // For Atom, always use ILP scheduling.
144 if (Subtarget.isAtom())
145 setSchedulingPreference(Sched::ILP);
146 else if (Subtarget.is64Bit())
147 setSchedulingPreference(Sched::ILP);
148 else
149 setSchedulingPreference(Sched::RegPressure);
150 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
151 setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
152
153 // Bypass expensive divides and use cheaper ones.
154 if (TM.getOptLevel() >= CodeGenOpt::Default) {
155 if (Subtarget.hasSlowDivide32())
156 addBypassSlowDiv(32, 8);
157 if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
158 addBypassSlowDiv(64, 32);
159 }
160
161 // Setup Windows compiler runtime calls.
162 if (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()) {
163 static const struct {
164 const RTLIB::Libcall Op;
165 const char * const Name;
166 const CallingConv::ID CC;
167 } LibraryCalls[] = {
168 { RTLIB::SDIV_I64, "_alldiv", CallingConv::X86_StdCall },
169 { RTLIB::UDIV_I64, "_aulldiv", CallingConv::X86_StdCall },
170 { RTLIB::SREM_I64, "_allrem", CallingConv::X86_StdCall },
171 { RTLIB::UREM_I64, "_aullrem", CallingConv::X86_StdCall },
172 { RTLIB::MUL_I64, "_allmul", CallingConv::X86_StdCall },
173 };
174
175 for (const auto &LC : LibraryCalls) {
176 setLibcallName(LC.Op, LC.Name);
177 setLibcallCallingConv(LC.Op, LC.CC);
178 }
179 }
180
181 if (Subtarget.getTargetTriple().isOSMSVCRT()) {
182 // MSVCRT doesn't have powi; fall back to pow
183 setLibcallName(RTLIB::POWI_F32, nullptr);
184 setLibcallName(RTLIB::POWI_F64, nullptr);
185 }
186
187 // If we don't have cmpxchg8b (meaning this is a 386/486), limit atomic size to
188 // 32 bits so the AtomicExpandPass will expand it so we don't need cmpxchg8b.
189 // FIXME: Should we be limiting the atomic size on other configs? Default is
190 // 1024.
191 if (!Subtarget.canUseCMPXCHG8B())
192 setMaxAtomicSizeInBitsSupported(32);
193
194 setMaxDivRemBitWidthSupported(Subtarget.is64Bit() ? 128 : 64);
195
196 setMaxLargeFPConvertBitWidthSupported(128);
197
198 // Set up the register classes.
199 addRegisterClass(MVT::i8, &X86::GR8RegClass);
200 addRegisterClass(MVT::i16, &X86::GR16RegClass);
201 addRegisterClass(MVT::i32, &X86::GR32RegClass);
202 if (Subtarget.is64Bit())
203 addRegisterClass(MVT::i64, &X86::GR64RegClass);
204
205 for (MVT VT : MVT::integer_valuetypes())
206 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
207
208 // We don't accept any truncstore of integer registers.
209 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
210 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
211 setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
212 setTruncStoreAction(MVT::i32, MVT::i16, Expand);
213 setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
214 setTruncStoreAction(MVT::i16, MVT::i8, Expand);
215
216 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
217
218 // SETOEQ and SETUNE require checking two conditions.
219 for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) {
220 setCondCodeAction(ISD::SETOEQ, VT, Expand);
221 setCondCodeAction(ISD::SETUNE, VT, Expand);
222 }
223
224 // Integer absolute.
225 if (Subtarget.canUseCMOV()) {
226 setOperationAction(ISD::ABS , MVT::i16 , Custom);
227 setOperationAction(ISD::ABS , MVT::i32 , Custom);
228 if (Subtarget.is64Bit())
229 setOperationAction(ISD::ABS , MVT::i64 , Custom);
230 }
231
232 // Absolute difference.
233 for (auto Op : {ISD::ABDS, ISD::ABDU}) {
234 setOperationAction(Op , MVT::i8 , Custom);
235 setOperationAction(Op , MVT::i16 , Custom);
236 setOperationAction(Op , MVT::i32 , Custom);
237 if (Subtarget.is64Bit())
238 setOperationAction(Op , MVT::i64 , Custom);
239 }
240
241 // Signed saturation subtraction.
242 setOperationAction(ISD::SSUBSAT , MVT::i8 , Custom);
243 setOperationAction(ISD::SSUBSAT , MVT::i16 , Custom);
244 setOperationAction(ISD::SSUBSAT , MVT::i32 , Custom);
245 if (Subtarget.is64Bit())
246 setOperationAction(ISD::SSUBSAT , MVT::i64 , Custom);
247
248 // Funnel shifts.
249 for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
250 // For slow shld targets we only lower for code size.
251 LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;
252
253 setOperationAction(ShiftOp , MVT::i8 , Custom);
254 setOperationAction(ShiftOp , MVT::i16 , Custom);
255 setOperationAction(ShiftOp , MVT::i32 , ShiftDoubleAction);
256 if (Subtarget.is64Bit())
257 setOperationAction(ShiftOp , MVT::i64 , ShiftDoubleAction);
258 }
259
260 if (!Subtarget.useSoftFloat()) {
261 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
262 // operation.
263 setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
264 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i8, Promote);
265 setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
266 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i16, Promote);
267 // We have an algorithm for SSE2, and we turn this into a 64-bit
268 // FILD or VCVTUSI2SS/SD for other targets.
269 setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
270 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);
271 // We have an algorithm for SSE2->double, and we turn this into a
272 // 64-bit FILD followed by conditional FADD for other targets.
273 setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
274 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);
275
276 // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
277 // this operation.
278 setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);
279 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i8, Promote);
280 // SSE has no i16 to fp conversion, only i32. We promote in the handler
281 // to allow f80 to use i16 and f64 to use i16 with sse1 only
282 setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
283 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i16, Custom);
284 // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not
285 setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
286 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);
287 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
288 // are Legal, f80 is custom lowered.
289 setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
290 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);
291
292 // Promote i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
293 // this operation.
294 setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);
295 // FIXME: This doesn't generate invalid exception when it should. PR44019.
296 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i8, Promote);
297 setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
298 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i16, Custom);
299 setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
300 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
301 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
302 // are Legal, f80 is custom lowered.
303 setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
304 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);
305
306 // Handle FP_TO_UINT by promoting the destination to a larger signed
307 // conversion.
308 setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
309 // FIXME: This doesn't generate invalid exception when it should. PR44019.
310 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i8, Promote);
311 setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
312 // FIXME: This doesn't generate invalid exception when it should. PR44019.
313 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i16, Promote);
314 setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
315 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
316 setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
317 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom);
318
319 setOperationAction(ISD::LRINT, MVT::f32, Custom);
320 setOperationAction(ISD::LRINT, MVT::f64, Custom);
321 setOperationAction(ISD::LLRINT, MVT::f32, Custom);
322 setOperationAction(ISD::LLRINT, MVT::f64, Custom);
323
324 if (!Subtarget.is64Bit()) {
325 setOperationAction(ISD::LRINT, MVT::i64, Custom);
326 setOperationAction(ISD::LLRINT, MVT::i64, Custom);
327 }
328 }
329
330 if (Subtarget.hasSSE2()) {
331 // Custom lowering for saturating float to int conversions.
332 // We handle promotion to larger result types manually.
333 for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) {
334 setOperationAction(ISD::FP_TO_UINT_SAT, VT, Custom);
335 setOperationAction(ISD::FP_TO_SINT_SAT, VT, Custom);
336 }
337 if (Subtarget.is64Bit()) {
338 setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom);
339 setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom);
340 }
341 }
342
343 // Handle address space casts between mixed sized pointers.
344 setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
345 setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
346
347 // TODO: when we have SSE, these could be more efficient, by using movd/movq.
348 if (!Subtarget.hasSSE2()) {
349 setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
350 setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
351 if (Subtarget.is64Bit()) {
352 setOperationAction(ISD::BITCAST , MVT::f64 , Expand);
353 // Without SSE, i64->f64 goes through memory.
354 setOperationAction(ISD::BITCAST , MVT::i64 , Expand);
355 }
356 } else if (!Subtarget.is64Bit())
357 setOperationAction(ISD::BITCAST , MVT::i64 , Custom);
358
359 // Scalar integer divide and remainder are lowered to use operations that
360 // produce two results, to match the available instructions. This exposes
361 // the two-result form to trivial CSE, which is able to combine x/y and x%y
362 // into a single instruction.
363 //
364 // Scalar integer multiply-high is also lowered to use two-result
365 // operations, to match the available instructions. However, plain multiply
366 // (low) operations are left as Legal, as there are single-result
367 // instructions for this in x86. Using the two-result multiply instructions
368 // when both high and low results are needed must be arranged by dagcombine.
369 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
370 setOperationAction(ISD::MULHS, VT, Expand);
371 setOperationAction(ISD::MULHU, VT, Expand);
372 setOperationAction(ISD::SDIV, VT, Expand);
373 setOperationAction(ISD::UDIV, VT, Expand);
374 setOperationAction(ISD::SREM, VT, Expand);
375 setOperationAction(ISD::UREM, VT, Expand);
376 }
377
378 setOperationAction(ISD::BR_JT , MVT::Other, Expand);
379 setOperationAction(ISD::BRCOND , MVT::Other, Custom);
380 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
381 MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
382 setOperationAction(ISD::BR_CC, VT, Expand);
383 setOperationAction(ISD::SELECT_CC, VT, Expand);
384 }
385 if (Subtarget.is64Bit())
386 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
387 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal);
388 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
389 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
390
391 setOperationAction(ISD::FREM , MVT::f32 , Expand);
392 setOperationAction(ISD::FREM , MVT::f64 , Expand);
393 setOperationAction(ISD::FREM , MVT::f80 , Expand);
394 setOperationAction(ISD::FREM , MVT::f128 , Expand);
395
396 if (!Subtarget.useSoftFloat() && Subtarget.hasX87()) {
397 setOperationAction(ISD::GET_ROUNDING , MVT::i32 , Custom);
398 setOperationAction(ISD::SET_ROUNDING , MVT::Other, Custom);
399 }
400
401 // Promote the i8 variants and force them on up to i32 which has a shorter
402 // encoding.
403 setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
404 setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
405 // Promoted i16. tzcntw has a false dependency on Intel CPUs. For BSF, we emit
406 // a REP prefix to encode it as TZCNT for modern CPUs so it makes sense to
407 // promote that too.
408 setOperationPromotedToType(ISD::CTTZ , MVT::i16 , MVT::i32);
409 setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , MVT::i32);
410
411 if (!Subtarget.hasBMI()) {
412 setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
413 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Legal);
414 if (Subtarget.is64Bit()) {
415 setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
416 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
417 }
418 }
419
420 if (Subtarget.hasLZCNT()) {
421 // When promoting the i8 variants, force them to i32 for a shorter
422 // encoding.
423 setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
424 setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
425 } else {
426 for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
427 if (VT == MVT::i64 && !Subtarget.is64Bit())
428 continue;
429 setOperationAction(ISD::CTLZ , VT, Custom);
430 setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Custom);
431 }
432 }
433
434 for (auto Op : {ISD::FP16_TO_FP, ISD::STRICT_FP16_TO_FP, ISD::FP_TO_FP16,
435 ISD::STRICT_FP_TO_FP16}) {
436 // Special handling for half-precision floating point conversions.
437 // If we don't have F16C support, then lower half float conversions
438 // into library calls.
439 setOperationAction(
440 Op, MVT::f32,
441 (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand);
442 // There's never any support for operations beyond MVT::f32.
443 setOperationAction(Op, MVT::f64, Expand);
444 setOperationAction(Op, MVT::f80, Expand);
445 setOperationAction(Op, MVT::f128, Expand);
446 }
447
448 for (MVT VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
449 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
450 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
451 setTruncStoreAction(VT, MVT::f16, Expand);
452 setTruncStoreAction(VT, MVT::bf16, Expand);
453
454 setOperationAction(ISD::BF16_TO_FP, VT, Expand);
455 setOperationAction(ISD::FP_TO_BF16, VT, Custom);
456 }
457
458 setOperationAction(ISD::PARITY, MVT::i8, Custom);
459 setOperationAction(ISD::PARITY, MVT::i16, Custom);
460 setOperationAction(ISD::PARITY, MVT::i32, Custom);
461 if (Subtarget.is64Bit())
462 setOperationAction(ISD::PARITY, MVT::i64, Custom);
463 if (Subtarget.hasPOPCNT()) {
464 setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
465 // popcntw is longer to encode than popcntl and also has a false dependency
466 // on the dest that popcntl hasn't had since Cannon Lake.
467 setOperationPromotedToType(ISD::CTPOP, MVT::i16, MVT::i32);
468 } else {
469 setOperationAction(ISD::CTPOP , MVT::i8 , Expand);
470 setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
471 setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
472 if (Subtarget.is64Bit())
473 setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
474 else
475 setOperationAction(ISD::CTPOP , MVT::i64 , Custom);
476 }
477
478 setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
479
480 if (!Subtarget.hasMOVBE())
481 setOperationAction(ISD::BSWAP , MVT::i16 , Expand);
482
483 // X86 wants to expand cmov itself.
484 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
485 setOperationAction(ISD::SELECT, VT, Custom);
486 setOperationAction(ISD::SETCC, VT, Custom);
487 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
488 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
489 }
490 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
491 if (VT == MVT::i64 && !Subtarget.is64Bit())
492 continue;
493 setOperationAction(ISD::SELECT, VT, Custom);
494 setOperationAction(ISD::SETCC, VT, Custom);
495 }
496
497 // Custom action for SELECT MMX and expand action for SELECT_CC MMX
498 setOperationAction(ISD::SELECT, MVT::x86mmx, Custom);
499 setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);
500
501 setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
502 // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
503 // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
504 setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
505 setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
506 setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
507 if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
508 setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
509
510 // Darwin ABI issue.
511 for (auto VT : { MVT::i32, MVT::i64 }) {
512 if (VT == MVT::i64 && !Subtarget.is64Bit())
513 continue;
514 setOperationAction(ISD::ConstantPool , VT, Custom);
515 setOperationAction(ISD::JumpTable , VT, Custom);
516 setOperationAction(ISD::GlobalAddress , VT, Custom);
517 setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
518 setOperationAction(ISD::ExternalSymbol , VT, Custom);
519 setOperationAction(ISD::BlockAddress , VT, Custom);
520 }
521
522 // 64-bit shl, sra, srl (iff 32-bit x86)
523 for (auto VT : { MVT::i32, MVT::i64 }) {
524 if (VT == MVT::i64 && !Subtarget.is64Bit())
525 continue;
526 setOperationAction(ISD::SHL_PARTS, VT, Custom);
527 setOperationAction(ISD::SRA_PARTS, VT, Custom);
528 setOperationAction(ISD::SRL_PARTS, VT, Custom);
529 }
530
531 if (Subtarget.hasSSEPrefetch() || Subtarget.hasThreeDNow())
532 setOperationAction(ISD::PREFETCH , MVT::Other, Legal);
533
534 setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);
535
536 // Expand certain atomics
537 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
538 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
539 setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
540 setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
541 setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
542 setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
543 setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
544 setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
545 }
546
547 if (!Subtarget.is64Bit())
548 setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
549
550 if (Subtarget.canUseCMPXCHG16B())
551 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
552
553 // FIXME - use subtarget debug flags
554 if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
555 !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
556 TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
557 setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
558 }
559
560 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
561 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
562
563 setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
564 setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
565
566 setOperationAction(ISD::TRAP, MVT::Other, Legal);
567 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
568 if (Subtarget.isTargetPS())
569 setOperationAction(ISD::UBSANTRAP, MVT::Other, Expand);
570 else
571 setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal);
572
573 // VASTART needs to be custom lowered to use the VarArgsFrameIndex
574 setOperationAction(ISD::VASTART , MVT::Other, Custom);
575 setOperationAction(ISD::VAEND , MVT::Other, Expand);
576 bool Is64Bit = Subtarget.is64Bit();
577 setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
578 setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
579
580 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
581 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
582
583 setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
584
585 // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
586 setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
587 setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
588
589 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal);
590
591 auto setF16Action = [&] (MVT VT, LegalizeAction Action) {
592 setOperationAction(ISD::FABS, VT, Action);
593 setOperationAction(ISD::FNEG, VT, Action);
594 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
595 setOperationAction(ISD::FREM, VT, Action);
596 setOperationAction(ISD::FMA, VT, Action);
597 setOperationAction(ISD::FMINNUM, VT, Action);
598 setOperationAction(ISD::FMAXNUM, VT, Action);
599 setOperationAction(ISD::FMINIMUM, VT, Action);
600 setOperationAction(ISD::FMAXIMUM, VT, Action);
601 setOperationAction(ISD::FSIN, VT, Action);
602 setOperationAction(ISD::FCOS, VT, Action);
603 setOperationAction(ISD::FSINCOS, VT, Action);
604 setOperationAction(ISD::FSQRT, VT, Action);
605 setOperationAction(ISD::FPOW, VT, Action);
606 setOperationAction(ISD::FLOG, VT, Action);
607 setOperationAction(ISD::FLOG2, VT, Action);
608 setOperationAction(ISD::FLOG10, VT, Action);
609 setOperationAction(ISD::FEXP, VT, Action);
610 setOperationAction(ISD::FEXP2, VT, Action);
611 setOperationAction(ISD::FCEIL, VT, Action);
612 setOperationAction(ISD::FFLOOR, VT, Action);
613 setOperationAction(ISD::FNEARBYINT, VT, Action);
614 setOperationAction(ISD::FRINT, VT, Action);
615 setOperationAction(ISD::BR_CC, VT, Action);
616 setOperationAction(ISD::SETCC, VT, Action);
617 setOperationAction(ISD::SELECT, VT, Custom);
618 setOperationAction(ISD::SELECT_CC, VT, Action);
619 setOperationAction(ISD::FROUND, VT, Action);
620 setOperationAction(ISD::FROUNDEVEN, VT, Action);
621 setOperationAction(ISD::FTRUNC, VT, Action);
622 };
623
624 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
625 // f16, f32 and f64 use SSE.
626 // Set up the FP register classes.
627 addRegisterClass(MVT::f16, Subtarget.hasAVX512() ? &X86::FR16XRegClass
628 : &X86::FR16RegClass);
629 addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
630 : &X86::FR32RegClass);
631 addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
632 : &X86::FR64RegClass);
633
634 // Disable f32->f64 extload as we can only generate this in one instruction
635 // under optsize. So it's easier to pattern match (fpext (load)) for that
636 // case instead of needing to emit 2 instructions for extload in the
637 // non-optsize case.
638 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
639
640 for (auto VT : { MVT::f32, MVT::f64 }) {
641 // Use ANDPD to simulate FABS.
642 setOperationAction(ISD::FABS, VT, Custom);
643
644 // Use XORP to simulate FNEG.
645 setOperationAction(ISD::FNEG, VT, Custom);
646
647 // Use ANDPD and ORPD to simulate FCOPYSIGN.
648 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
649
650 // These might be better off as horizontal vector ops.
651 setOperationAction(ISD::FADD, VT, Custom);
652 setOperationAction(ISD::FSUB, VT, Custom);
653
654 // We don't support sin/cos/fmod
655 setOperationAction(ISD::FSIN , VT, Expand);
656 setOperationAction(ISD::FCOS , VT, Expand);
657 setOperationAction(ISD::FSINCOS, VT, Expand);
658 }
659
660 // Half type will be promoted by default.
661 setF16Action(MVT::f16, Promote);
662 setOperationAction(ISD::FADD, MVT::f16, Promote);
663 setOperationAction(ISD::FSUB, MVT::f16, Promote);
664 setOperationAction(ISD::FMUL, MVT::f16, Promote);
665 setOperationAction(ISD::FDIV, MVT::f16, Promote);
666 setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
667 setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom);
668 setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
669
670 setOperationAction(ISD::STRICT_FADD, MVT::f16, Promote);
671 setOperationAction(ISD::STRICT_FSUB, MVT::f16, Promote);
672 setOperationAction(ISD::STRICT_FMUL, MVT::f16, Promote);
673 setOperationAction(ISD::STRICT_FDIV, MVT::f16, Promote);
674 setOperationAction(ISD::STRICT_FMA, MVT::f16, Promote);
675 setOperationAction(ISD::STRICT_FMINNUM, MVT::f16, Promote);
676 setOperationAction(ISD::STRICT_FMAXNUM, MVT::f16, Promote);
677 setOperationAction(ISD::STRICT_FMINIMUM, MVT::f16, Promote);
678 setOperationAction(ISD::STRICT_FMAXIMUM, MVT::f16, Promote);
679 setOperationAction(ISD::STRICT_FSQRT, MVT::f16, Promote);
680 setOperationAction(ISD::STRICT_FPOW, MVT::f16, Promote);
681 setOperationAction(ISD::STRICT_FLOG, MVT::f16, Promote);
682 setOperationAction(ISD::STRICT_FLOG2, MVT::f16, Promote);
683 setOperationAction(ISD::STRICT_FLOG10, MVT::f16, Promote);
684 setOperationAction(ISD::STRICT_FEXP, MVT::f16, Promote);
685 setOperationAction(ISD::STRICT_FEXP2, MVT::f16, Promote);
686 setOperationAction(ISD::STRICT_FCEIL, MVT::f16, Promote);
687 setOperationAction(ISD::STRICT_FFLOOR, MVT::f16, Promote);
688 setOperationAction(ISD::STRICT_FNEARBYINT, MVT::f16, Promote);
689 setOperationAction(ISD::STRICT_FRINT, MVT::f16, Promote);
690 setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Promote);
691 setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Promote);
692 setOperationAction(ISD::STRICT_FROUND, MVT::f16, Promote);
693 setOperationAction(ISD::STRICT_FROUNDEVEN, MVT::f16, Promote);
694 setOperationAction(ISD::STRICT_FTRUNC, MVT::f16, Promote);
695 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
696 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Custom);
697 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Custom);
698
699 setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2");
700 setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2");
701
702 // Lower this to MOVMSK plus an AND.
703 setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
704 setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
705
706 } else if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1() &&
707 (UseX87 || Is64Bit)) {
708 // Use SSE for f32, x87 for f64.
709 // Set up the FP register classes.
710 addRegisterClass(MVT::f32, &X86::FR32RegClass);
711 if (UseX87)
712 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
713
714 // Use ANDPS to simulate FABS.
715 setOperationAction(ISD::FABS , MVT::f32, Custom);
716
717 // Use XORP to simulate FNEG.
718 setOperationAction(ISD::FNEG , MVT::f32, Custom);
719
720 if (UseX87)
721 setOperationAction(ISD::UNDEF, MVT::f64, Expand);
722
723 // Use ANDPS and ORPS to simulate FCOPYSIGN.
724 if (UseX87)
725 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
726 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
727
728 // We don't support sin/cos/fmod
729 setOperationAction(ISD::FSIN , MVT::f32, Expand);
730 setOperationAction(ISD::FCOS , MVT::f32, Expand);
731 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
732
733 if (UseX87) {
734 // Always expand sin/cos functions even though x87 has an instruction.
735 setOperationAction(ISD::FSIN, MVT::f64, Expand);
736 setOperationAction(ISD::FCOS, MVT::f64, Expand);
737 setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
738 }
739 } else if (UseX87) {
740 // f32 and f64 in x87.
741 // Set up the FP register classes.
742 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
743 addRegisterClass(MVT::f32, &X86::RFP32RegClass);
744
745 for (auto VT : { MVT::f32, MVT::f64 }) {
746 setOperationAction(ISD::UNDEF, VT, Expand);
747 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
748
749 // Always expand sin/cos functions even though x87 has an instruction.
750 setOperationAction(ISD::FSIN , VT, Expand);
751 setOperationAction(ISD::FCOS , VT, Expand);
752 setOperationAction(ISD::FSINCOS, VT, Expand);
753 }
754 }
755
756 // Expand FP32 immediates into loads from the stack, save special cases.
757 if (isTypeLegal(MVT::f32)) {
758 if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
759 addLegalFPImmediate(APFloat(+0.0f)); // FLD0
760 addLegalFPImmediate(APFloat(+1.0f)); // FLD1
761 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
762 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
763 } else // SSE immediates.
764 addLegalFPImmediate(APFloat(+0.0f)); // xorps
765 }
766 // Expand FP64 immediates into loads from the stack, save special cases.
767 if (isTypeLegal(MVT::f64)) {
768 if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {
769 addLegalFPImmediate(APFloat(+0.0)); // FLD0
770 addLegalFPImmediate(APFloat(+1.0)); // FLD1
771 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
772 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
773 } else // SSE immediates.
774 addLegalFPImmediate(APFloat(+0.0)); // xorpd
775 }
776 // Support fp16 0 immediate.
777 if (isTypeLegal(MVT::f16))
778 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEhalf()));
779
780 // Handle constrained floating-point operations of scalar.
781 setOperationAction(ISD::STRICT_FADD, MVT::f32, Legal);
782 setOperationAction(ISD::STRICT_FADD, MVT::f64, Legal);
783 setOperationAction(ISD::STRICT_FSUB, MVT::f32, Legal);
784 setOperationAction(ISD::STRICT_FSUB, MVT::f64, Legal);
785 setOperationAction(ISD::STRICT_FMUL, MVT::f32, Legal);
786 setOperationAction(ISD::STRICT_FMUL, MVT::f64, Legal);
787 setOperationAction(ISD::STRICT_FDIV, MVT::f32, Legal);
788 setOperationAction(ISD::STRICT_FDIV, MVT::f64, Legal);
789 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal);
790 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Legal);
791 setOperationAction(ISD::STRICT_FSQRT, MVT::f32, Legal);
792 setOperationAction(ISD::STRICT_FSQRT, MVT::f64, Legal);
793
794 // We don't support FMA.
795 setOperationAction(ISD::FMA, MVT::f64, Expand);
796 setOperationAction(ISD::FMA, MVT::f32, Expand);
797
798 // f80 always uses X87.
799 if (UseX87) {
800 addRegisterClass(MVT::f80, &X86::RFP80RegClass);
801 setOperationAction(ISD::UNDEF, MVT::f80, Expand);
802 setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
803 {
804 APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
805 addLegalFPImmediate(TmpFlt); // FLD0
806 TmpFlt.changeSign();
807 addLegalFPImmediate(TmpFlt); // FLD0/FCHS
808
809 bool ignored;
810 APFloat TmpFlt2(+1.0);
811 TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
812 &ignored);
813 addLegalFPImmediate(TmpFlt2); // FLD1
814 TmpFlt2.changeSign();
815 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
816 }
817
818 // Always expand sin/cos functions even though x87 has an instruction.
819 setOperationAction(ISD::FSIN , MVT::f80, Expand);
820 setOperationAction(ISD::FCOS , MVT::f80, Expand);
821 setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
822
823 setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
824 setOperationAction(ISD::FCEIL, MVT::f80, Expand);
825 setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
826 setOperationAction(ISD::FRINT, MVT::f80, Expand);
827 setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
828 setOperationAction(ISD::FMA, MVT::f80, Expand);
829 setOperationAction(ISD::LROUND, MVT::f80, Expand);
830 setOperationAction(ISD::LLROUND, MVT::f80, Expand);
831 setOperationAction(ISD::LRINT, MVT::f80, Custom);
832 setOperationAction(ISD::LLRINT, MVT::f80, Custom);
833
834 // Handle constrained floating-point operations of scalar.
835 setOperationAction(ISD::STRICT_FADD , MVT::f80, Legal);
836 setOperationAction(ISD::STRICT_FSUB , MVT::f80, Legal);
837 setOperationAction(ISD::STRICT_FMUL , MVT::f80, Legal);
838 setOperationAction(ISD::STRICT_FDIV , MVT::f80, Legal);
839 setOperationAction(ISD::STRICT_FSQRT , MVT::f80, Legal);
840 if (isTypeLegal(MVT::f16)) {
841 setOperationAction(ISD::FP_EXTEND, MVT::f80, Custom);
842 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Custom);
843 } else {
844 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Legal);
845 }
846 // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
847 // as Custom.
848 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Legal);
849 }
850
851 // f128 uses xmm registers, but most operations require libcalls.
852 if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
853 addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
854 : &X86::VR128RegClass);
855
856 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
857
858 setOperationAction(ISD::FADD, MVT::f128, LibCall);
859 setOperationAction(ISD::STRICT_FADD, MVT::f128, LibCall);
860 setOperationAction(ISD::FSUB, MVT::f128, LibCall);
861 setOperationAction(ISD::STRICT_FSUB, MVT::f128, LibCall);
862 setOperationAction(ISD::FDIV, MVT::f128, LibCall);
863 setOperationAction(ISD::STRICT_FDIV, MVT::f128, LibCall);
864 setOperationAction(ISD::FMUL, MVT::f128, LibCall);
865 setOperationAction(ISD::STRICT_FMUL, MVT::f128, LibCall);
866 setOperationAction(ISD::FMA, MVT::f128, LibCall);
867 setOperationAction(ISD::STRICT_FMA, MVT::f128, LibCall);
868
869 setOperationAction(ISD::FABS, MVT::f128, Custom);
870 setOperationAction(ISD::FNEG, MVT::f128, Custom);
871 setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
872
873 setOperationAction(ISD::FSIN, MVT::f128, LibCall);
874 setOperationAction(ISD::STRICT_FSIN, MVT::f128, LibCall);
875 setOperationAction(ISD::FCOS, MVT::f128, LibCall);
876 setOperationAction(ISD::STRICT_FCOS, MVT::f128, LibCall);
877 setOperationAction(ISD::FSINCOS, MVT::f128, LibCall);
878 // No STRICT_FSINCOS
879 setOperationAction(ISD::FSQRT, MVT::f128, LibCall);
880 setOperationAction(ISD::STRICT_FSQRT, MVT::f128, LibCall);
881
882 setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
883 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f128, Custom);
884 // We need to custom handle any FP_ROUND with an f128 input, but
885 // LegalizeDAG uses the result type to know when to run a custom handler.
886 // So we have to list all legal floating point result types here.
887 if (isTypeLegal(MVT::f32)) {
888 setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
889 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
890 }
891 if (isTypeLegal(MVT::f64)) {
892 setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
893 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom);
894 }
895 if (isTypeLegal(MVT::f80)) {
896 setOperationAction(ISD::FP_ROUND, MVT::f80, Custom);
897 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Custom);
898 }
899
900 setOperationAction(ISD::SETCC, MVT::f128, Custom);
901
902 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
903 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
904 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand);
905 setTruncStoreAction(MVT::f128, MVT::f32, Expand);
906 setTruncStoreAction(MVT::f128, MVT::f64, Expand);
907 setTruncStoreAction(MVT::f128, MVT::f80, Expand);
908 }
909
910 // Always use a library call for pow.
911 setOperationAction(ISD::FPOW , MVT::f32 , Expand);
912 setOperationAction(ISD::FPOW , MVT::f64 , Expand);
913 setOperationAction(ISD::FPOW , MVT::f80 , Expand);
914 setOperationAction(ISD::FPOW , MVT::f128 , Expand);
915
916 setOperationAction(ISD::FLOG, MVT::f80, Expand);
917 setOperationAction(ISD::FLOG2, MVT::f80, Expand);
918 setOperationAction(ISD::FLOG10, MVT::f80, Expand);
919 setOperationAction(ISD::FEXP, MVT::f80, Expand);
920 setOperationAction(ISD::FEXP2, MVT::f80, Expand);
921 setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
922 setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
923
924 // Some FP actions are always expanded for vector types.
925 for (auto VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16,
926 MVT::v4f32, MVT::v8f32, MVT::v16f32,
927 MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
928 setOperationAction(ISD::FSIN, VT, Expand);
929 setOperationAction(ISD::FSINCOS, VT, Expand);
930 setOperationAction(ISD::FCOS, VT, Expand);
931 setOperationAction(ISD::FREM, VT, Expand);
932 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
933 setOperationAction(ISD::FPOW, VT, Expand);
934 setOperationAction(ISD::FLOG, VT, Expand);
935 setOperationAction(ISD::FLOG2, VT, Expand);
936 setOperationAction(ISD::FLOG10, VT, Expand);
937 setOperationAction(ISD::FEXP, VT, Expand);
938 setOperationAction(ISD::FEXP2, VT, Expand);
939 }
940
941 // First set operation action for all vector types to either promote
942 // (for widening) or expand (for scalarization). Then we will selectively
943 // turn on ones that can be effectively codegen'd.
944 for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
945 setOperationAction(ISD::SDIV, VT, Expand);
946 setOperationAction(ISD::UDIV, VT, Expand);
947 setOperationAction(ISD::SREM, VT, Expand);
948 setOperationAction(ISD::UREM, VT, Expand);
949 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
950 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
951 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
952 setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
953 setOperationAction(ISD::FMA, VT, Expand);
954 setOperationAction(ISD::FFLOOR, VT, Expand);
955 setOperationAction(ISD::FCEIL, VT, Expand);
956 setOperationAction(ISD::FTRUNC, VT, Expand);
957 setOperationAction(ISD::FRINT, VT, Expand);
958 setOperationAction(ISD::FNEARBYINT, VT, Expand);
959 setOperationAction(ISD::SMUL_LOHI, VT, Expand);
960 setOperationAction(ISD::MULHS, VT, Expand);
961 setOperationAction(ISD::UMUL_LOHI, VT, Expand);
962 setOperationAction(ISD::MULHU, VT, Expand);
963 setOperationAction(ISD::SDIVREM, VT, Expand);
964 setOperationAction(ISD::UDIVREM, VT, Expand);
965 setOperationAction(ISD::CTPOP, VT, Expand);
966 setOperationAction(ISD::CTTZ, VT, Expand);
967 setOperationAction(ISD::CTLZ, VT, Expand);
968 setOperationAction(ISD::ROTL, VT, Expand);
969 setOperationAction(ISD::ROTR, VT, Expand);
970 setOperationAction(ISD::BSWAP, VT, Expand);
971 setOperationAction(ISD::SETCC, VT, Expand);
972 setOperationAction(ISD::FP_TO_UINT, VT, Expand);
973 setOperationAction(ISD::FP_TO_SINT, VT, Expand);
974 setOperationAction(ISD::UINT_TO_FP, VT, Expand);
975 setOperationAction(ISD::SINT_TO_FP, VT, Expand);
976 setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
977 setOperationAction(ISD::TRUNCATE, VT, Expand);
978 setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
979 setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
980 setOperationAction(ISD::ANY_EXTEND, VT, Expand);
981 setOperationAction(ISD::SELECT_CC, VT, Expand);
982 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
983 setTruncStoreAction(InnerVT, VT, Expand);
984
985 setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
986 setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
987
988 // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
989 // types, we have to deal with them whether we ask for Expansion or not.
990 // Setting Expand causes its own optimisation problems though, so leave
991 // them legal.
992 if (VT.getVectorElementType() == MVT::i1)
993 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
994
995 // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
996 // split/scalarized right now.
997 if (VT.getVectorElementType() == MVT::f16 ||
998 VT.getVectorElementType() == MVT::bf16)
999 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1000 }
1001 }
1002
1003 // FIXME: In order to prevent SSE instructions being expanded to MMX ones
1004 // with -msoft-float, disable use of MMX as well.
1005 if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
1006 addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
1007 // No operations on x86mmx supported, everything uses intrinsics.
1008 }
1009
1010 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
1011 addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1012 : &X86::VR128RegClass);
1013
1014 setOperationAction(ISD::FMAXIMUM, MVT::f32, Custom);
1015 setOperationAction(ISD::FMINIMUM, MVT::f32, Custom);
1016
1017 setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
1018 setOperationAction(ISD::FABS, MVT::v4f32, Custom);
1019 setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom);
1020 setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
1021 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
1022 setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
1023 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
1024 setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
1025
1026 setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
1027 setOperationAction(ISD::STORE, MVT::v2f32, Custom);
1028
1029 setOperationAction(ISD::STRICT_FADD, MVT::v4f32, Legal);
1030 setOperationAction(ISD::STRICT_FSUB, MVT::v4f32, Legal);
1031 setOperationAction(ISD::STRICT_FMUL, MVT::v4f32, Legal);
1032 setOperationAction(ISD::STRICT_FDIV, MVT::v4f32, Legal);
1033 setOperationAction(ISD::STRICT_FSQRT, MVT::v4f32, Legal);
1034 }
1035
1036 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
1037 addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1038 : &X86::VR128RegClass);
1039
1040 // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
1041 // registers cannot be used even for integer operations.
1042 addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
1043 : &X86::VR128RegClass);
1044 addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1045 : &X86::VR128RegClass);
1046 addRegisterClass(MVT::v8f16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1047 : &X86::VR128RegClass);
1048 addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1049 : &X86::VR128RegClass);
1050 addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1051 : &X86::VR128RegClass);
1052
1053 setOperationAction(ISD::FMAXIMUM, MVT::f64, Custom);
1054 setOperationAction(ISD::FMINIMUM, MVT::f64, Custom);
1055
1056 for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
1057 MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
1058 setOperationAction(ISD::SDIV, VT, Custom);
1059 setOperationAction(ISD::SREM, VT, Custom);
1060 setOperationAction(ISD::UDIV, VT, Custom);
1061 setOperationAction(ISD::UREM, VT, Custom);
1062 }
1063
1064 setOperationAction(ISD::MUL, MVT::v2i8, Custom);
1065 setOperationAction(ISD::MUL, MVT::v4i8, Custom);
1066 setOperationAction(ISD::MUL, MVT::v8i8, Custom);
1067
1068 setOperationAction(ISD::MUL, MVT::v16i8, Custom);
1069 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1070 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1071 setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
1072 setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
1073 setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
1074 setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
1075 setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
1076 setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
1077 setOperationAction(ISD::MUL, MVT::v8i16, Legal);
1078 setOperationAction(ISD::AVGCEILU, MVT::v16i8, Legal);
1079 setOperationAction(ISD::AVGCEILU, MVT::v8i16, Legal);
1080
1081 setOperationAction(ISD::SMULO, MVT::v16i8, Custom);
1082 setOperationAction(ISD::UMULO, MVT::v16i8, Custom);
1083 setOperationAction(ISD::UMULO, MVT::v2i32, Custom);
1084
1085 setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
1086 setOperationAction(ISD::FABS, MVT::v2f64, Custom);
1087 setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);
1088
1089 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1090 setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
1091 setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
1092 setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
1093 setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
1094 }
1095
1096 setOperationAction(ISD::ABDU, MVT::v16i8, Custom);
1097 setOperationAction(ISD::ABDU, MVT::v8i16, Custom);
1098 setOperationAction(ISD::ABDS, MVT::v8i16, Custom);
1099
1100 setOperationAction(ISD::UADDSAT, MVT::v16i8, Legal);
1101 setOperationAction(ISD::SADDSAT, MVT::v16i8, Legal);
1102 setOperationAction(ISD::USUBSAT, MVT::v16i8, Legal);
1103 setOperationAction(ISD::SSUBSAT, MVT::v16i8, Legal);
1104 setOperationAction(ISD::UADDSAT, MVT::v8i16, Legal);
1105 setOperationAction(ISD::SADDSAT, MVT::v8i16, Legal);
1106 setOperationAction(ISD::USUBSAT, MVT::v8i16, Legal);
1107 setOperationAction(ISD::SSUBSAT, MVT::v8i16, Legal);
1108 setOperationAction(ISD::USUBSAT, MVT::v4i32, Custom);
1109 setOperationAction(ISD::USUBSAT, MVT::v2i64, Custom);
1110
1111 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
1112 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
1113 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
1114 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
1115
1116 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1117 setOperationAction(ISD::SETCC, VT, Custom);
1118 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
1119 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
1120 setOperationAction(ISD::CTPOP, VT, Custom);
1121 setOperationAction(ISD::ABS, VT, Custom);
1122
1123 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1124 // setcc all the way to isel and prefer SETGT in some isel patterns.
1125 setCondCodeAction(ISD::SETLT, VT, Custom);
1126 setCondCodeAction(ISD::SETLE, VT, Custom);
1127 }
1128
1129 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1130 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1131 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1132 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1133 setOperationAction(ISD::VSELECT, VT, Custom);
1134 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1135 }
1136
1137 for (auto VT : { MVT::v8f16, MVT::v2f64, MVT::v2i64 }) {
1138 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1139 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1140 setOperationAction(ISD::VSELECT, VT, Custom);
1141
1142 if (VT == MVT::v2i64 && !Subtarget.is64Bit())
1143 continue;
1144
1145 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1146 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1147 }
1148 setF16Action(MVT::v8f16, Expand);
1149 setOperationAction(ISD::FADD, MVT::v8f16, Expand);
1150 setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
1151 setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
1152 setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
1153
1154 // Custom lower v2i64 and v2f64 selects.
1155 setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
1156 setOperationAction(ISD::SELECT, MVT::v2i64, Custom);
1157 setOperationAction(ISD::SELECT, MVT::v4i32, Custom);
1158 setOperationAction(ISD::SELECT, MVT::v8i16, Custom);
1159 setOperationAction(ISD::SELECT, MVT::v8f16, Custom);
1160 setOperationAction(ISD::SELECT, MVT::v16i8, Custom);
1161
1162 setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Custom);
1163 setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Custom);
1164 setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
1165 setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
1166 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4i32, Custom);
1167 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i32, Custom);
1168
1169 // Custom legalize these to avoid over promotion or custom promotion.
1170 for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
1171 setOperationAction(ISD::FP_TO_SINT, VT, Custom);
1172 setOperationAction(ISD::FP_TO_UINT, VT, Custom);
1173 setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Custom);
1174 setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom);
1175 }
1176
1177 setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Custom);
1178 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i32, Custom);
1179 setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
1180 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2i32, Custom);
1181
1182 setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
1183 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2i32, Custom);
1184
1185 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
1186 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32, Custom);
1187
1188 // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
1189 setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
1190 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f32, Custom);
1191 setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);
1192 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f32, Custom);
1193
1194 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
1195 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v2f32, Custom);
1196 setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);
1197 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v2f32, Custom);
1198
1199 // We want to legalize this to an f64 load rather than an i64 load on
1200 // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
1201 // store.
1202 setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
1203 setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
1204 setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
1205 setOperationAction(ISD::STORE, MVT::v2i32, Custom);
1206 setOperationAction(ISD::STORE, MVT::v4i16, Custom);
1207 setOperationAction(ISD::STORE, MVT::v8i8, Custom);
1208
1209 // Add 32-bit vector stores to help vectorization opportunities.
1210 setOperationAction(ISD::STORE, MVT::v2i16, Custom);
1211 setOperationAction(ISD::STORE, MVT::v4i8, Custom);
1212
1213 setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
1214 setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
1215 setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
1216 if (!Subtarget.hasAVX512())
1217 setOperationAction(ISD::BITCAST, MVT::v16i1, Custom);
1218
1219 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
1220 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
1221 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);
1222
1223 setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
1224
1225 setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom);
1226 setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom);
1227 setOperationAction(ISD::TRUNCATE, MVT::v2i32, Custom);
1228 setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom);
1229 setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom);
1230 setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom);
1231
1232 // In the customized shift lowering, the legal v4i32/v2i64 cases
1233 // in AVX2 will be recognized.
1234 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1235 setOperationAction(ISD::SRL, VT, Custom);
1236 setOperationAction(ISD::SHL, VT, Custom);
1237 setOperationAction(ISD::SRA, VT, Custom);
1238 if (VT == MVT::v2i64) continue;
1239 setOperationAction(ISD::ROTL, VT, Custom);
1240 setOperationAction(ISD::ROTR, VT, Custom);
1241 setOperationAction(ISD::FSHL, VT, Custom);
1242 setOperationAction(ISD::FSHR, VT, Custom);
1243 }
1244
1245 setOperationAction(ISD::STRICT_FSQRT, MVT::v2f64, Legal);
1246 setOperationAction(ISD::STRICT_FADD, MVT::v2f64, Legal);
1247 setOperationAction(ISD::STRICT_FSUB, MVT::v2f64, Legal);
1248 setOperationAction(ISD::STRICT_FMUL, MVT::v2f64, Legal);
1249 setOperationAction(ISD::STRICT_FDIV, MVT::v2f64, Legal);
1250 }
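
The long runs of setOperationAction calls in this constructor fill a per-(opcode, value type) legalization table that later compilation phases query. As a rough, self-contained sketch of that idea only (the Action enum, ActionTable class and toy opcode/type ids below are illustrative stand-ins, not LLVM's real TargetLowering types):

// Illustrative sketch: models the idea of an operation-action table.
#include <cstdio>
#include <map>
#include <utility>

enum class Action { Legal, Promote, Expand, Custom };

struct ActionTable {
  // Keyed by (opcode, value type); plain ints keep the sketch small.
  std::map<std::pair<int, int>, Action> Table;

  void setOperationAction(int Opc, int VT, Action A) { Table[{Opc, VT}] = A; }

  // Anything not explicitly set is treated as Legal in this toy model.
  Action getOperationAction(int Opc, int VT) const {
    auto It = Table.find({Opc, VT});
    return It == Table.end() ? Action::Legal : It->second;
  }
};

int main() {
  enum { ISD_MUL = 1 };                      // hypothetical opcode id
  enum { MVT_v4i32 = 10, MVT_v2i64 = 11 };   // hypothetical type ids
  ActionTable TLI;
  TLI.setOperationAction(ISD_MUL, MVT_v2i64, Action::Custom);
  std::printf("v4i32 MUL custom? %d\n",
              TLI.getOperationAction(ISD_MUL, MVT_v4i32) == Action::Custom);
  std::printf("v2i64 MUL custom? %d\n",
              TLI.getOperationAction(ISD_MUL, MVT_v2i64) == Action::Custom);
}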
1251
1252 if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
1253 setOperationAction(ISD::ABS, MVT::v16i8, Legal);
1254 setOperationAction(ISD::ABS, MVT::v8i16, Legal);
1255 setOperationAction(ISD::ABS, MVT::v4i32, Legal);
1256 setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
1257 setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
1258 setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
1259 setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
1260 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1261
1262 // These might be better off as horizontal vector ops.
1263 setOperationAction(ISD::ADD, MVT::i16, Custom);
1264 setOperationAction(ISD::ADD, MVT::i32, Custom);
1265 setOperationAction(ISD::SUB, MVT::i16, Custom);
1266 setOperationAction(ISD::SUB, MVT::i32, Custom);
1267 }
1268
1269 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
1270 for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
1271 setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
1272 setOperationAction(ISD::STRICT_FFLOOR, RoundedTy, Legal);
1273 setOperationAction(ISD::FCEIL, RoundedTy, Legal);
1274 setOperationAction(ISD::STRICT_FCEIL, RoundedTy, Legal);
1275 setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
1276 setOperationAction(ISD::STRICT_FTRUNC, RoundedTy, Legal);
1277 setOperationAction(ISD::FRINT, RoundedTy, Legal);
1278 setOperationAction(ISD::STRICT_FRINT, RoundedTy, Legal);
1279 setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
1280 setOperationAction(ISD::STRICT_FNEARBYINT, RoundedTy, Legal);
1281 setOperationAction(ISD::FROUNDEVEN, RoundedTy, Legal);
1282 setOperationAction(ISD::STRICT_FROUNDEVEN, RoundedTy, Legal);
1283
1284 setOperationAction(ISD::FROUND, RoundedTy, Custom);
1285 }
1286
1287 setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
1288 setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
1289 setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
1290 setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
1291 setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
1292 setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
1293 setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
1294 setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
1295
1296 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1297 setOperationAction(ISD::ABDS, VT, Custom);
1298 setOperationAction(ISD::ABDU, VT, Custom);
1299 }
1300
1301 setOperationAction(ISD::UADDSAT, MVT::v4i32, Custom);
1302 setOperationAction(ISD::SADDSAT, MVT::v2i64, Custom);
1303 setOperationAction(ISD::SSUBSAT, MVT::v2i64, Custom);
1304
1305 // FIXME: Do we need to handle scalar-to-vector here?
1306 setOperationAction(ISD::MUL, MVT::v4i32, Legal);
1307 setOperationAction(ISD::SMULO, MVT::v2i32, Custom);
1308
1309 // We directly match byte blends in the backend as they match the VSELECT
1310 // condition form.
1311 setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
1312
1313 // SSE41 brings specific instructions for doing vector sign extend even in
1314 // cases where we don't have SRA.
1315 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1316 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal);
1317 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
1318 }
1319
1320 // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
1321 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1322 setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
1323 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
1324 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
1325 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
1326 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
1327 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
1328 }
1329
1330 if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
1331 // We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we can
1332 // do the pre and post work in the vector domain.
1333 setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Custom);
1334 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i64, Custom);
1335 // We need to mark SINT_TO_FP as Custom even though we want to expand it
1336 // so that DAG combine doesn't try to turn it into uint_to_fp.
1337 setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Custom);
1338 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i64, Custom);
1339 }
1340 }
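
The comment above explains that v4i64 -> v4f32 uint_to_fp has to be scalarized (one cvtsi2ss-style conversion per lane) while the pre/post work stays in the vector domain. A plain-C++ sketch of just the per-lane conversion (uitofp_v4i64_to_v4f32 is an illustrative helper, not the DAG lowering):

// Illustrative sketch: per-lane u64 -> f32 conversion of a 4-element vector.
#include <cstdint>
#include <cstdio>

static void uitofp_v4i64_to_v4f32(const uint64_t In[4], float Out[4]) {
  for (int I = 0; I < 4; ++I)
    Out[I] = static_cast<float>(In[I]); // one scalar conversion per lane
}

int main() {
  const uint64_t In[4] = {0, 1, 1ull << 40, UINT64_MAX};
  float Out[4];
  uitofp_v4i64_to_v4f32(In, Out);
  for (float F : Out)
    std::printf("%g\n", F);
}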
1341
1342 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {
1343 setOperationAction(ISD::UADDSAT, MVT::v2i64, Custom);
1344 }
1345
1346 if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
1347 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1348 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1349 setOperationAction(ISD::ROTL, VT, Custom);
1350 setOperationAction(ISD::ROTR, VT, Custom);
1351 }
1352
1353 // XOP can efficiently perform BITREVERSE with VPPERM.
1354 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
1355 setOperationAction(ISD::BITREVERSE, VT, Custom);
1356
1357 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1358 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1359 setOperationAction(ISD::BITREVERSE, VT, Custom);
1360 }
1361
1362 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
1363 bool HasInt256 = Subtarget.hasInt256();
1364
1365 addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
1366 : &X86::VR256RegClass);
1367 addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1368 : &X86::VR256RegClass);
1369 addRegisterClass(MVT::v16f16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1370 : &X86::VR256RegClass);
1371 addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1372 : &X86::VR256RegClass);
1373 addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1374 : &X86::VR256RegClass);
1375 addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1376 : &X86::VR256RegClass);
1377 addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1378 : &X86::VR256RegClass);
1379
1380 for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
1381 setOperationAction(ISD::FFLOOR, VT, Legal);
1382 setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
1383 setOperationAction(ISD::FCEIL, VT, Legal);
1384 setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
1385 setOperationAction(ISD::FTRUNC, VT, Legal);
1386 setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
1387 setOperationAction(ISD::FRINT, VT, Legal);
1388 setOperationAction(ISD::STRICT_FRINT, VT, Legal);
1389 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1390 setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
1391 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
1392 setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal);
1393
1394 setOperationAction(ISD::FROUND, VT, Custom);
1395
1396 setOperationAction(ISD::FNEG, VT, Custom);
1397 setOperationAction(ISD::FABS, VT, Custom);
1398 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1399 }
1400
1401 // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1402 // even though v8i16 is a legal type.
1403 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1404 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1405 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1406 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1407 setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Custom);
1408 setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Custom);
1409 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i32, Custom);
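
Per the comment above, fp_to_int producing v8i16 is promoted to v8i32: every lane is converted to 32 bits first and the result is then truncated to 16 bits. A plain-C++ sketch of that per-lane behaviour (fptosi_v8f32_to_v8i16 is an illustrative helper, not the promoted DAG node):

// Illustrative sketch: convert each f32 lane to i32, then truncate to i16.
#include <cstdint>
#include <cstdio>

static void fptosi_v8f32_to_v8i16(const float In[8], int16_t Out[8]) {
  for (int I = 0; I < 8; ++I) {
    int32_t Wide = static_cast<int32_t>(In[I]); // promoted conversion to i32
    Out[I] = static_cast<int16_t>(Wide);        // truncate to the i16 result
  }
}

int main() {
  const float In[8] = {0.5f, 1.5f, -2.5f, 3.0f, 100.0f, -100.0f, 7.9f, -7.9f};
  int16_t Out[8];
  fptosi_v8f32_to_v8i16(In, Out);
  std::printf("%d %d %d\n", Out[0], Out[2], Out[6]); // 0 -2 7
}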
1410
1411 setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Custom);
1412 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i32, Custom);
1413 setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Expand);
1414 setOperationAction(ISD::FP_ROUND, MVT::v8f16, Expand);
1415 setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Custom);
1416 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Custom);
1417
1418 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f32, Legal);
1419 setOperationAction(ISD::STRICT_FADD, MVT::v8f32, Legal);
1420 setOperationAction(ISD::STRICT_FADD, MVT::v4f64, Legal);
1421 setOperationAction(ISD::STRICT_FSUB, MVT::v8f32, Legal);
1422 setOperationAction(ISD::STRICT_FSUB, MVT::v4f64, Legal);
1423 setOperationAction(ISD::STRICT_FMUL, MVT::v8f32, Legal);
1424 setOperationAction(ISD::STRICT_FMUL, MVT::v4f64, Legal);
1425 setOperationAction(ISD::STRICT_FDIV, MVT::v8f32, Legal);
1426 setOperationAction(ISD::STRICT_FDIV, MVT::v4f64, Legal);
1427 setOperationAction(ISD::STRICT_FSQRT, MVT::v8f32, Legal);
1428 setOperationAction(ISD::STRICT_FSQRT, MVT::v4f64, Legal);
1429
1430 if (!Subtarget.hasAVX512())
1431 setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);
1432
1433 // In the customized shift lowering, the legal v8i32/v4i64 cases
1434 // in AVX2 will be recognized.
1435 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1436 setOperationAction(ISD::SRL, VT, Custom);
1437 setOperationAction(ISD::SHL, VT, Custom);
1438 setOperationAction(ISD::SRA, VT, Custom);
1439 setOperationAction(ISD::ABDS, VT, Custom);
1440 setOperationAction(ISD::ABDU, VT, Custom);
1441 if (VT == MVT::v4i64) continue;
1442 setOperationAction(ISD::ROTL, VT, Custom);
1443 setOperationAction(ISD::ROTR, VT, Custom);
1444 setOperationAction(ISD::FSHL, VT, Custom);
1445 setOperationAction(ISD::FSHR, VT, Custom);
1446 }
1447
1448 // These types need custom splitting if their input is a 128-bit vector.
1449 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
1450 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
1451 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
1452 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
1453
1454 setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
1455 setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
1456 setOperationAction(ISD::SELECT, MVT::v8i32, Custom);
1457 setOperationAction(ISD::SELECT, MVT::v16i16, Custom);
1458 setOperationAction(ISD::SELECT, MVT::v16f16, Custom);
1459 setOperationAction(ISD::SELECT, MVT::v32i8, Custom);
1460 setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
1461
1462 for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1463 setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
1464 setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
1465 setOperationAction(ISD::ANY_EXTEND, VT, Custom);
1466 }
1467
1468 setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
1469 setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
1470 setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
1471 setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);
1472
1473 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1474 setOperationAction(ISD::SETCC, VT, Custom);
1475 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
1476 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
1477 setOperationAction(ISD::CTPOP, VT, Custom);
1478 setOperationAction(ISD::CTLZ, VT, Custom);
1479
1480 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1481 // setcc all the way to isel and prefer SETGT in some isel patterns.
1482 setCondCodeAction(ISD::SETLT, VT, Custom);
1483 setCondCodeAction(ISD::SETLE, VT, Custom);
1484 }
1485
1486 if (Subtarget.hasAnyFMA()) {
1487 for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1488 MVT::v2f64, MVT::v4f64 }) {
1489 setOperationAction(ISD::FMA, VT, Legal);
1490 setOperationAction(ISD::STRICT_FMA, VT, Legal);
1491 }
1492 }
1493
1494 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1495 setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1496 setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1497 }
1498
1499 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1500 setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
1501 setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
1502 setOperationAction(ISD::MUL, MVT::v32i8, Custom);
1503
1504 setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
1505 setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
1506 setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
1507 setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
1508 setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
1509 setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
1510 setOperationAction(ISD::AVGCEILU, MVT::v16i16, HasInt256 ? Legal : Custom);
1511 setOperationAction(ISD::AVGCEILU, MVT::v32i8, HasInt256 ? Legal : Custom);
1512
1513 setOperationAction(ISD::SMULO, MVT::v32i8, Custom);
1514 setOperationAction(ISD::UMULO, MVT::v32i8, Custom);
1515
1516 setOperationAction(ISD::ABS, MVT::v4i64, Custom);
1517 setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
1518 setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
1519 setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
1520 setOperationAction(ISD::UMIN, MVT::v4i64, Custom);
1521
1522 setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1523 setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1524 setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1525 setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1526 setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1527 setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1528 setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1529 setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1530 setOperationAction(ISD::UADDSAT, MVT::v8i32, Custom);
1531 setOperationAction(ISD::USUBSAT, MVT::v8i32, Custom);
1532 setOperationAction(ISD::UADDSAT, MVT::v4i64, Custom);
1533 setOperationAction(ISD::USUBSAT, MVT::v4i64, Custom);
1534
1535 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1536 setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
1537 setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1538 setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1539 setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1540 setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1541 }
1542
1543 for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
1544 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
1545 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
1546 }
1547
1548 if (HasInt256) {
1549 // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1550 // when we have a 256-bit-wide blend with immediate.
1551 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
1552 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32, Custom);
1553
1554 // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1555 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1556 setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
1557 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
1558 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
1559 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
1560 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
1561 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
1562 }
1563 }
1564
1565 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1566 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1567 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1568 setOperationAction(ISD::MSTORE, VT, Legal);
1569 }
1570
1571 // Extract subvector is special because the value type
1572 // (result) is 128-bit but the source is 256-bit wide.
1573 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1574 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1575 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1576 }
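
Per the comment above, EXTRACT_SUBVECTOR here produces a 128-bit result from a 256-bit source, i.e. it just selects one half of the lanes. A plain-array sketch of that selection (extract128 is an illustrative helper, not the node's lowering):

// Illustrative sketch: pull a 128-bit half (4 floats) out of a 256-bit source (8 floats).
#include <cstdio>

static void extract128(const float Src256[8], int HalfIdx, float Dst128[4]) {
  for (int I = 0; I < 4; ++I)
    Dst128[I] = Src256[HalfIdx * 4 + I]; // HalfIdx 0 = low half, 1 = high half
}

int main() {
  const float Src[8] = {0, 1, 2, 3, 4, 5, 6, 7};
  float Hi[4];
  extract128(Src, /*HalfIdx=*/1, Hi);
  std::printf("%g %g %g %g\n", Hi[0], Hi[1], Hi[2], Hi[3]); // 4 5 6 7
}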
1577
1578 // Custom lower several nodes for 256-bit types.
1579 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1580 MVT::v16f16, MVT::v8f32, MVT::v4f64 }) {
1581 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1582 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1583 setOperationAction(ISD::VSELECT, VT, Custom);
1584 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1585 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1586 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1587 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
1588 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1589 setOperationAction(ISD::STORE, VT, Custom);
1590 }
1591 setF16Action(MVT::v16f16, Expand);
1592 setOperationAction(ISD::FADD, MVT::v16f16, Expand);
1593 setOperationAction(ISD::FSUB, MVT::v16f16, Expand);
1594 setOperationAction(ISD::FMUL, MVT::v16f16, Expand);
1595 setOperationAction(ISD::FDIV, MVT::v16f16, Expand);
1596
1597 if (HasInt256) {
1598 setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
1599
1600 // Custom legalize 2x32 to get a little better code.
1601 setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
1602 setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);
1603
1604 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1605 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1606 setOperationAction(ISD::MGATHER, VT, Custom);
1607 }
1608 }
1609
1610 if (!Subtarget.useSoftFloat() && !Subtarget.hasFP16() &&
1611 Subtarget.hasF16C()) {
1612 for (MVT VT : { MVT::f16, MVT::v2f16, MVT::v4f16, MVT::v8f16 }) {
1613 setOperationAction(ISD::FP_ROUND, VT, Custom);
1614 setOperationAction(ISD::STRICT_FP_ROUND, VT, Custom);
1615 }
1616 for (MVT VT : { MVT::f32, MVT::v2f32, MVT::v4f32 }) {
1617 setOperationAction(ISD::FP_EXTEND, VT, Custom);
1618 setOperationAction(ISD::STRICT_FP_EXTEND, VT, Custom);
1619 }
1620 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
1621 setOperationPromotedToType(Opc, MVT::v8f16, MVT::v8f32);
1622 setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
1623 }
1624
1625 setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal);
1626 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f32, Legal);
1627 }
1628
1629 // This block controls legalization of the mask vector sizes that are
1630 // available with AVX512. 512-bit vectors are in a separate block controlled
1631 // by useAVX512Regs.
1632 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1633 addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
1634 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1635 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1636 addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1637 addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1638
1639 setOperationAction(ISD::SELECT, MVT::v1i1, Custom);
1640 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
1641 setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);
1642
1643 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1644 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1645 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1646 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1647 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1648 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1649 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1650 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1651 setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom);
1652 setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom);
1653 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i1, Custom);
1654 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i1, Custom);
1655
1656 // There is no byte sized k-register load or store without AVX512DQ.
1657 if (!Subtarget.hasDQI()) {
1658 setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
1659 setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
1660 setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
1661 setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
1662
1663 setOperationAction(ISD::STORE, MVT::v1i1, Custom);
1664 setOperationAction(ISD::STORE, MVT::v2i1, Custom);
1665 setOperationAction(ISD::STORE, MVT::v4i1, Custom);
1666 setOperationAction(ISD::STORE, MVT::v8i1, Custom);
1667 }
1668
1669 // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
1670 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1671 setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
1672 setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
1673 setOperationAction(ISD::ANY_EXTEND, VT, Custom);
1674 }
1675
1676 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 })
1677 setOperationAction(ISD::VSELECT, VT, Expand);
1678
1679 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
1680 setOperationAction(ISD::SETCC, VT, Custom);
1681 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
1682 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
1683 setOperationAction(ISD::SELECT, VT, Custom);
1684 setOperationAction(ISD::TRUNCATE, VT, Custom);
1685
1686 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1687 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1688 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1689 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1690 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1691 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1692 }
1693
1694 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
1695 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1696 }
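
The mask vector types registered in the block above (v1i1 through v16i1) behave like one predicate bit per vector lane. A minimal sketch of that model, with a 16-bit integer standing in for a v16i1 mask that drives a per-lane select (maskedSelect is illustrative, not the AVX-512 k-register ISA):

// Illustrative sketch: a 16-bit bitmask models a v16i1 mask driving a select.
#include <cstdint>
#include <cstdio>

static void maskedSelect(uint16_t Mask, const int A[16], const int B[16],
                         int Out[16]) {
  for (int I = 0; I < 16; ++I)
    Out[I] = ((Mask >> I) & 1) ? A[I] : B[I]; // bit I of the mask picks lane I
}

int main() {
  int A[16], B[16], Out[16];
  for (int I = 0; I < 16; ++I) { A[I] = I; B[I] = -I; }
  maskedSelect(/*Mask=*/0x00FF, A, B, Out); // low 8 lanes from A, rest from B
  std::printf("%d %d\n", Out[0], Out[15]);  // prints 0 and -15
}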
1697
1698 // This block controls legalization for 512-bit operations with 8/16/32/64 bit
1699 // elements. 512-bits can be disabled based on prefer-vector-width and
1700 // required-vector-width function attributes.
1701 if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
1702 bool HasBWI = Subtarget.hasBWI();
1703
1704 addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1705 addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1706 addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1707 addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1708 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1709 addRegisterClass(MVT::v32f16, &X86::VR512RegClass);
1710 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1711
1712 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1713 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
1714 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1715 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
1716 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
1717 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
1718 if (HasBWI)
1719 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1720 }
1721
1722 for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1723 setOperationAction(ISD::FNEG, VT, Custom);
1724 setOperationAction(ISD::FABS, VT, Custom);
1725 setOperationAction(ISD::FMA, VT, Legal);
1726 setOperationAction(ISD::STRICT_FMA, VT, Legal);
1727 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1728 }
1729
1730 for (MVT VT : { MVT::v16i1, MVT::v16i8 }) {
1731 setOperationPromotedToType(ISD::FP_TO_SINT , VT, MVT::v16i32);
1732 setOperationPromotedToType(ISD::FP_TO_UINT , VT, MVT::v16i32);
1733 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, VT, MVT::v16i32);
1734 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, VT, MVT::v16i32);
1735 }
1736
1737 for (MVT VT : { MVT::v16i16, MVT::v16i32 }) {
1738 setOperationAction(ISD::FP_TO_SINT, VT, Custom);
1739 setOperationAction(ISD::FP_TO_UINT, VT, Custom);
1740 setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Custom);
1741 setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom);
1742 }
1743
1744 setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Custom);
1745 setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Custom);
1746 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i32, Custom);
1747 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i32, Custom);
1748 setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Custom);
1749 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Custom);
1750
1751 setOperationAction(ISD::STRICT_FADD, MVT::v16f32, Legal);
1752 setOperationAction(ISD::STRICT_FADD, MVT::v8f64, Legal);
1753 setOperationAction(ISD::STRICT_FSUB, MVT::v16f32, Legal);
1754 setOperationAction(ISD::STRICT_FSUB, MVT::v8f64, Legal);
1755 setOperationAction(ISD::STRICT_FMUL, MVT::v16f32, Legal);
1756 setOperationAction(ISD::STRICT_FMUL, MVT::v8f64, Legal);
1757 setOperationAction(ISD::STRICT_FDIV, MVT::v16f32, Legal);
1758 setOperationAction(ISD::STRICT_FDIV, MVT::v8f64, Legal);
1759 setOperationAction(ISD::STRICT_FSQRT, MVT::v16f32, Legal);
1760 setOperationAction(ISD::STRICT_FSQRT, MVT::v8f64, Legal);
1761 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f32, Legal);
1762
1763 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
1764 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
1765 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
1766 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
1767 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
1768 if (HasBWI)
1769 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1770
1771 // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
1772 // to 512-bit rather than use the AVX2 instructions so that we can use
1773 // k-masks.
1774 if (!Subtarget.hasVLX()) {
1775 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1776 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1777 setOperationAction(ISD::MLOAD, VT, Custom);
1778 setOperationAction(ISD::MSTORE, VT, Custom);
1779 }
1780 }
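
The comment above says that without VLX the 128/256-bit MLOAD/MSTORE are widened to 512 bits so the k-mask forms can be used; the lanes added by widening are simply left inactive in the mask. A plain-C++ sketch of that idea (maskedLoad16 is an illustrative model, not the real lowering):

// Illustrative sketch: widen an 8-lane masked load to 16 lanes by forcing the
// extra mask bits off, so the wide operation touches the same memory.
#include <cstdint>
#include <cstdio>

static void maskedLoad16(const int *Base, uint16_t Mask, int Out[16]) {
  for (int I = 0; I < 16; ++I)
    Out[I] = ((Mask >> I) & 1) ? Base[I] : 0; // inactive lanes are never read
}

int main() {
  int Data[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  int Out[16];
  uint8_t Mask8 = 0x0F;                // original 8-lane mask
  uint16_t Mask16 = Mask8 & 0x00FFu;   // widened mask: lanes 8..15 inactive
  maskedLoad16(Data, Mask16, Out);
  std::printf("%d %d %d\n", Out[0], Out[4], Out[8]); // 1 0 0
}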
1781
1782 setOperationAction(ISD::TRUNCATE, MVT::v8i32, Legal);
1783 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal);
1784 setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom);
1785 setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
1786 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
1787 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
1788 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
1789 setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
1790 setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
1791 setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
1792 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
1793 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
1794 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
1795
1796 if (HasBWI) {
1797 // Extends from v64i1 masks to 512-bit vectors.
1798 setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
1799 setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
1800 setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom);
1801 }
1802
1803 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1804 setOperationAction(ISD::FFLOOR, VT, Legal);
1805 setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
1806 setOperationAction(ISD::FCEIL, VT, Legal);
1807 setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
1808 setOperationAction(ISD::FTRUNC, VT, Legal);
1809 setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
1810 setOperationAction(ISD::FRINT, VT, Legal);
1811 setOperationAction(ISD::STRICT_FRINT, VT, Legal);
1812 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1813 setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
1814 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
1815 setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal);
1816
1817 setOperationAction(ISD::FROUND, VT, Custom);
1818 }
1819
1820 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
1821 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
1822 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
1823 }
1824
1825 setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom);
1826 setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom);
1827 setOperationAction(ISD::ADD, MVT::v64i8, HasBWI ? Legal : Custom);
1828 setOperationAction(ISD::SUB, MVT::v64i8, HasBWI ? Legal : Custom);
1829
1830 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1831 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1832 setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom);
1833 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1834
1835 setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
1836 setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
1837 setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom);
1838 setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom);
1839 setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
1840 setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
1841 setOperationAction(ISD::AVGCEILU, MVT::v32i16, HasBWI ? Legal : Custom);
1842 setOperationAction(ISD::AVGCEILU, MVT::v64i8, HasBWI ? Legal : Custom);
1843
1844 setOperationAction(ISD::SMULO, MVT::v64i8, Custom);
1845 setOperationAction(ISD::UMULO, MVT::v64i8, Custom);
1846
1847 setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);
1848
1849 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1850 setOperationAction(ISD::SRL, VT, Custom);
1851 setOperationAction(ISD::SHL, VT, Custom);
1852 setOperationAction(ISD::SRA, VT, Custom);
1853 setOperationAction(ISD::ROTL, VT, Custom);
1854 setOperationAction(ISD::ROTR, VT, Custom);
1855 setOperationAction(ISD::SETCC, VT, Custom);
1856 setOperationAction(ISD::ABDS, VT, Custom);
1857 setOperationAction(ISD::ABDU, VT, Custom);
1858
1859 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1860 // setcc all the way to isel and prefer SETGT in some isel patterns.
1861 setCondCodeAction(ISD::SETLT, VT, Custom);
1862 setCondCodeAction(ISD::SETLE, VT, Custom);
1863 }
1864 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1865 setOperationAction(ISD::SMAX, VT, Legal);
1866 setOperationAction(ISD::UMAX, VT, Legal);
1867 setOperationAction(ISD::SMIN, VT, Legal);
1868 setOperationAction(ISD::UMIN, VT, Legal);
1869 setOperationAction(ISD::ABS, VT, Legal);
1870 setOperationAction(ISD::CTPOP, VT, Custom);
1871 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
1872 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
1873 }
1874
1875 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1876 setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom);
1877 setOperationAction(ISD::CTPOP, VT, Subtarget.hasBITALG() ? Legal : Custom);
1878 setOperationAction(ISD::CTLZ, VT, Custom);
1879 setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom);
1880 setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom);
1881 setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom);
1882 setOperationAction(ISD::UMIN, VT, HasBWI ? Legal : Custom);
1883 setOperationAction(ISD::UADDSAT, VT, HasBWI ? Legal : Custom);
1884 setOperationAction(ISD::SADDSAT, VT, HasBWI ? Legal : Custom);
1885 setOperationAction(ISD::USUBSAT, VT, HasBWI ? Legal : Custom);
1886 setOperationAction(ISD::SSUBSAT, VT, HasBWI ? Legal : Custom);
1887 }
1888
1889 setOperationAction(ISD::FSHL, MVT::v64i8, Custom);
1890 setOperationAction(ISD::FSHR, MVT::v64i8, Custom);
1891 setOperationAction(ISD::FSHL, MVT::v32i16, Custom);
1892 setOperationAction(ISD::FSHR, MVT::v32i16, Custom);
1893 setOperationAction(ISD::FSHL, MVT::v16i32, Custom);
1894 setOperationAction(ISD::FSHR, MVT::v16i32, Custom);
1895
1896 if (Subtarget.hasDQI()) {
1897 for (auto Opc : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP,
1898 ISD::STRICT_UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT,
1899 ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT})
1900 setOperationAction(Opc, MVT::v8i64, Custom);
1901 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
1902 }
1903
1904 if (Subtarget.hasCDI()) {
1905 // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
1906 for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
1907 setOperationAction(ISD::CTLZ, VT, Legal);
1908 }
1909 } // Subtarget.hasCDI()
1910
1911 if (Subtarget.hasVPOPCNTDQ()) {
1912 for (auto VT : { MVT::v16i32, MVT::v8i64 })
1913 setOperationAction(ISD::CTPOP, VT, Legal);
1914 }
1915
1916 // Extract subvector is special because the value type
1917 // (result) is 256-bit but the source is 512-bit wide.
1918 // 128-bit was made Legal under AVX1.
1919 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1920 MVT::v16f16, MVT::v8f32, MVT::v4f64 })
1921 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1922
1923 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
1924 MVT::v32f16, MVT::v16f32, MVT::v8f64 }) {
1925 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1926 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
1927 setOperationAction(ISD::SELECT, VT, Custom);
1928 setOperationAction(ISD::VSELECT, VT, Custom);
1929 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1930 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1931 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1932 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1933 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1934 }
1935 setF16Action(MVT::v32f16, Expand);
1936 setOperationAction(ISD::FP_ROUND, MVT::v16f16, Custom);
1937 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f16, Custom);
1938 setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Legal);
1939 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v16f32, Legal);
1940 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
1941 setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
1942 setOperationPromotedToType(Opc, MVT::v32f16, MVT::v32f32);
1943 }
1944
1945 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
1946 setOperationAction(ISD::MLOAD, VT, Legal);
1947 setOperationAction(ISD::MSTORE, VT, Legal);
1948 setOperationAction(ISD::MGATHER, VT, Custom);
1949 setOperationAction(ISD::MSCATTER, VT, Custom);
1950 }
1951 if (HasBWI) {
1952 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1953 setOperationAction(ISD::MLOAD, VT, Legal);
1954 setOperationAction(ISD::MSTORE, VT, Legal);
1955 }
1956 } else {
1957 setOperationAction(ISD::STORE, MVT::v32i16, Custom);
1958 setOperationAction(ISD::STORE, MVT::v64i8, Custom);
1959 }
1960
1961 if (Subtarget.hasVBMI2()) {
1962 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64,
1963 MVT::v16i16, MVT::v8i32, MVT::v4i64,
1964 MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1965 setOperationAction(ISD::FSHL, VT, Custom);
1966 setOperationAction(ISD::FSHR, VT, Custom);
1967 }
1968
1969 setOperationAction(ISD::ROTL, MVT::v32i16, Custom);
1970 setOperationAction(ISD::ROTR, MVT::v8i16, Custom);
1971 setOperationAction(ISD::ROTR, MVT::v16i16, Custom);
1972 setOperationAction(ISD::ROTR, MVT::v32i16, Custom);
1973 }
1974 }// useAVX512Regs
1975
1976 // This block controls legalization for operations that don't have
1977 // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
1978 // narrower widths.
1979 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1980 // These operations are handled on non-VLX by artificially widening in
1981 // isel patterns.
1982
1983 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i32, Custom);
1984 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32, Custom);
1985 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i32, Custom);
1986
1987 if (Subtarget.hasDQI()) {
1988 // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
1989 // v2f32 UINT_TO_FP is already custom under SSE2.
1990 assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) &&
1991        isOperationCustom(ISD::STRICT_UINT_TO_FP, MVT::v2f32) &&
1992        "Unexpected operation action!");
1993 // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
1994 setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
1995 setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
1996 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f32, Custom);
1997 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f32, Custom);
1998 }
1999
2000 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
2001 setOperationAction(ISD::SMAX, VT, Legal);
2002 setOperationAction(ISD::UMAX, VT, Legal);
2003 setOperationAction(ISD::SMIN, VT, Legal);
2004 setOperationAction(ISD::UMIN, VT, Legal);
2005 setOperationAction(ISD::ABS, VT, Legal);
2006 }
2007
2008 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2009 setOperationAction(ISD::ROTL, VT, Custom);
2010 setOperationAction(ISD::ROTR, VT, Custom);
2011 }
2012
2013 // Custom legalize 2x32 to get a little better code.
2014 setOperationAction(ISD::MSCATTER, MVT::v2f32, Custom);
2015 setOperationAction(ISD::MSCATTER, MVT::v2i32, Custom);
2016
2017 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
2018 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
2019 setOperationAction(ISD::MSCATTER, VT, Custom);
2020
2021 if (Subtarget.hasDQI()) {
2022 for (auto Opc : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP,
2023 ISD::STRICT_UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT,
2024 ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT}) {
2025 setOperationAction(Opc, MVT::v2i64, Custom);
2026 setOperationAction(Opc, MVT::v4i64, Custom);
2027 }
2028 setOperationAction(ISD::MUL, MVT::v2i64, Legal);
2029 setOperationAction(ISD::MUL, MVT::v4i64, Legal);
2030 }
2031
2032 if (Subtarget.hasCDI()) {
2033 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2034 setOperationAction(ISD::CTLZ, VT, Legal);
2035 }
2036 } // Subtarget.hasCDI()
2037
2038 if (Subtarget.hasVPOPCNTDQ()) {
2039 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
2040 setOperationAction(ISD::CTPOP, VT, Legal);
2041 }
2042 }
2043
2044 // This block controls legalization of v32i1/v64i1, which are available with
2045 // AVX512BW.
2046 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
2047 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
2048 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
2049
2050 for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
2051 setOperationAction(ISD::VSELECT, VT, Expand);
2052 setOperationAction(ISD::TRUNCATE, VT, Custom);
2053 setOperationAction(ISD::SETCC, VT, Custom);
2054 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
2055 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
2056 setOperationAction(ISD::SELECT, VT, Custom);
2057 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
2058 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
2059 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
2060 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
2061 }
2062
2063 for (auto VT : { MVT::v16i1, MVT::v32i1 })
2064 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
2065
2066 // Extends from v32i1 masks to 256-bit vectors.
2067 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
2068 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
2069 setOperationAction(ISD::ANY_EXTEND, MVT::v32i8, Custom);
2070
2071 for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
2072 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
2073 setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
2074 }
2075
2076 // These operations are handled on non-VLX by artificially widening in
2077 // isel patterns.
2078 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
2079
2080 if (Subtarget.hasBITALG()) {
2081 for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
2082 setOperationAction(ISD::CTPOP, VT, Legal);
2083 }
2084 }
2085
2086 if (!Subtarget.useSoftFloat() && Subtarget.hasFP16()) {
2087 auto setGroup = [&] (MVT VT) {
2088 setOperationAction(ISD::FADD, VT, Legal);
2089 setOperationAction(ISD::STRICT_FADD, VT, Legal);
2090 setOperationAction(ISD::FSUB, VT, Legal);
2091 setOperationAction(ISD::STRICT_FSUB, VT, Legal);
2092 setOperationAction(ISD::FMUL, VT, Legal);
2093 setOperationAction(ISD::STRICT_FMUL, VT, Legal);
2094 setOperationAction(ISD::FDIV, VT, Legal);
2095 setOperationAction(ISD::STRICT_FDIV, VT, Legal);
2096 setOperationAction(ISD::FSQRT, VT, Legal);
2097 setOperationAction(ISD::STRICT_FSQRT, VT, Legal);
2098
2099 setOperationAction(ISD::FFLOOR, VT, Legal);
2100 setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
2101 setOperationAction(ISD::FCEIL, VT, Legal);
2102 setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
2103 setOperationAction(ISD::FTRUNC, VT, Legal);
2104 setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
2105 setOperationAction(ISD::FRINT, VT, Legal);
2106 setOperationAction(ISD::STRICT_FRINT, VT, Legal);
2107 setOperationAction(ISD::FNEARBYINT, VT, Legal);
2108 setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
2109
2110 setOperationAction(ISD::FROUND, VT, Custom);
2111
2112 setOperationAction(ISD::LOAD, VT, Legal);
2113 setOperationAction(ISD::STORE, VT, Legal);
2114
2115 setOperationAction(ISD::FMA, VT, Legal);
2116 setOperationAction(ISD::STRICT_FMA, VT, Legal);
2117 setOperationAction(ISD::VSELECT, VT, Legal);
2118 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
2119 setOperationAction(ISD::SELECT, VT, Custom);
2120
2121 setOperationAction(ISD::FNEG, VT, Custom);
2122 setOperationAction(ISD::FABS, VT, Custom);
2123 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
2124 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
2125 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
2126 };
2127
2128 // AVX512_FP16 scalar operations
2129 setGroup(MVT::f16);
2130 setOperationAction(ISD::FREM, MVT::f16, Promote);
2131 setOperationAction(ISD::STRICT_FREM, MVT::f16, Promote);
2132 setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);
2133 setOperationAction(ISD::BR_CC, MVT::f16, Expand);
2134 setOperationAction(ISD::SETCC, MVT::f16, Custom);
2135 setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Custom);
2136 setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Custom);
2137 setOperationAction(ISD::STRICT_FROUND, MVT::f16, Promote);
2138 setOperationAction(ISD::FROUNDEVEN, MVT::f16, Legal);
2139 setOperationAction(ISD::STRICT_FROUNDEVEN, MVT::f16, Legal);
2140 setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
2141 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
2142 setOperationAction(ISD::FMAXIMUM, MVT::f16, Custom);
2143 setOperationAction(ISD::FMINIMUM, MVT::f16, Custom);
2144 setOperationAction(ISD::FP_EXTEND, MVT::f32, Legal);
2145 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Legal);
2146
2147 setCondCodeAction(ISD::SETOEQ, MVT::f16, Expand);
2148 setCondCodeAction(ISD::SETUNE, MVT::f16, Expand);
2149
2150 if (Subtarget.useAVX512Regs()) {
2151 setGroup(MVT::v32f16);
2152 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32f16, Custom);
2153 setOperationAction(ISD::SINT_TO_FP, MVT::v32i16, Legal);
2154 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v32i16, Legal);
2155 setOperationAction(ISD::UINT_TO_FP, MVT::v32i16, Legal);
2156 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v32i16, Legal);
2157 setOperationAction(ISD::FP_ROUND, MVT::v16f16, Legal);
2158 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f16, Legal);
2159 setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Legal);
2160 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v16f32, Legal);
2161 setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Legal);
2162 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Legal);
2163 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32f16, Custom);
2164
2165 setOperationAction(ISD::FP_TO_SINT, MVT::v32i16, Custom);
2166 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v32i16, Custom);
2167 setOperationAction(ISD::FP_TO_UINT, MVT::v32i16, Custom);
2168 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v32i16, Custom);
2169 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i8, MVT::v32i16);
2170 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v32i8,
2171 MVT::v32i16);
2172 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i8, MVT::v32i16);
2173 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v32i8,
2174 MVT::v32i16);
2175 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i1, MVT::v32i16);
2176 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v32i1,
2177 MVT::v32i16);
2178 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i1, MVT::v32i16);
2179 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v32i1,
2180 MVT::v32i16);
2181
2182 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f16, Legal);
2183 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32f16, Legal);
2184 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32f16, Custom);
2185
2186 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Legal);
2187 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Legal);
2188
2189 setOperationAction(ISD::STRICT_FSETCC, MVT::v32i1, Custom);
2190 setOperationAction(ISD::STRICT_FSETCCS, MVT::v32i1, Custom);
2191 }
2192
2193 if (Subtarget.hasVLX()) {
2194 setGroup(MVT::v8f16);
2195 setGroup(MVT::v16f16);
2196
2197 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8f16, Legal);
2198 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16f16, Custom);
2199 setOperationAction(ISD::SINT_TO_FP, MVT::v16i16, Legal);
2200 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i16, Legal);
2201 setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Legal);
2202 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i16, Legal);
2203 setOperationAction(ISD::UINT_TO_FP, MVT::v16i16, Legal);
2204 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i16, Legal);
2205 setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Legal);
2206 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i16, Legal);
2207
2208 setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom);
2209 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i16, Custom);
2210 setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Custom);
2211 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i16, Custom);
2212 setOperationAction(ISD::FP_ROUND, MVT::v8f16, Legal);
2213 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f16, Legal);
2214 setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal);
2215 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f32, Legal);
2216 setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Legal);
2217 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Legal);
2218
2219 // INSERT_VECTOR_ELT v8f16 extended to VECTOR_SHUFFLE
2220 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8f16, Custom);
2221 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16f16, Custom);
2222
2223 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f16, Legal);
2224 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16f16, Legal);
2225 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f16, Custom);
2226
2227 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Legal);
2228 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Legal);
2229 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Legal);
2230 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Legal);
2231
2232 // Need to custom widen these to prevent scalarization.
2233 setOperationAction(ISD::LOAD, MVT::v4f16, Custom);
2234 setOperationAction(ISD::STORE, MVT::v4f16, Custom);
2235 }
2236 }
2237
2238 if (!Subtarget.useSoftFloat() &&
2239 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16())) {
2240 addRegisterClass(MVT::v8bf16, &X86::VR128XRegClass);
2241 addRegisterClass(MVT::v16bf16, &X86::VR256XRegClass);
2242 // We set the type action of bf16 to TypeSoftPromoteHalf, but we don't
2243 // provide the method to promote BUILD_VECTOR, so set the operation action
2244 // to Custom and do the customization later.
2245 setOperationAction(ISD::BUILD_VECTOR, MVT::bf16, Custom);
2246 for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2247 setF16Action(VT, Expand);
2248 setOperationAction(ISD::FADD, VT, Expand);
2249 setOperationAction(ISD::FSUB, VT, Expand);
2250 setOperationAction(ISD::FMUL, VT, Expand);
2251 setOperationAction(ISD::FDIV, VT, Expand);
2252 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
2253 }
2254 addLegalFPImmediate(APFloat::getZero(APFloat::BFloat()));
2255 }
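
The comment above notes that bf16 is soft-promoted: the arithmetic itself runs in f32, with the value widened before and narrowed after. A standalone sketch of that promotion via bit-pattern shifts (the narrowing here truncates instead of rounding to nearest-even, which a production lowering would do):

// Illustrative sketch: bf16 is the top 16 bits of an f32 bit pattern.
#include <cstdint>
#include <cstdio>
#include <cstring>

static float bf16_to_f32(uint16_t B) {
  uint32_t Bits = static_cast<uint32_t>(B) << 16;
  float F;
  std::memcpy(&F, &Bits, sizeof(F));
  return F;
}

static uint16_t f32_to_bf16_trunc(float F) {
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof(Bits));
  return static_cast<uint16_t>(Bits >> 16); // truncation, not round-to-nearest-even
}

int main() {
  uint16_t A = 0x3F80; // bf16 pattern for 1.0f
  uint16_t B = 0x4000; // bf16 pattern for 2.0f
  float Sum = bf16_to_f32(A) + bf16_to_f32(B); // the "promoted" FADD
  std::printf("%g -> 0x%04X\n", Sum, unsigned(f32_to_bf16_trunc(Sum))); // 3 -> 0x4040
}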
2256
2257 if (!Subtarget.useSoftFloat() && Subtarget.hasBF16()) {
2258 addRegisterClass(MVT::v32bf16, &X86::VR512RegClass);
2259 setF16Action(MVT::v32bf16, Expand);
2260 setOperationAction(ISD::FADD, MVT::v32bf16, Expand);
2261 setOperationAction(ISD::FSUB, MVT::v32bf16, Expand);
2262 setOperationAction(ISD::FMUL, MVT::v32bf16, Expand);
2263 setOperationAction(ISD::FDIV, MVT::v32bf16, Expand);
2264 setOperationAction(ISD::BUILD_VECTOR, MVT::v32bf16, Custom);
2265 }
2266
2267 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
2268 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
2269 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
2270 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
2271 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
2272 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
2273
2274 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
2275 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
2276 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
2277 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
2278 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
2279
2280 if (Subtarget.hasBWI()) {
2281 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
2282 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
2283 }
2284
2285 if (Subtarget.hasFP16()) {
2286 // vcvttph2[u]dq v4f16 -> v4i32/64, v2f16 -> v2i32/64
2287 setOperationAction(ISD::FP_TO_SINT, MVT::v2f16, Custom);
2288 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f16, Custom);
2289 setOperationAction(ISD::FP_TO_UINT, MVT::v2f16, Custom);
2290 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f16, Custom);
2291 setOperationAction(ISD::FP_TO_SINT, MVT::v4f16, Custom);
2292 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4f16, Custom);
2293 setOperationAction(ISD::FP_TO_UINT, MVT::v4f16, Custom);
2294 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4f16, Custom);
2295 // vcvt[u]dq2ph v4i32/64 -> v4f16, v2i32/64 -> v2f16
2296 setOperationAction(ISD::SINT_TO_FP, MVT::v2f16, Custom);
2297 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f16, Custom);
2298 setOperationAction(ISD::UINT_TO_FP, MVT::v2f16, Custom);
2299 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f16, Custom);
2300 setOperationAction(ISD::SINT_TO_FP, MVT::v4f16, Custom);
2301 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4f16, Custom);
2302 setOperationAction(ISD::UINT_TO_FP, MVT::v4f16, Custom);
2303 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4f16, Custom);
2304 // vcvtps2phx v4f32 -> v4f16, v2f32 -> v2f16
2305 setOperationAction(ISD::FP_ROUND, MVT::v2f16, Custom);
2306 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v2f16, Custom);
2307 setOperationAction(ISD::FP_ROUND, MVT::v4f16, Custom);
2308 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f16, Custom);
2309 // vcvtph2psx v4f16 -> v4f32, v2f16 -> v2f32
2310 setOperationAction(ISD::FP_EXTEND, MVT::v2f16, Custom);
2311 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v2f16, Custom);
2312 setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Custom);
2313 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f16, Custom);
2314 }
2315
2316 setOperationAction(ISD::TRUNCATE, MVT::v16i32, Custom);
2317 setOperationAction(ISD::TRUNCATE, MVT::v8i64, Custom);
2318 setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
2319 }
2320
2321 if (Subtarget.hasAMXTILE()) {
2322 addRegisterClass(MVT::x86amx, &X86::TILERegClass);
2323 }
2324
2325 // We want to custom lower some of our intrinsics.
2326 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
2327 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
2328 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
2329 if (!Subtarget.is64Bit()) {
2330 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
2331 }
2332
2333 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
2334 // handle type legalization for these operations here.
2335 //
2336 // FIXME: We really should do custom legalization for addition and
2337 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
2338 // than generic legalization for 64-bit multiplication-with-overflow, though.
2339 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
2340 if (VT == MVT::i64 && !Subtarget.is64Bit())
2341 continue;
2342 // Add/Sub/Mul with overflow operations are custom lowered.
2343 setOperationAction(ISD::SADDO, VT, Custom);
2344 setOperationAction(ISD::UADDO, VT, Custom);
2345 setOperationAction(ISD::SSUBO, VT, Custom);
2346 setOperationAction(ISD::USUBO, VT, Custom);
2347 setOperationAction(ISD::SMULO, VT, Custom);
2348 setOperationAction(ISD::UMULO, VT, Custom);
2349
2350 // Support carry in as value rather than glue.
2351 setOperationAction(ISD::UADDO_CARRY, VT, Custom);
2352 setOperationAction(ISD::USUBO_CARRY, VT, Custom);
2353 setOperationAction(ISD::SETCCCARRY, VT, Custom);
2354 setOperationAction(ISD::SADDO_CARRY, VT, Custom);
2355 setOperationAction(ISD::SSUBO_CARRY, VT, Custom);
2356 }
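
The loop above custom-lowers the overflow nodes; each of them yields a result value plus an overflow flag. A portable sketch of what those flags mean for 32-bit addition (uaddo32/saddo32 are illustrative helpers, not X86's lowering):

// Illustrative sketch: add-with-overflow, unsigned and signed variants.
#include <cstdint>
#include <cstdio>

static bool uaddo32(uint32_t A, uint32_t B, uint32_t &Res) {
  Res = A + B;     // wraps modulo 2^32
  return Res < A;  // overflow iff the wrapped sum is smaller than an operand
}

static bool saddo32(int32_t A, int32_t B, int32_t &Res) {
  int64_t Wide = static_cast<int64_t>(A) + B;
  Res = static_cast<int32_t>(Wide); // two's-complement wrap of the low 32 bits
  return Wide != Res;               // overflow iff the value doesn't round-trip
}

int main() {
  uint32_t U; int32_t S;
  std::printf("uaddo: %d\n", uaddo32(0xFFFFFFFFu, 1u, U)); // 1 (overflow)
  std::printf("saddo: %d\n", saddo32(INT32_MAX, 1, S));    // 1 (overflow)
}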
2357
2358 if (!Subtarget.is64Bit()) {
2359 // These libcalls are not available in 32-bit.
2360 setLibcallName(RTLIB::SHL_I128, nullptr);
2361 setLibcallName(RTLIB::SRL_I128, nullptr);
2362 setLibcallName(RTLIB::SRA_I128, nullptr);
2363 setLibcallName(RTLIB::MUL_I128, nullptr);
2364 // The MULO libcall is not part of libgcc, only compiler-rt.
2365 setLibcallName(RTLIB::MULO_I64, nullptr);
2366 }
2367 // The MULO libcall is not part of libgcc, only compiler-rt.
2368 setLibcallName(RTLIB::MULO_I128, nullptr);
2369
2370 // Combine sin / cos into _sincos_stret if it is available.
2371 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
2372 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
2373 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
2374 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
2375 }
2376
2377 if (Subtarget.isTargetWin64()) {
2378 setOperationAction(ISD::SDIV, MVT::i128, Custom);
2379 setOperationAction(ISD::UDIV, MVT::i128, Custom);
2380 setOperationAction(ISD::SREM, MVT::i128, Custom);
2381 setOperationAction(ISD::UREM, MVT::i128, Custom);
2382 setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom);
2383 setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom);
2384 setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom);
2385 setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom);
2386 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i128, Custom);
2387 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i128, Custom);
2388 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i128, Custom);
2389 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i128, Custom);
2390 }
2391
2392 // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
2393 // is. We should promote the value to 64-bits to solve this.
2394 // This is what the CRT headers do - `fmodf` is an inline header
2395 // function casting to f64 and calling `fmod`.
2396 if (Subtarget.is32Bit() &&
2397 (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
2398 for (ISD::NodeType Op :
2399 {ISD::FCEIL, ISD::STRICT_FCEIL,
2400 ISD::FCOS, ISD::STRICT_FCOS,
2401 ISD::FEXP, ISD::STRICT_FEXP,
2402 ISD::FFLOOR, ISD::STRICT_FFLOOR,
2403 ISD::FREM, ISD::STRICT_FREM,
2404 ISD::FLOG, ISD::STRICT_FLOG,
2405 ISD::FLOG10, ISD::STRICT_FLOG10,
2406 ISD::FPOW, ISD::STRICT_FPOW,
2407 ISD::FSIN, ISD::STRICT_FSIN})
2408 if (isOperationExpand(Op, MVT::f32))
2409 setOperationAction(Op, MVT::f32, Promote);
2410
2411 // We have target-specific dag combine patterns for the following nodes:
2412 setTargetDAGCombine({ISD::VECTOR_SHUFFLE,
2413 ISD::SCALAR_TO_VECTOR,
2414 ISD::INSERT_VECTOR_ELT,
2415 ISD::EXTRACT_VECTOR_ELT,
2416 ISD::CONCAT_VECTORS,
2417 ISD::INSERT_SUBVECTOR,
2418 ISD::EXTRACT_SUBVECTOR,
2419 ISD::BITCAST,
2420 ISD::VSELECT,
2421 ISD::SELECT,
2422 ISD::SHL,
2423 ISD::SRA,
2424 ISD::SRL,
2425 ISD::OR,
2426 ISD::AND,
2427 ISD::ADD,
2428 ISD::FADD,
2429 ISD::FSUB,
2430 ISD::FNEG,
2431 ISD::FMA,
2432 ISD::STRICT_FMA,
2433 ISD::FMINNUM,
2434 ISD::FMAXNUM,
2435 ISD::SUB,
2436 ISD::LOAD,
2437 ISD::MLOAD,
2438 ISD::STORE,
2439 ISD::MSTORE,
2440 ISD::TRUNCATE,
2441 ISD::ZERO_EXTEND,
2442 ISD::ANY_EXTEND,
2443 ISD::SIGN_EXTEND,
2444 ISD::SIGN_EXTEND_INREG,
2445 ISD::ANY_EXTEND_VECTOR_INREG,
2446 ISD::SIGN_EXTEND_VECTOR_INREG,
2447 ISD::ZERO_EXTEND_VECTOR_INREG,
2448 ISD::SINT_TO_FP,
2449 ISD::UINT_TO_FP,
2450 ISD::STRICT_SINT_TO_FP,
2451 ISD::STRICT_UINT_TO_FP,
2452 ISD::SETCC,
2453 ISD::MUL,
2454 ISD::XOR,
2455 ISD::MSCATTER,
2456 ISD::MGATHER,
2457 ISD::FP16_TO_FP,
2458 ISD::FP_EXTEND,
2459 ISD::STRICT_FP_EXTEND,
2460 ISD::FP_ROUND,
2461 ISD::STRICT_FP_ROUND});
2462
2463 computeRegisterProperties(Subtarget.getRegisterInfo());
2464
2465 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
2466 MaxStoresPerMemsetOptSize = 8;
2467 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
2468 MaxStoresPerMemcpyOptSize = 4;
2469 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
2470 MaxStoresPerMemmoveOptSize = 4;
2471
2472 // TODO: These control memcmp expansion in CGP and could be raised higher, but
2473 // that needs to be benchmarked and balanced with the potential use of vector
2474 // load/store types (PR33329, PR33914).
2475 MaxLoadsPerMemcmp = 2;
2476 MaxLoadsPerMemcmpOptSize = 2;
2477
2478 // Default loop alignment, which can be overridden by -align-loops.
2479 setPrefLoopAlignment(Align(16));
2480
2481 // An out-of-order CPU can speculatively execute past a predictable branch,
2482 // but a conditional move could be stalled by an expensive earlier operation.
2483 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
2484 EnableExtLdPromotion = true;
2485 setPrefFunctionAlignment(Align(16));
2486
2487 verifyIntrinsicTables();
2488
2489 // Default to having -disable-strictnode-mutation on
2490 IsStrictFPEnabled = true;
2491}
2492
2493// This has so far only been implemented for 64-bit MachO.
2494bool X86TargetLowering::useLoadStackGuardNode() const {
2495 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
2496}
2497
2498bool X86TargetLowering::useStackGuardXorFP() const {
2499 // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
2500 return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO();
2501}
2502
2503SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
2504 const SDLoc &DL) const {
2505 EVT PtrTy = getPointerTy(DAG.getDataLayout());
2506 unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
2507 MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
2508 return SDValue(Node, 0);
2509}
2510
2511TargetLoweringBase::LegalizeTypeAction
2512X86TargetLowering::getPreferredVectorAction(MVT VT) const {
2513 if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() &&
2514 !Subtarget.hasBWI())
2515 return TypeSplitVector;
2516
2517 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2518 !Subtarget.hasF16C() && VT.getVectorElementType() == MVT::f16)
2519 return TypeSplitVector;
2520
2521 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2522 VT.getVectorElementType() != MVT::i1)
2523 return TypeWidenVector;
2524
2525 return TargetLoweringBase::getPreferredVectorAction(VT);
2526}
2527
2528static std::pair<MVT, unsigned>
2529handleMaskRegisterForCallingConv(unsigned NumElts, CallingConv::ID CC,
2530 const X86Subtarget &Subtarget) {
2531 // v2i1/v4i1/v8i1/v16i1 all pass in xmm registers unless the calling
2532 // convention is one that uses k registers.
2533 if (NumElts == 2)
2534 return {MVT::v2i64, 1};
2535 if (NumElts == 4)
2536 return {MVT::v4i32, 1};
2537 if (NumElts == 8 && CC != CallingConv::X86_RegCall &&
2538 CC != CallingConv::Intel_OCL_BI)
2539 return {MVT::v8i16, 1};
2540 if (NumElts == 16 && CC != CallingConv::X86_RegCall &&
2541 CC != CallingConv::Intel_OCL_BI)
2542 return {MVT::v16i8, 1};
2543 // v32i1 passes in ymm unless we have BWI and the calling convention is
2544 // regcall.
2545 if (NumElts == 32 && (!Subtarget.hasBWI() || CC != CallingConv::X86_RegCall))
2546 return {MVT::v32i8, 1};
2547 // Split v64i1 vectors if we don't have v64i8 available.
2548 if (NumElts == 64 && Subtarget.hasBWI() && CC != CallingConv::X86_RegCall) {
2549 if (Subtarget.useAVX512Regs())
2550 return {MVT::v64i8, 1};
2551 return {MVT::v32i8, 2};
2552 }
2553
2554 // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
2555 if (!isPowerOf2_32(NumElts) || (NumElts == 64 && !Subtarget.hasBWI()) ||
2556 NumElts > 64)
2557 return {MVT::i8, NumElts};
2558
2559 return {MVT::INVALID_SIMPLE_VALUE_TYPE, 0};
2560}
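The mapping above can be read as a small decision table from the element count of a vXi1 mask to the register type and register count used to pass it. A minimal standalone sketch of that table, using stand-in enum names rather than LLVM's MVT and CallingConv types (the names below are assumptions for illustration only):

#include <cstdio>

// Stand-ins for the real MVT / CallingConv values (illustrative only).
enum class RegVT { V2I64, V4I32, V8I16, V16I8, V32I8, V64I8, I8, Invalid };
enum class CC { C, X86_RegCall, Intel_OCL_BI };

struct MaskLowering { RegVT Type; unsigned NumRegs; };

static bool isPow2(unsigned N) { return N && (N & (N - 1)) == 0; }

// Mirrors the NumElts -> (type, count) mapping described above.
static MaskLowering mapMaskArg(unsigned NumElts, CC Conv, bool HasBWI,
                               bool UseAVX512Regs) {
  bool KRegCC = Conv == CC::X86_RegCall || Conv == CC::Intel_OCL_BI;
  if (NumElts == 2) return {RegVT::V2I64, 1};
  if (NumElts == 4) return {RegVT::V4I32, 1};
  if (NumElts == 8 && !KRegCC) return {RegVT::V8I16, 1};
  if (NumElts == 16 && !KRegCC) return {RegVT::V16I8, 1};
  if (NumElts == 32 && (!HasBWI || Conv != CC::X86_RegCall))
    return {RegVT::V32I8, 1};
  if (NumElts == 64 && HasBWI && Conv != CC::X86_RegCall)
    return UseAVX512Regs ? MaskLowering{RegVT::V64I8, 1}
                         : MaskLowering{RegVT::V32I8, 2};
  if (!isPow2(NumElts) || (NumElts == 64 && !HasBWI) || NumElts > 64)
    return {RegVT::I8, NumElts}; // break into i8 scalars
  return {RegVT::Invalid, 0};    // defer to generic lowering
}

int main() {
  MaskLowering L = mapMaskArg(64, CC::C, /*HasBWI=*/true,
                              /*UseAVX512Regs=*/false);
  std::printf("v64i1 -> %u part(s)\n", L.NumRegs); // prints 2
}

With BWI but without 512-bit registers enabled, a v64i1 argument ends up as two v32i8 parts, which is exactly the split handled by Passv64i1ArgInRegs further down.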
2561
2562MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
2563 CallingConv::ID CC,
2564 EVT VT) const {
2565 if (VT.isVector()) {
2566 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) {
2567 unsigned NumElts = VT.getVectorNumElements();
2568
2569 MVT RegisterVT;
2570 unsigned NumRegisters;
2571 std::tie(RegisterVT, NumRegisters) =
2572 handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
2573 if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
2574 return RegisterVT;
2575 }
2576
2577 if (VT.getVectorElementType() == MVT::f16 && VT.getVectorNumElements() < 8)
2578 return MVT::v8f16;
2579 }
2580
2581 // We will use more GPRs for f64 and f80 on 32 bits when x87 is disabled.
2582 if ((VT == MVT::f64 || VT == MVT::f80) && !Subtarget.is64Bit() &&
2583 !Subtarget.hasX87())
2584 return MVT::i32;
2585
2586 if (VT.isVector() && VT.getVectorElementType() == MVT::bf16)
2587 return getRegisterTypeForCallingConv(Context, CC,
2588 VT.changeVectorElementTypeToInteger());
2589
2590 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
2591}
2592
2593unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
2594 CallingConv::ID CC,
2595 EVT VT) const {
2596 if (VT.isVector()) {
2597 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) {
2598 unsigned NumElts = VT.getVectorNumElements();
2599
2600 MVT RegisterVT;
2601 unsigned NumRegisters;
2602 std::tie(RegisterVT, NumRegisters) =
2603 handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
2604 if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
2605 return NumRegisters;
2606 }
2607
2608 if (VT.getVectorElementType() == MVT::f16 && VT.getVectorNumElements() < 8)
2609 return 1;
2610 }
2611
2612 // We have to split f64 to 2 registers and f80 to 3 registers on 32 bits if
2613 // x87 is disabled.
2614 if (!Subtarget.is64Bit() && !Subtarget.hasX87()) {
2615 if (VT == MVT::f64)
2616 return 2;
2617 if (VT == MVT::f80)
2618 return 3;
2619 }
2620
2621 if (VT.isVector() && VT.getVectorElementType() == MVT::bf16)
2622 return getNumRegistersForCallingConv(Context, CC,
2623 VT.changeVectorElementTypeToInteger());
2624
2625 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
2626}
2627
2628unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv(
2629 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
2630 unsigned &NumIntermediates, MVT &RegisterVT) const {
2631 // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
2632 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
2633 Subtarget.hasAVX512() &&
2634 (!isPowerOf2_32(VT.getVectorNumElements()) ||
2635 (VT.getVectorNumElements() == 64 && !Subtarget.hasBWI()) ||
2636 VT.getVectorNumElements() > 64)) {
2637 RegisterVT = MVT::i8;
2638 IntermediateVT = MVT::i1;
2639 NumIntermediates = VT.getVectorNumElements();
2640 return NumIntermediates;
2641 }
2642
2643 // Split v64i1 vectors if we don't have v64i8 available.
2644 if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
2645 CC != CallingConv::X86_RegCall) {
2646 RegisterVT = MVT::v32i8;
2647 IntermediateVT = MVT::v32i1;
2648 NumIntermediates = 2;
2649 return 2;
2650 }
2651
2652 return TargetLowering::getVectorTypeBreakdownForCallingConv(Context, CC, VT, IntermediateVT,
2653 NumIntermediates, RegisterVT);
2654}
2655
2656EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
2657 LLVMContext& Context,
2658 EVT VT) const {
2659 if (!VT.isVector())
2660 return MVT::i8;
2661
2662 if (Subtarget.hasAVX512()) {
2663 // Figure out what this type will be legalized to.
2664 EVT LegalVT = VT;
2665 while (getTypeAction(Context, LegalVT) != TypeLegal)
2666 LegalVT = getTypeToTransformTo(Context, LegalVT);
2667
2668 // If we got a 512-bit vector then we'll definitely have a vXi1 compare.
2669 if (LegalVT.getSimpleVT().is512BitVector())
2670 return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount());
2671
2672 if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) {
2673 // If we legalized to less than a 512-bit vector, then we will use a vXi1
2674 // compare for vXi32/vXi64 for sure. If we have BWI we will also support
2675 // vXi16/vXi8.
2676 MVT EltVT = LegalVT.getSimpleVT().getVectorElementType();
2677 if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32)
2678 return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount());
2679 }
2680 }
2681
2682 return VT.changeVectorElementTypeToInteger();
2683}
2684
2685/// Helper for getByValTypeAlignment to determine
2686/// the desired ByVal argument alignment.
2687static void getMaxByValAlign(Type *Ty, Align &MaxAlign) {
2688 if (MaxAlign == 16)
2689 return;
2690 if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
2691 if (VTy->getPrimitiveSizeInBits().getFixedValue() == 128)
2692 MaxAlign = Align(16);
2693 } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
2694 Align EltAlign;
2695 getMaxByValAlign(ATy->getElementType(), EltAlign);
2696 if (EltAlign > MaxAlign)
2697 MaxAlign = EltAlign;
2698 } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
2699 for (auto *EltTy : STy->elements()) {
2700 Align EltAlign;
2701 getMaxByValAlign(EltTy, EltAlign);
2702 if (EltAlign > MaxAlign)
2703 MaxAlign = EltAlign;
2704 if (MaxAlign == 16)
2705 break;
2706 }
2707 }
2708}
2709
2710/// Return the desired alignment for ByVal aggregate
2711/// function arguments in the caller parameter area. For X86, aggregates
2712/// that contain SSE vectors are placed at 16-byte boundaries while the rest
2713/// are at 4-byte boundaries.
2714uint64_t X86TargetLowering::getByValTypeAlignment(Type *Ty,
2715 const DataLayout &DL) const {
2716 if (Subtarget.is64Bit()) {
2717 // Max of 8 and alignment of type.
2718 Align TyAlign = DL.getABITypeAlign(Ty);
2719 if (TyAlign > 8)
2720 return TyAlign.value();
2721 return 8;
2722 }
2723
2724 Align Alignment(4);
2725 if (Subtarget.hasSSE1())
2726 getMaxByValAlign(Ty, Alignment);
2727 return Alignment.value();
2728}
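As a worked illustration of the rule in the two functions above, here is a small self-contained sketch that operates on plain sizes instead of llvm::Type (the Elem struct is an assumption made only for this example): on 32-bit SSE targets an aggregate containing a 128-bit vector is placed at a 16-byte boundary, other aggregates stay at 4 bytes, and 64-bit targets use max(8, natural ABI alignment).

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

// Illustrative element description (a stand-in for llvm::Type): leaf types
// carry a size and a vector flag, aggregates carry their members.
struct Elem {
  bool IsVector = false;
  unsigned SizeInBits = 0;
  std::vector<Elem> Members; // non-empty for structs and arrays
};

// Mirrors getMaxByValAlign: a nested 128-bit vector forces 16-byte alignment.
static uint64_t maxByValAlign(const Elem &E, uint64_t Max = 4) {
  if (Max == 16)
    return 16;
  if (E.Members.empty())
    return (E.IsVector && E.SizeInBits == 128) ? 16 : Max;
  for (const Elem &M : E.Members) {
    Max = std::max(Max, maxByValAlign(M, Max));
    if (Max == 16)
      break;
  }
  return Max;
}

// Mirrors getByValTypeAlignment: 64-bit uses max(8, ABI alignment); 32-bit
// uses 4 bytes unless SSE is available and the aggregate contains a vector.
static uint64_t byValAlign(const Elem &E, bool Is64Bit, bool HasSSE1,
                           uint64_t ABITypeAlign) {
  if (Is64Bit)
    return std::max<uint64_t>(8, ABITypeAlign);
  return HasSSE1 ? maxByValAlign(E) : 4;
}

int main() {
  Elem M128{/*IsVector=*/true, 128, {}};
  Elem S{false, 0, {Elem{false, 32, {}}, M128}}; // struct { int; __m128; }
  std::printf("%llu\n",
              (unsigned long long)byValAlign(S, false, true, 16)); // 16
}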
2729
2730/// It returns EVT::Other if the type should be determined using generic
2731/// target-independent logic.
2732/// For vector ops we check that the overall size isn't larger than our
2733/// preferred vector width.
2734EVT X86TargetLowering::getOptimalMemOpType(
2735 const MemOp &Op, const AttributeList &FuncAttributes) const {
2736 if (!FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat)) {
2737 if (Op.size() >= 16 &&
2738 (!Subtarget.isUnalignedMem16Slow() || Op.isAligned(Align(16)))) {
2739 // FIXME: Check if unaligned 64-byte accesses are slow.
2740 if (Op.size() >= 64 && Subtarget.hasAVX512() &&
2741 (Subtarget.getPreferVectorWidth() >= 512)) {
2742 return Subtarget.hasBWI() ? MVT::v64i8 : MVT::v16i32;
2743 }
2744 // FIXME: Check if unaligned 32-byte accesses are slow.
2745 if (Op.size() >= 32 && Subtarget.hasAVX() &&
2746 Subtarget.useLight256BitInstructions()) {
2747 // Although this isn't a well-supported type for AVX1, we'll let
2748 // legalization and shuffle lowering produce the optimal codegen. If we
2749 // choose an optimal type with a vector element larger than a byte,
2750 // getMemsetStores() may create an intermediate splat (using an integer
2751 // multiply) before we splat as a vector.
2752 return MVT::v32i8;
2753 }
2754 if (Subtarget.hasSSE2() && (Subtarget.getPreferVectorWidth() >= 128))
2755 return MVT::v16i8;
2756 // TODO: Can SSE1 handle a byte vector?
2757 // If we have SSE1 registers we should be able to use them.
2758 if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()) &&
2759 (Subtarget.getPreferVectorWidth() >= 128))
2760 return MVT::v4f32;
2761 } else if (((Op.isMemcpy() && !Op.isMemcpyStrSrc()) || Op.isZeroMemset()) &&
2762 Op.size() >= 8 && !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
2763 // Do not use f64 to lower memcpy if source is string constant. It's
2764 // better to use i32 to avoid the loads.
2765 // Also, do not use f64 to lower memset unless this is a memset of zeros.
2766 // The gymnastics of splatting a byte value into an XMM register and then
2767 // only using 8-byte stores (because this is a CPU with slow unaligned
2768 // 16-byte accesses) makes that a loser.
2769 return MVT::f64;
2770 }
2771 }
2772 // This is a compromise. If we reach here, unaligned accesses may be slow on
2773 // this target. However, creating smaller, aligned accesses could be even
2774 // slower and would certainly be a lot more code.
2775 if (Subtarget.is64Bit() && Op.size() >= 8)
2776 return MVT::i64;
2777 return MVT::i32;
2778}
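The priority order in getOptimalMemOpType (512-bit with AVX-512, then 256-bit with AVX, then 128-bit with SSE2/SSE1, then scalar i64/i32) can be summarised in a standalone decision function. The feature struct and string return values below are simplified stand-ins, and the NoImplicitFloat and f64/memset-of-zero special cases are deliberately omitted:

#include <cstdio>
#include <string>

struct Features {
  bool AVX512 = false, BWI = false, AVX = false, SSE2 = false, SSE1 = false;
  bool Is64Bit = false, X87 = false;
  bool UnalignedMem16Slow = false, Light256Bit = true;
  unsigned PreferVectorWidth = 256;
};

// Simplified mirror of the memop-type priority order described above.
static std::string optimalMemOpType(unsigned Size, bool Aligned16,
                                    const Features &F) {
  if (Size >= 16 && (!F.UnalignedMem16Slow || Aligned16)) {
    if (Size >= 64 && F.AVX512 && F.PreferVectorWidth >= 512)
      return F.BWI ? "v64i8" : "v16i32";
    if (Size >= 32 && F.AVX && F.Light256Bit)
      return "v32i8";
    if (F.SSE2 && F.PreferVectorWidth >= 128)
      return "v16i8";
    if (F.SSE1 && (F.Is64Bit || F.X87) && F.PreferVectorWidth >= 128)
      return "v4f32";
  }
  return (F.Is64Bit && Size >= 8) ? "i64" : "i32";
}

int main() {
  Features F;
  F.SSE2 = F.SSE1 = true;
  std::printf("%s\n", optimalMemOpType(64, true, F).c_str()); // v16i8
}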
2779
2780bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
2781 if (VT == MVT::f32)
2782 return Subtarget.hasSSE1();
2783 if (VT == MVT::f64)
2784 return Subtarget.hasSSE2();
2785 return true;
2786}
2787
2788static bool isBitAligned(Align Alignment, uint64_t SizeInBits) {
2789 return (8 * Alignment.value()) % SizeInBits == 0;
2790}
2791
2792bool X86TargetLowering::isMemoryAccessFast(EVT VT, Align Alignment) const {
2793 if (isBitAligned(Alignment, VT.getSizeInBits()))
2794 return true;
2795 switch (VT.getSizeInBits()) {
2796 default:
2797 // 8-byte and under are always assumed to be fast.
2798 return true;
2799 case 128:
2800 return !Subtarget.isUnalignedMem16Slow();
2801 case 256:
2802 return !Subtarget.isUnalignedMem32Slow();
2803 // TODO: What about AVX-512 (512-bit) accesses?
2804 }
2805}
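In other words, an access is treated as fast whenever its alignment covers its size in bits; otherwise only the 128-bit and 256-bit unaligned-slow subtarget flags can make it slow. A tiny worked sketch with plain integers in place of llvm::Align and EVT:

#include <cassert>

static bool isBitAligned(unsigned AlignBytes, unsigned SizeInBits) {
  return (8u * AlignBytes) % SizeInBits == 0;
}

// Mirrors isMemoryAccessFast, with the two subtarget flags as parameters.
static bool memoryAccessFast(unsigned SizeInBits, unsigned AlignBytes,
                             bool Unaligned16Slow, bool Unaligned32Slow) {
  if (isBitAligned(AlignBytes, SizeInBits))
    return true;
  switch (SizeInBits) {
  default:  return true;                 // 8 bytes and under: assumed fast
  case 128: return !Unaligned16Slow;
  case 256: return !Unaligned32Slow;
  }
}

int main() {
  // A 256-bit access at 16-byte alignment is only "fast" when the CPU does
  // not penalise unaligned 32-byte accesses.
  assert(memoryAccessFast(256, 16, false, false));
  assert(!memoryAccessFast(256, 16, false, true));
  assert(memoryAccessFast(256, 32, false, true)); // fully aligned: always fast
}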
2806
2807bool X86TargetLowering::allowsMisalignedMemoryAccesses(
2808 EVT VT, unsigned, Align Alignment, MachineMemOperand::Flags Flags,
2809 unsigned *Fast) const {
2810 if (Fast)
2811 *Fast = isMemoryAccessFast(VT, Alignment);
2812 // NonTemporal vector memory ops must be aligned.
2813 if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
2814 // NT loads can only be vector aligned, so if it's less aligned than the
2815 // minimum vector size (which we can split the vector down to), we might as
2816 // well use a regular unaligned vector load.
2817 // We don't have any NT loads pre-SSE41.
2818 if (!!(Flags & MachineMemOperand::MOLoad))
2819 return (Alignment < 16 || !Subtarget.hasSSE41());
2820 return false;
2821 }
2822 // Misaligned accesses of any size are always allowed.
2823 return true;
2824}
2825
2826bool X86TargetLowering::allowsMemoryAccess(LLVMContext &Context,
2827 const DataLayout &DL, EVT VT,
2828 unsigned AddrSpace, Align Alignment,
2829 MachineMemOperand::Flags Flags,
2830 unsigned *Fast) const {
2831 if (Fast)
2832 *Fast = isMemoryAccessFast(VT, Alignment);
2833 if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
2834 if (allowsMisalignedMemoryAccesses(VT, AddrSpace, Alignment, Flags,
2835 /*Fast=*/nullptr))
2836 return true;
2837 // NonTemporal vector memory ops are special, and must be aligned.
2838 if (!isBitAligned(Alignment, VT.getSizeInBits()))
2839 return false;
2840 switch (VT.getSizeInBits()) {
2841 case 128:
2842 if (!!(Flags & MachineMemOperand::MOLoad) && Subtarget.hasSSE41())
2843 return true;
2844 if (!!(Flags & MachineMemOperand::MOStore) && Subtarget.hasSSE2())
2845 return true;
2846 return false;
2847 case 256:
2848 if (!!(Flags & MachineMemOperand::MOLoad) && Subtarget.hasAVX2())
2849 return true;
2850 if (!!(Flags & MachineMemOperand::MOStore) && Subtarget.hasAVX())
2851 return true;
2852 return false;
2853 case 512:
2854 if (Subtarget.hasAVX512())
2855 return true;
2856 return false;
2857 default:
2858 return false; // Don't have NonTemporal vector memory ops of this size.
2859 }
2860 }
2861 return true;
2862}
2863
2864/// Return the entry encoding for a jump table in the
2865/// current function. The returned value is a member of the
2866/// MachineJumpTableInfo::JTEntryKind enum.
2867unsigned X86TargetLowering::getJumpTableEncoding() const {
2868 // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
2869 // symbol.
2870 if (isPositionIndependent() && Subtarget.isPICStyleGOT())
2871 return MachineJumpTableInfo::EK_Custom32;
2872
2873 // Otherwise, use the normal jump table encoding heuristics.
2874 return TargetLowering::getJumpTableEncoding();
2875}
2876
2877bool X86TargetLowering::splitValueIntoRegisterParts(
2878 SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
2879 unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const {
2880 bool IsABIRegCopy = CC.has_value();
2881 EVT ValueVT = Val.getValueType();
2882 if (IsABIRegCopy && ValueVT == MVT::bf16 && PartVT == MVT::f32) {
2883 unsigned ValueBits = ValueVT.getSizeInBits();
2884 unsigned PartBits = PartVT.getSizeInBits();
2885 Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(ValueBits), Val);
2886 Val = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::getIntegerVT(PartBits), Val);
2887 Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val);
2888 Parts[0] = Val;
2889 return true;
2890 }
2891 return false;
2892}
2893
2894SDValue X86TargetLowering::joinRegisterPartsIntoValue(
2895 SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts,
2896 MVT PartVT, EVT ValueVT, std::optional<CallingConv::ID> CC) const {
2897 bool IsABIRegCopy = CC.has_value();
2898 if (IsABIRegCopy && ValueVT == MVT::bf16 && PartVT == MVT::f32) {
2899 unsigned ValueBits = ValueVT.getSizeInBits();
2900 unsigned PartBits = PartVT.getSizeInBits();
2901 SDValue Val = Parts[0];
2902
2903 Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(PartBits), Val);
2904 Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::getIntegerVT(ValueBits), Val);
2905 Val = DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
2906 return Val;
2907 }
2908 return SDValue();
2909}
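Both hooks above move a bf16 value through an f32 register as a pure bit pattern: the 16 payload bits are widened into, or truncated out of, the low half of the 32-bit register with no numeric conversion. A standalone round-trip sketch using memcpy in place of ISD::BITCAST (the any-extend's upper bits are modelled as zero here, whereas the real lowering leaves them unspecified):

#include <cassert>
#include <cstdint>
#include <cstring>

// bf16 payload -> f32 register part: widen the 16-bit pattern into the low
// bits of a 32-bit value, then reinterpret those bits as a float.
static float splitBF16ToF32Part(uint16_t BF16Bits) {
  uint32_t Wide = BF16Bits;
  float Part;
  std::memcpy(&Part, &Wide, sizeof(Part));
  return Part;
}

// f32 register part -> bf16 payload: reinterpret as an integer and truncate.
static uint16_t joinF32PartToBF16(float Part) {
  uint32_t Wide;
  std::memcpy(&Wide, &Part, sizeof(Wide));
  return static_cast<uint16_t>(Wide);
}

int main() {
  uint16_t Orig = 0x3FC0; // bf16 bit pattern for 1.5
  assert(joinF32PartToBF16(splitBF16ToF32Part(Orig)) == Orig);
}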
2910
2911bool X86TargetLowering::useSoftFloat() const {
2912 return Subtarget.useSoftFloat();
2913}
2914
2915void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
2916 ArgListTy &Args) const {
2917
2918 // Only relabel X86-32 for C / Stdcall CCs.
2919 if (Subtarget.is64Bit())
2920 return;
2921 if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
2922 return;
2923 unsigned ParamRegs = 0;
2924 if (auto *M = MF->getFunction().getParent())
2925 ParamRegs = M->getNumberRegisterParameters();
2926
2927 // Mark the first N integer arguments as being passed in registers.
2928 for (auto &Arg : Args) {
2929 Type *T = Arg.Ty;
2930 if (T->isIntOrPtrTy())
2931 if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
2932 unsigned numRegs = 1;
2933 if (MF->getDataLayout().getTypeAllocSize(T) > 4)
2934 numRegs = 2;
2935 if (ParamRegs < numRegs)
2936 return;
2937 ParamRegs -= numRegs;
2938 Arg.IsInReg = true;
2939 }
2940 }
2941}
2942
2943const MCExpr *
2944X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
2945 const MachineBasicBlock *MBB,
2946 unsigned uid,MCContext &Ctx) const{
2947 assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
2948 // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
2949 // entries.
2950 return MCSymbolRefExpr::create(MBB->getSymbol(),
2951 MCSymbolRefExpr::VK_GOTOFF, Ctx);
2952}
2953
2954/// Returns relocation base for the given PIC jumptable.
2955SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
2956 SelectionDAG &DAG) const {
2957 if (!Subtarget.is64Bit())
2958 // This doesn't have SDLoc associated with it, but is not really the
2959 // same as a Register.
2960 return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
2961 getPointerTy(DAG.getDataLayout()));
2962 return Table;
2963}
2964
2965/// This returns the relocation base for the given PIC jumptable,
2966/// the same as getPICJumpTableRelocBase, but as an MCExpr.
2967const MCExpr *X86TargetLowering::
2968getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
2969 MCContext &Ctx) const {
2970 // X86-64 uses RIP relative addressing based on the jump table label.
2971 if (Subtarget.isPICStyleRIPRel())
2972 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
2973
2974 // Otherwise, the reference is relative to the PIC base.
2975 return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
2976}
2977
2978std::pair<const TargetRegisterClass *, uint8_t>
2979X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
2980 MVT VT) const {
2981 const TargetRegisterClass *RRC = nullptr;
2982 uint8_t Cost = 1;
2983 switch (VT.SimpleTy) {
2984 default:
2985 return TargetLowering::findRepresentativeClass(TRI, VT);
2986 case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
2987 RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
2988 break;
2989 case MVT::x86mmx:
2990 RRC = &X86::VR64RegClass;
2991 break;
2992 case MVT::f32: case MVT::f64:
2993 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
2994 case MVT::v4f32: case MVT::v2f64:
2995 case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
2996 case MVT::v8f32: case MVT::v4f64:
2997 case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
2998 case MVT::v16f32: case MVT::v8f64:
2999 RRC = &X86::VR128XRegClass;
3000 break;
3001 }
3002 return std::make_pair(RRC, Cost);
3003}
3004
3005unsigned X86TargetLowering::getAddressSpace() const {
3006 if (Subtarget.is64Bit())
3007 return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
3008 return 256;
3009}
3010
3011static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
3012 return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
3013 (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
3014}
3015
3016static Constant* SegmentOffset(IRBuilderBase &IRB,
3017 int Offset, unsigned AddressSpace) {
3018 return ConstantExpr::getIntToPtr(
3019 ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
3020 Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
3021}
3022
3023Value *X86TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const {
3024 // glibc, bionic, and Fuchsia have a special slot for the stack guard in
3025 // tcbhead_t; use it instead of the usual global variable (see
3026 // sysdeps/{i386,x86_64}/nptl/tls.h)
3027 if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
3028 if (Subtarget.isTargetFuchsia()) {
3029 // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
3030 return SegmentOffset(IRB, 0x10, getAddressSpace());
3031 } else {
3032 unsigned AddressSpace = getAddressSpace();
3033 Module *M = IRB.GetInsertBlock()->getParent()->getParent();
3034 // Note that some users may customize the base register and offset.
3035 int Offset = M->getStackProtectorGuardOffset();
3036 // If -stack-protector-guard-offset is not set, the default is
3037 // %fs:0x28, unless we're using a Kernel code model, in which case
3038 // it's %gs:0x28; %gs:0x14 on i386.
3039 if (Offset == INT_MAX)
3040 Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
3041
3042 StringRef GuardReg = M->getStackProtectorGuardReg();
3043 if (GuardReg == "fs")
3044 AddressSpace = X86AS::FS;
3045 else if (GuardReg == "gs")
3046 AddressSpace = X86AS::GS;
3047
3048 // Use the symbol guard if the user specified one.
3049 StringRef GuardSymb = M->getStackProtectorGuardSymbol();
3050 if (!GuardSymb.empty()) {
3051 GlobalVariable *GV = M->getGlobalVariable(GuardSymb);
3052 if (!GV) {
3053 Type *Ty = Subtarget.is64Bit() ? Type::getInt64Ty(M->getContext())
3054 : Type::getInt32Ty(M->getContext());
3055 GV = new GlobalVariable(*M, Ty, false, GlobalValue::ExternalLinkage,
3056 nullptr, GuardSymb, nullptr,
3057 GlobalValue::NotThreadLocal, AddressSpace);
3058 }
3059 return GV;
3060 }
3061
3062 return SegmentOffset(IRB, Offset, AddressSpace);
3063 }
3064 }
3065 return TargetLowering::getIRStackGuard(IRB);
3066}
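The defaults above resolve to the glibc/bionic TLS stack-guard slot: %fs:0x28 on x86-64 (%gs:0x28 under the kernel code model), %gs:0x14 on i386, and a fixed 0x10 offset on Fuchsia. A minimal sketch of just that offset selection, ignoring the guard-register and guard-symbol overrides (all names below are illustrative and no IR is built):

#include <climits>
#include <cstdio>

struct GuardSlot {
  const char *Segment; // which segment register the slot lives behind
  int Offset;          // byte offset within that segment
};

// Mirrors the default-offset selection in getIRStackGuard. CustomOffset
// models -stack-protector-guard-offset (INT_MAX means "not set").
static GuardSlot stackGuardSlot(bool IsFuchsia, bool Is64Bit,
                                bool KernelCodeModel,
                                int CustomOffset = INT_MAX) {
  if (IsFuchsia)
    return {"fs", 0x10}; // ZX_TLS_STACK_GUARD_OFFSET
  int Offset = CustomOffset != INT_MAX ? CustomOffset
                                       : (Is64Bit ? 0x28 : 0x14);
  const char *Seg = Is64Bit ? (KernelCodeModel ? "gs" : "fs") : "gs";
  return {Seg, Offset};
}

int main() {
  GuardSlot S = stackGuardSlot(/*IsFuchsia=*/false, /*Is64Bit=*/true,
                               /*KernelCodeModel=*/false);
  std::printf("%%%s:0x%x\n", S.Segment, S.Offset); // prints %fs:0x28
}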
3067
3068void X86TargetLowering::insertSSPDeclarations(Module &M) const {
3069 // MSVC CRT provides functionalities for stack protection.
3070 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
3071 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
3072 // MSVC CRT has a global variable holding security cookie.
3073 M.getOrInsertGlobal("__security_cookie",
3074 Type::getInt8PtrTy(M.getContext()));
3075
3076 // MSVC CRT has a function to validate security cookie.
3077 FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
3078 "__security_check_cookie", Type::getVoidTy(M.getContext()),
3079 Type::getInt8PtrTy(M.getContext()));
3080 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
3081 F->setCallingConv(CallingConv::X86_FastCall);
3082 F->addParamAttr(0, Attribute::AttrKind::InReg);
3083 }
3084 return;
3085 }
3086
3087 StringRef GuardMode = M.getStackProtectorGuard();
3088
3089 // glibc, bionic, and Fuchsia have a special slot for the stack guard.
3090 if ((GuardMode == "tls" || GuardMode.empty()) &&
3091 hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
3092 return;
3093 TargetLowering::insertSSPDeclarations(M);
3094}
3095
3096Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
3097 // MSVC CRT has a global variable holding security cookie.
3098 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
3099 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
3100 return M.getGlobalVariable("__security_cookie");
3101 }
3102 return TargetLowering::getSDagStackGuard(M);
3103}
3104
3105Function *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
3106 // MSVC CRT has a function to validate security cookie.
3107 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
3108 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
3109 return M.getFunction("__security_check_cookie");
3110 }
3111 return TargetLowering::getSSPStackGuardCheck(M);
3112}
3113
3114Value *
3115X86TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
3116 if (Subtarget.getTargetTriple().isOSContiki())
3117 return getDefaultSafeStackPointerLocation(IRB, false);
3118
3119 // Android provides a fixed TLS slot for the SafeStack pointer. See the
3120 // definition of TLS_SLOT_SAFESTACK in
3121 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
3122 if (Subtarget.isTargetAndroid()) {
3123 // %fs:0x48, unless we're using a Kernel code model, in which case it's
3124 // %gs:0x48; %gs:0x24 on i386.
3125 int Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
3126 return SegmentOffset(IRB, Offset, getAddressSpace());
3127 }
3128
3129 // Fuchsia is similar.
3130 if (Subtarget.isTargetFuchsia()) {
3131 // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
3132 return SegmentOffset(IRB, 0x18, getAddressSpace());
3133 }
3134
3135 return TargetLowering::getSafeStackPointerLocation(IRB);
3136}
3137
3138//===----------------------------------------------------------------------===//
3139// Return Value Calling Convention Implementation
3140//===----------------------------------------------------------------------===//
3141
3142bool X86TargetLowering::CanLowerReturn(
3143 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
3144 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
3145 SmallVector<CCValAssign, 16> RVLocs;
3146 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
3147 return CCInfo.CheckReturn(Outs, RetCC_X86);
3148}
3149
3150const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
3151 static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
3152 return ScratchRegs;
3153}
3154
3155ArrayRef<MCPhysReg> X86TargetLowering::getRoundingControlRegisters() const {
3156 // FIXME: We should def X86::FPCW for x87 as well. But it affects a lot of lit
3157 // tests at the moment, which is not what we expected.
3158 static const MCPhysReg RCRegs[] = {X86::MXCSR};
3159 return RCRegs;
3160}
3161
3162/// Lowers mask values (v*i1) to the local register values.
3163/// \returns the DAG node after lowering to the register type.
3164static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
3165 const SDLoc &Dl, SelectionDAG &DAG) {
3166 EVT ValVT = ValArg.getValueType();
3167
3168 if (ValVT == MVT::v1i1)
3169 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Dl, ValLoc, ValArg,
3170 DAG.getIntPtrConstant(0, Dl));
3171
3172 if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
3173 (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
3174 // Two stage lowering might be required
3175 // bitcast: v8i1 -> i8 / v16i1 -> i16
3176 // anyextend: i8 -> i32 / i16 -> i32
3177 EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
3178 SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
3179 if (ValLoc == MVT::i32)
3180 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
3181 return ValToCopy;
3182 }
3183
3184 if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
3185 (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
3186 // One stage lowering is required
3187 // bitcast: v32i1 -> i32 / v64i1 -> i64
3188 return DAG.getBitcast(ValLoc, ValArg);
3189 }
3190
3191 return DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValArg);
3192}
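The two-stage path above (bitcast v8i1 to i8 or v16i1 to i16, then any-extend to i32) is simply bit-packing followed by widening. A standalone illustration with plain booleans, independent of SelectionDAG:

#include <cassert>
#include <cstdint>

// Stage 1: "bitcast" a v8i1 mask to i8 by packing one bit per lane,
// lane 0 in the least significant bit.
static uint8_t packV8I1(const bool Lanes[8]) {
  uint8_t Bits = 0;
  for (int I = 0; I < 8; ++I)
    Bits |= static_cast<uint8_t>(Lanes[I]) << I;
  return Bits;
}

// Stage 2: "any-extend" the i8 into the i32 location. The upper 24 bits are
// unspecified in the real lowering; zero is used here.
static uint32_t anyExtendToI32(uint8_t Bits) { return Bits; }

int main() {
  bool Mask[8] = {true, false, true, true, false, false, false, true};
  uint8_t Packed = packV8I1(Mask);
  assert(Packed == 0b10001101);
  assert(anyExtendToI32(Packed) == 0x8Du);
}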
3193
3194/// Breaks v64i1 value into two registers and adds the new node to the DAG
3195static void Passv64i1ArgInRegs(
3196 const SDLoc &Dl, SelectionDAG &DAG, SDValue &Arg,
3197 SmallVectorImpl<std::pair<Register, SDValue>> &RegsToPass, CCValAssign &VA,
3198 CCValAssign &NextVA, const X86Subtarget &Subtarget) {
3199 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
3200 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
3201 assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
3202 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
3203 "The value should reside in two registers");
3204
3205 // Before splitting the value we cast it to i64
3206 Arg = DAG.getBitcast(MVT::i64, Arg);
3207
3208 // Splitting the value into two i32 types
3209 SDValue Lo, Hi;
3210 std::tie(Lo, Hi) = DAG.SplitScalar(Arg, Dl, MVT::i32, MVT::i32);
3211
3212 // Attach the two i32 types into corresponding registers
3213 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
3214 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
3215}
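On a 32-bit target a v64i1 argument is therefore passed as the low and high 32-bit halves of its i64 bit pattern in two GPRs. A plain-integer sketch of the split that DAG.SplitScalar performs here:

#include <cassert>
#include <cstdint>
#include <utility>

// Split a 64-bit mask bit pattern into (Lo, Hi) 32-bit halves, matching the
// order in which the two registers are pushed above.
static std::pair<uint32_t, uint32_t> splitV64I1(uint64_t MaskBits) {
  uint32_t Lo = static_cast<uint32_t>(MaskBits);
  uint32_t Hi = static_cast<uint32_t>(MaskBits >> 32);
  return {Lo, Hi};
}

int main() {
  auto [Lo, Hi] = splitV64I1(0xDEADBEEF00C0FFEEull);
  assert(Lo == 0x00C0FFEEu && Hi == 0xDEADBEEFu);
}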
3216
3217SDValue
3218X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
3219 bool isVarArg,
3220 const SmallVectorImpl<ISD::OutputArg> &Outs,
3221 const SmallVectorImpl<SDValue> &OutVals,
3222 const SDLoc &dl, SelectionDAG &DAG) const {
3223 MachineFunction &MF = DAG.getMachineFunction();
3224 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
3225
3226 // In some cases we need to disable registers from the default CSR list.
3227 // For example, when they are used as return registers (preserve_* and X86's
3228 // regcall) or for argument passing (X86's regcall).
3229 bool ShouldDisableCalleeSavedRegister =
3230 shouldDisableRetRegFromCSR(CallConv) ||
3231 MF.getFunction().hasFnAttribute("no_caller_saved_registers");
3232
3233 if (CallConv == CallingConv::X86_INTR && !Outs.empty())
3234 report_fatal_error("X86 interrupts may not return any value");
3235
3236 SmallVector<CCValAssign, 16> RVLocs;
3237 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
3238 CCInfo.AnalyzeReturn(Outs, RetCC_X86);
3239
3240 SmallVector<std::pair<Register, SDValue>, 4> RetVals;
3241 for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
3242 ++I, ++OutsIndex) {
3243 CCValAssign &VA = RVLocs[I];
3244 assert(VA.isRegLoc() && "Can only return in registers!");
3245
3246 // Add the register to the CalleeSaveDisableRegs list.
3247 if (ShouldDisableCalleeSavedRegister)
3248 MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());
3249
3250 SDValue ValToCopy = OutVals[OutsIndex];
3251 EVT ValVT = ValToCopy.getValueType();
3252
3253 // Promote values to the appropriate types.
3254 if (VA.getLocInfo() == CCValAssign::SExt)
3255 ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
3256 else if (VA.getLocInfo() == CCValAssign::ZExt)
3257 ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
3258 else if (VA.getLocInfo() == CCValAssign::AExt) {
3259 if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
3260 ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
3261 else
3262 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
3263 }
3264 else if (VA.getLocInfo() == CCValAssign::BCvt)
3265 ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
3266
3267 assert(VA.getLocInfo() != CCValAssign::FPExt &&
3268 "Unexpected FP-extend for return value.");
3269
3270 // Report an error if we have attempted to return a value via an XMM
3271 // register and SSE was disabled.
3272 if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
3273 errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
3274 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
3275 } else if (!Subtarget.hasSSE2() &&
3276 X86::FR64XRegClass.contains(VA.getLocReg()) &&
3277 ValVT == MVT::f64) {
3278 // When returning a double via an XMM register, report an error if SSE2 is
3279 // not enabled.
3280 errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
3281 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
3282 }
3283
3284 // Returns in ST0/ST1 are handled specially: these are pushed as operands to
3285 // the RET instruction and handled by the FP Stackifier.
3286 if (VA.getLocReg() == X86::FP0 ||
3287 VA.getLocReg() == X86::FP1) {
3288 // If this is a copy from an xmm register to ST(0), use an FPExtend to
3289 // change the value to the FP stack register class.
3290 if (isScalarFPTypeInSSEReg(VA.getValVT()))
3291 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
3292 RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
3293 // Don't emit a copytoreg.
3294 continue;
3295 }
3296
3297 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
3298 // which is returned in RAX / RDX.
3299 if (Subtarget.is64Bit()) {
3300 if (ValVT == MVT::x86mmx) {
3301 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
3302 ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
3303 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
3304 ValToCopy);
3305 // If we don't have SSE2 available, convert to v4f32 so the generated
3306 // register is legal.
3307 if (!Subtarget.hasSSE2())
3308 ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
3309 }
3310 }
3311 }
3312
3313 if (VA.needsCustom()) {
3314 assert(VA.getValVT() == MVT::v64i1 &&
3315 "Currently the only custom case is when we split v64i1 to 2 regs");
3316
3317 Passv64i1ArgInRegs(dl, DAG, ValToCopy, RetVals, VA, RVLocs[++I],
3318 Subtarget);
3319
3320 // Add the second register to the CalleeSaveDisableRegs list.
3321 if (ShouldDisableCalleeSavedRegister)
3322 MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
3323 } else {
3324 RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
3325 }
3326 }
3327
3328 SDValue Glue;
3329 SmallVector<SDValue, 6> RetOps;
3330 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3331 // Operand #1 = Bytes To Pop
3332 RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
3333 MVT::i32));
3334
3335 // Copy the result values into the output registers.
3336 for (auto &RetVal : RetVals) {
3337 if (RetVal.first == X86::FP0 || RetVal.first == X86::FP1) {
3338 RetOps.push_back(RetVal.second);
3339 continue; // Don't emit a copytoreg.
3340 }
3341
3342 Chain = DAG.getCopyToReg(Chain, dl, RetVal.first, RetVal.second, Glue);
3343 Glue = Chain.getValue(1);
3344 RetOps.push_back(
3345 DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
3346 }
3347
3348 // Swift calling convention does not require we copy the sret argument
3349 // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
3350
3351 // All x86 ABIs require that for returning structs by value we copy
3352 // the sret argument into %rax/%eax (depending on ABI) for the return.
3353 // We saved the argument into a virtual register in the entry block,
3354 // so now we copy the value out and into %rax/%eax.
3355 //
3356 // Checking Function.hasStructRetAttr() here is insufficient because the IR
3357 // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
3358 // false, then an sret argument may be implicitly inserted in the SelDAG. In
3359 // either case FuncInfo->setSRetReturnReg() will have been called.
3360 if (Register SRetReg = FuncInfo->getSRetReturnReg()) {
3361 // When we have both sret and another return value, we should use the
3362 // original Chain stored in RetOps[0], instead of the current Chain updated
3363 // in the above loop. If we only have sret, RetOps[0] equals to Chain.
3364
3365 // For the case of sret and another return value, we have
3366 // Chain_0 at the function entry
3367 // Chain_1 = getCopyToReg(Chain_0) in the above loop
3368 // If we use Chain_1 in getCopyFromReg, we will have
3369 // Val = getCopyFromReg(Chain_1)
3370 // Chain_2 = getCopyToReg(Chain_1, Val) from below
3371
3372 // getCopyToReg(Chain_0) will be glued together with
3373 // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
3374 // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
3375 // Data dependency from Unit B to Unit A due to usage of Val in
3376 // getCopyToReg(Chain_1, Val)
3377 // Chain dependency from Unit A to Unit B
3378
3379 // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
3380 SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
3381 getPointerTy(MF.getDataLayout()));
3382
3383 Register RetValReg
3384 = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
3385 X86::RAX : X86::EAX;
3386 Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Glue);
3387 Glue = Chain.getValue(1);
3388
3389 // RAX/EAX now acts like a return value.
3390 RetOps.push_back(
3391 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
3392
3393 // Add the returned register to the CalleeSaveDisableRegs list. Don't do
3394 // this however for preserve_most/preserve_all to minimize the number of
3395 // callee-saved registers for these CCs.
3396 if (ShouldDisableCalleeSavedRegister &&
3397 CallConv != CallingConv::PreserveAll &&
3398 CallConv != CallingConv::PreserveMost)
3399 MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
3400 }
3401
3402 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
3403 const MCPhysReg *I =
3404 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3405 if (I) {
3406 for (; *I; ++I) {
3407 if (X86::GR64RegClass.contains(*I))
3408 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3409 else
3410 llvm_unreachable("Unexpected register class in CSRsViaCopy!")::llvm::llvm_unreachable_internal("Unexpected register class in CSRsViaCopy!"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 3410)
;
3411 }
3412 }
3413
3414 RetOps[0] = Chain; // Update chain.
3415
3416 // Add the glue if we have it.
3417 if (Glue.getNode())
3418 RetOps.push_back(Glue);
3419
3420 X86ISD::NodeType opcode = X86ISD::RET_GLUE;
3421 if (CallConv == CallingConv::X86_INTR)
3422 opcode = X86ISD::IRET;
3423 return DAG.getNode(opcode, dl, MVT::Other, RetOps);
3424}
3425
3426bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
3427 if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
3428 return false;
3429
3430 SDValue TCChain = Chain;
3431 SDNode *Copy = *N->use_begin();
3432 if (Copy->getOpcode() == ISD::CopyToReg) {
3433 // If the copy has a glue operand, we conservatively assume it isn't safe to
3434 // perform a tail call.
3435 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
3436 return false;
3437 TCChain = Copy->getOperand(0);
3438 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
3439 return false;
3440
3441 bool HasRet = false;
3442 for (const SDNode *U : Copy->uses()) {
3443 if (U->getOpcode() != X86ISD::RET_GLUE)
3444 return false;
3445 // If we are returning more than one value, we can definitely
3446 // not make a tail call; see PR19530.
3447 if (U->getNumOperands() > 4)
3448 return false;
3449 if (U->getNumOperands() == 4 &&
3450 U->getOperand(U->getNumOperands() - 1).getValueType() != MVT::Glue)
3451 return false;
3452 HasRet = true;
3453 }
3454
3455 if (!HasRet)
3456 return false;
3457
3458 Chain = TCChain;
3459 return true;
3460}
3461
3462EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
3463 ISD::NodeType ExtendKind) const {
3464 MVT ReturnMVT = MVT::i32;
3465
3466 bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
3467 if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
3468 // The ABI does not require i1, i8 or i16 to be extended.
3469 //
3470 // On Darwin, there is code in the wild relying on Clang's old behaviour of
3471 // always extending i8/i16 return values, so keep doing that for now.
3472 // (PR26665).
3473 ReturnMVT = MVT::i8;
3474 }
3475
3476 EVT MinVT = getRegisterType(Context, ReturnMVT);
3477 return VT.bitsLT(MinVT) ? MinVT : VT;
3478}
3479
3480/// Reads two 32 bit registers and creates a 64 bit mask value.
3481/// \param VA The current 32 bit value that needs to be assigned.
3482/// \param NextVA The next 32 bit value that needs to be assigned.
3483/// \param Root The parent DAG node.
3484/// \param [in,out] InGlue Represents the SDValue in the parent DAG node used
3485/// for glue purposes. If the DAG is already using a
3486/// physical register instead of a virtual one, we should
3487/// glue our new SDValue to the InGlue SDValue.
3488/// \return a new SDValue of 64-bit size.
3489static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
3490 SDValue &Root, SelectionDAG &DAG,
3491 const SDLoc &Dl, const X86Subtarget &Subtarget,
3492 SDValue *InGlue = nullptr) {
3493 assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
3494 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
3495 assert(VA.getValVT() == MVT::v64i1 &&
3496 "Expecting first location of 64 bit width type");
3497 assert(NextVA.getValVT() == VA.getValVT() &&
3498 "The locations should have the same type");
3499 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
3500 "The values should reside in two registers");
3501
3502 SDValue Lo, Hi;
3503 SDValue ArgValueLo, ArgValueHi;
3504
3505 MachineFunction &MF = DAG.getMachineFunction();
3506 const TargetRegisterClass *RC = &X86::GR32RegClass;
3507
3508 // Read a 32 bit value from the registers.
3509 if (nullptr == InGlue) {
3510 // When no physical register is present,
3511 // create an intermediate virtual register.
3512 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
3513 ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
3514 Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
3515 ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
3516 } else {
3517 // When a physical register is available read the value from it and glue
3518 // the reads together.
3519 ArgValueLo =
3520 DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InGlue);
3521 *InGlue = ArgValueLo.getValue(2);
3522 ArgValueHi =
3523 DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InGlue);
3524 *InGlue = ArgValueHi.getValue(2);
3525 }
3526
3527 // Convert the i32 type into v32i1 type.
3528 Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);
3529
3530 // Convert the i32 type into v32i1 type.
3531 Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
3532
3533 // Concatenate the two values together.
3534 return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
3535}
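This is the receiving side of the split shown earlier: the two 32-bit register reads become v32i1 halves and are concatenated back into the original v64i1, with the first register supplying the low 32 lanes. A plain-integer counterpart to the splitting sketch:

#include <cassert>
#include <cstdint>

// Rebuild the 64-bit mask pattern from the two 32-bit register values,
// mirroring CONCAT_VECTORS(v32i1 Lo, v32i1 Hi) -> v64i1.
static uint64_t joinV64I1(uint32_t Lo, uint32_t Hi) {
  return static_cast<uint64_t>(Hi) << 32 | Lo;
}

int main() {
  assert(joinV64I1(0x00C0FFEEu, 0xDEADBEEFu) == 0xDEADBEEF00C0FFEEull);
}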
3536
3537/// The function will lower a register of various sizes (8/16/32/64)
3538/// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1)
3539/// \returns a DAG node containing the operand after lowering to mask type.
3540static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
3541 const EVT &ValLoc, const SDLoc &Dl,
3542 SelectionDAG &DAG) {
3543 SDValue ValReturned = ValArg;
3544
3545 if (ValVT == MVT::v1i1)
3546 return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned);
3547
3548 if (ValVT == MVT::v64i1) {
3549 // On a 32-bit machine, this case is handled by getv64i1Argument.
3550 assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
3551 // On a 64-bit machine there is no need to truncate the value, only bitcast it.
3552 } else {
3553 MVT maskLen;
3554 switch (ValVT.getSimpleVT().SimpleTy) {
3555 case MVT::v8i1:
3556 maskLen = MVT::i8;
3557 break;
3558 case MVT::v16i1:
3559 maskLen = MVT::i16;
3560 break;
3561 case MVT::v32i1:
3562 maskLen = MVT::i32;
3563 break;
3564 default:
3565 llvm_unreachable("Expecting a vector of i1 types")::llvm::llvm_unreachable_internal("Expecting a vector of i1 types"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 3565)
;
3566 }
3567
3568 ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
3569 }
3570 return DAG.getBitcast(ValVT, ValReturned);
3571}
3572
3573/// Lower the result values of a call into the
3574/// appropriate copies out of appropriate physical registers.
3575///
3576SDValue X86TargetLowering::LowerCallResult(
3577 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
3578 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
3579 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
3580 uint32_t *RegMask) const {
3581
3582 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
3583 // Assign locations to each value returned by this call.
3584 SmallVector<CCValAssign, 16> RVLocs;
3585 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3586 *DAG.getContext());
3587 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
3588
3589 // Copy all of the result registers out of their specified physreg.
3590 for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
3591 ++I, ++InsIndex) {
3592 CCValAssign &VA = RVLocs[I];
3593 EVT CopyVT = VA.getLocVT();
3594
3595 // In some calling conventions we need to remove the used registers
3596 // from the register mask.
3597 if (RegMask) {
3598 for (MCPhysReg SubReg : TRI->subregs_inclusive(VA.getLocReg()))
3599 RegMask[SubReg / 32] &= ~(1u << (SubReg % 32));
3600 }
3601
3602 // Report an error if there was an attempt to return FP values via XMM
3603 // registers.
3604 if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
3605 errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
3606 if (VA.getLocReg() == X86::XMM1)
3607 VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
3608 else
3609 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
3610 } else if (!Subtarget.hasSSE2() &&
3611 X86::FR64XRegClass.contains(VA.getLocReg()) &&
3612 CopyVT == MVT::f64) {
3613 errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
3614 if (VA.getLocReg() == X86::XMM1)
3615 VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
3616 else
3617 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
3618 }
3619
3620 // If we prefer to use the value in xmm registers, copy it out as f80 and
3621 // use a truncate to move it from fp stack reg to xmm reg.
3622 bool RoundAfterCopy = false;
3623 if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
3624 isScalarFPTypeInSSEReg(VA.getValVT())) {
3625 if (!Subtarget.hasX87())
3626 report_fatal_error("X87 register return with X87 disabled");
3627 CopyVT = MVT::f80;
3628 RoundAfterCopy = (CopyVT != VA.getLocVT());
3629 }
3630
3631 SDValue Val;
3632 if (VA.needsCustom()) {
3633 assert(VA.getValVT() == MVT::v64i1 &&
3634 "Currently the only custom case is when we split v64i1 to 2 regs");
3635 Val =
3636 getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InGlue);
3637 } else {
3638 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InGlue)
3639 .getValue(1);
3640 Val = Chain.getValue(0);
3641 InGlue = Chain.getValue(2);
3642 }
3643
3644 if (RoundAfterCopy)
3645 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
3646 // This truncation won't change the value.
3647 DAG.getIntPtrConstant(1, dl, /*isTarget=*/true));
3648
3649 if (VA.isExtInLoc()) {
3650 if (VA.getValVT().isVector() &&
3651 VA.getValVT().getScalarType() == MVT::i1 &&
3652 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
3653 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
3654 // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
3655 Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
3656 } else
3657 Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
3658 }
3659
3660 if (VA.getLocInfo() == CCValAssign::BCvt)
3661 Val = DAG.getBitcast(VA.getValVT(), Val);
3662
3663 InVals.push_back(Val);
3664 }
3665
3666 return Chain;
3667}
3668
3669//===----------------------------------------------------------------------===//
3670// C & StdCall & Fast Calling Convention implementation
3671//===----------------------------------------------------------------------===//
3672 // The StdCall calling convention is the standard for many Windows API
3673 // routines. It differs from the C calling convention only slightly: the
3674 // callee, not the caller, cleans up the stack, and symbols are decorated
3675 // (name-mangled). It doesn't support any vector arguments.
3676// For info on fast calling convention see Fast Calling Convention (tail call)
3677// implementation LowerX86_32FastCCCallTo.
3678
3679 /// Determines whether Args, either the set of outgoing arguments to a call or
3680 /// the set of incoming arguments of a call, contains an sret pointer that the
3681 /// callee pops.
3682template <typename T>
3683static bool hasCalleePopSRet(const SmallVectorImpl<T> &Args,
3684 const X86Subtarget &Subtarget) {
3685 // Not C++20 (yet), so no concepts available.
3686 static_assert(std::is_same_v<T, ISD::OutputArg> ||
3687 std::is_same_v<T, ISD::InputArg>,
3688 "requires ISD::OutputArg or ISD::InputArg");
3689
3690 // Only 32-bit pops the sret. It's a 64-bit world these days, so early-out
3691 // for most compilations.
3692 if (!Subtarget.is32Bit())
3693 return false;
3694
3695 if (Args.empty())
3696 return false;
3697
3698 // Most calls do not have an sret argument; check the first arg next.
3699 const ISD::ArgFlagsTy &Flags = Args[0].Flags;
3700 if (!Flags.isSRet() || Flags.isInReg())
3701 return false;
3702
3703 // The MSVC ABI does not pop the sret.
3704 if (Subtarget.getTargetTriple().isOSMSVCRT())
3705 return false;
3706
3707 // MCUs don't pop the sret
3708 if (Subtarget.isTargetMCU())
3709 return false;
3710
3711 // Callee pops argument
3712 return true;
3713}
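[Editor's note: illustrative sketch, not part of X86ISelLowering.cpp; the callee name below is made up.] The situation hasCalleePopSRet() detects looks roughly like this on i386 System V (non-MSVC, non-MCU):

    struct Big { int v[8]; };
    Big makeBig();   // lowered as: void makeBig(Big *sret), sret passed on the stack

    // The callee returns with `ret $4`, popping the hidden sret pointer itself,
    // so the caller must not pop those 4 bytes again. LowerFormalArguments below
    // mirrors the callee side with FuncInfo->setBytesToPopOnReturn(4).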
3714
3715/// Make a copy of an aggregate at address specified by "Src" to address
3716/// "Dst" with size and alignment information specified by the specific
3717/// parameter attribute. The copy will be passed as a byval function parameter.
3718static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
3719 SDValue Chain, ISD::ArgFlagsTy Flags,
3720 SelectionDAG &DAG, const SDLoc &dl) {
3721 SDValue SizeNode = DAG.getIntPtrConstant(Flags.getByValSize(), dl);
3722
3723 return DAG.getMemcpy(
3724 Chain, dl, Dst, Src, SizeNode, Flags.getNonZeroByValAlign(),
3725 /*isVolatile*/ false, /*AlwaysInline=*/true,
3726 /*isTailCall*/ false, MachinePointerInfo(), MachinePointerInfo());
3727}
3728
3729/// Return true if the calling convention is one that we can guarantee TCO for.
3730static bool canGuaranteeTCO(CallingConv::ID CC) {
3731 return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
3732 CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
3733 CC == CallingConv::Tail || CC == CallingConv::SwiftTail);
3734}
3735
3736/// Return true if we might ever do TCO for calls with this calling convention.
3737static bool mayTailCallThisCC(CallingConv::ID CC) {
3738 switch (CC) {
3739 // C calling conventions:
3740 case CallingConv::C:
3741 case CallingConv::Win64:
3742 case CallingConv::X86_64_SysV:
3743 // Callee pop conventions:
3744 case CallingConv::X86_ThisCall:
3745 case CallingConv::X86_StdCall:
3746 case CallingConv::X86_VectorCall:
3747 case CallingConv::X86_FastCall:
3748 // Swift:
3749 case CallingConv::Swift:
3750 return true;
3751 default:
3752 return canGuaranteeTCO(CC);
3753 }
3754}
3755
3756/// Return true if the function is being made into a tailcall target by
3757/// changing its ABI.
3758static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
3759 return (GuaranteedTailCallOpt && canGuaranteeTCO(CC)) ||
3760 CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
3761}
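[Editor's note: illustrative summary, not part of X86ISelLowering.cpp.] How the three predicates above combine, assuming nothing else blocks the tail call:

    // canGuaranteeTCO:    fastcc, GHC, RegCall, HiPE, tailcc, swifttailcc.
    // shouldGuaranteeTCO: tailcc and swifttailcc always; the other conventions in
    //                     canGuaranteeTCO only under -tailcallopt
    //                     (GuaranteedTailCallOpt), in which case the ABI is
    //                     changed so the tail call is guaranteed.
    // mayTailCallThisCC:  additionally allows ccc, Win64, X86-64 SysV, Swift and
    //                     the x86-32 callee-pop conventions, but only as
    //                     opportunistic sibcalls with no ABI change.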
3762
3763bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
3764 if (!CI->isTailCall())
3765 return false;
3766
3767 CallingConv::ID CalleeCC = CI->getCallingConv();
3768 if (!mayTailCallThisCC(CalleeCC))
3769 return false;
3770
3771 return true;
3772}
3773
3774SDValue
3775X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
3776 const SmallVectorImpl<ISD::InputArg> &Ins,
3777 const SDLoc &dl, SelectionDAG &DAG,
3778 const CCValAssign &VA,
3779 MachineFrameInfo &MFI, unsigned i) const {
3780 // Create the nodes corresponding to a load from this parameter slot.
3781 ISD::ArgFlagsTy Flags = Ins[i].Flags;
3782 bool AlwaysUseMutable = shouldGuaranteeTCO(
3783 CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
3784 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
3785 EVT ValVT;
3786 MVT PtrVT = getPointerTy(DAG.getDataLayout());
3787
3788 // If the value is passed by pointer, the address is passed instead of the
3789 // value itself. No need to extend if the mask value and location share the
3790 // same absolute size.
3791 bool ExtendedInMem =
3792 VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
3793 VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();
3794
3795 if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
3796 ValVT = VA.getLocVT();
3797 else
3798 ValVT = VA.getValVT();
3799
3800 // FIXME: For now, all byval parameter objects are marked mutable. This can be
3801 // changed with more analysis.
3802 // In case of tail call optimization, mark all arguments mutable, since they
3803 // could be overwritten by the lowering of arguments in case of a tail call.
3804 if (Flags.isByVal()) {
3805 unsigned Bytes = Flags.getByValSize();
3806 if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
3807
3808 // FIXME: For now, all byval parameter objects are marked as aliasing. This
3809 // can be improved with deeper analysis.
3810 int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable,
3811 /*isAliased=*/true);
3812 return DAG.getFrameIndex(FI, PtrVT);
3813 }
3814
3815 EVT ArgVT = Ins[i].ArgVT;
3816
3817 // If this is a vector that has been split into multiple parts, and the
3818 // scalar size of the parts doesn't match the vector element size, then we
3819 // can't elide the copy. The parts will have padding between them instead of
3820 // being packed like a vector.
3821 bool ScalarizedAndExtendedVector =
3822 ArgVT.isVector() && !VA.getLocVT().isVector() &&
3823 VA.getLocVT().getSizeInBits() != ArgVT.getScalarSizeInBits();
3824
3825 // This is an argument in memory. We might be able to perform copy elision.
3826 // If the argument is passed directly in memory without any extension, then we
3827 // can perform copy elision. Large vector types, for example, may be passed
3828 // indirectly by pointer.
3829 if (Flags.isCopyElisionCandidate() &&
3830 VA.getLocInfo() != CCValAssign::Indirect && !ExtendedInMem &&
3831 !ScalarizedAndExtendedVector) {
3832 SDValue PartAddr;
3833 if (Ins[i].PartOffset == 0) {
3834 // If this is a one-part value or the first part of a multi-part value,
3835 // create a stack object for the entire argument value type and return a
3836 // load from our portion of it. This assumes that if the first part of an
3837 // argument is in memory, the rest will also be in memory.
3838 int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
3839 /*IsImmutable=*/false);
3840 PartAddr = DAG.getFrameIndex(FI, PtrVT);
3841 return DAG.getLoad(
3842 ValVT, dl, Chain, PartAddr,
3843 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
3844 } else {
3845 // This is not the first piece of an argument in memory. See if there is
3846 // already a fixed stack object including this offset. If so, assume it
3847 // was created by the PartOffset == 0 branch above and create a load from
3848 // the appropriate offset into it.
3849 int64_t PartBegin = VA.getLocMemOffset();
3850 int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
3851 int FI = MFI.getObjectIndexBegin();
3852 for (; MFI.isFixedObjectIndex(FI); ++FI) {
3853 int64_t ObjBegin = MFI.getObjectOffset(FI);
3854 int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
3855 if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
3856 break;
3857 }
3858 if (MFI.isFixedObjectIndex(FI)) {
3859 SDValue Addr =
3860 DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
3861 DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
3862 return DAG.getLoad(
3863 ValVT, dl, Chain, Addr,
3864 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
3865 Ins[i].PartOffset));
3866 }
3867 }
3868 }
3869
3870 int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
3871 VA.getLocMemOffset(), isImmutable);
3872
3873 // Set SExt or ZExt flag.
3874 if (VA.getLocInfo() == CCValAssign::ZExt) {
3875 MFI.setObjectZExt(FI, true);
3876 } else if (VA.getLocInfo() == CCValAssign::SExt) {
3877 MFI.setObjectSExt(FI, true);
3878 }
3879
3880 MaybeAlign Alignment;
3881 if (Subtarget.isTargetWindowsMSVC() && !Subtarget.is64Bit() &&
3882 ValVT != MVT::f80)
3883 Alignment = MaybeAlign(4);
3884 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
3885 SDValue Val = DAG.getLoad(
3886 ValVT, dl, Chain, FIN,
3887 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
3888 Alignment);
3889 return ExtendedInMem
3890 ? (VA.getValVT().isVector()
3891 ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
3892 : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
3893 : Val;
3894}
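[Editor's note: illustrative walk-through, not part of X86ISelLowering.cpp; the argument shape is hypothetical.] For the copy-elision path above, consider an argument lowered as three i32 parts at PartOffset 0, 4 and 8 within a 12-byte ArgVT:

    // Part at PartOffset == 0: CreateFixedObject(12, LocMemOffset) covers the
    //   whole argument, and the load reads our 4-byte portion of it.
    // Parts at PartOffset 4 and 8: the loop over fixed objects finds that same
    //   12-byte object, because [PartBegin, PartEnd) falls inside
    //   [ObjBegin, ObjEnd), and the load is emitted at FrameIndex + PartOffset
    //   instead of creating a new, overlapping stack object.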
3895
3896// FIXME: Get this from tablegen.
3897static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
3898 const X86Subtarget &Subtarget) {
3899 assert(Subtarget.is64Bit());
3900
3901 if (Subtarget.isCallingConvWin64(CallConv)) {
3902 static const MCPhysReg GPR64ArgRegsWin64[] = {
3903 X86::RCX, X86::RDX, X86::R8, X86::R9
3904 };
3905 return ArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
3906 }
3907
3908 static const MCPhysReg GPR64ArgRegs64Bit[] = {
3909 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
3910 };
3911 return ArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
3912}
3913
3914// FIXME: Get this from tablegen.
3915static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
3916 CallingConv::ID CallConv,
3917 const X86Subtarget &Subtarget) {
3918 assert(Subtarget.is64Bit());
3919 if (Subtarget.isCallingConvWin64(CallConv)) {
3920 // The XMM registers which might contain vararg parameters are shadowed
3921 // by their paired GPRs, so we only need to save the GPRs to their home
3922 // slots.
3923 // TODO: __vectorcall will change this.
3924 return std::nullopt;
3925 }
3926
3927 bool isSoftFloat = Subtarget.useSoftFloat();
3928 if (isSoftFloat || !Subtarget.hasSSE1())
3929 // Kernel mode asks for SSE to be disabled, so there are no XMM argument
3930 // registers.
3931 return std::nullopt;
3932
3933 static const MCPhysReg XMMArgRegs64Bit[] = {
3934 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3935 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3936 };
3937 return ArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
3938}
3939
3940#ifndef NDEBUG
3941static bool isSortedByValueNo(ArrayRef<CCValAssign> ArgLocs) {
3942 return llvm::is_sorted(
3943 ArgLocs, [](const CCValAssign &A, const CCValAssign &B) -> bool {
3944 return A.getValNo() < B.getValNo();
3945 });
3946}
3947#endif
3948
3949namespace {
3950 /// This is a helper class for lowering variable argument parameters.
3951class VarArgsLoweringHelper {
3952public:
3953 VarArgsLoweringHelper(X86MachineFunctionInfo *FuncInfo, const SDLoc &Loc,
3954 SelectionDAG &DAG, const X86Subtarget &Subtarget,
3955 CallingConv::ID CallConv, CCState &CCInfo)
3956 : FuncInfo(FuncInfo), DL(Loc), DAG(DAG), Subtarget(Subtarget),
3957 TheMachineFunction(DAG.getMachineFunction()),
3958 TheFunction(TheMachineFunction.getFunction()),
3959 FrameInfo(TheMachineFunction.getFrameInfo()),
3960 FrameLowering(*Subtarget.getFrameLowering()),
3961 TargLowering(DAG.getTargetLoweringInfo()), CallConv(CallConv),
3962 CCInfo(CCInfo) {}
3963
3964 // Lower variable argument parameters.
3965 void lowerVarArgsParameters(SDValue &Chain, unsigned StackSize);
3966
3967private:
3968 void createVarArgAreaAndStoreRegisters(SDValue &Chain, unsigned StackSize);
3969
3970 void forwardMustTailParameters(SDValue &Chain);
3971
3972 bool is64Bit() const { return Subtarget.is64Bit(); }
3973 bool isWin64() const { return Subtarget.isCallingConvWin64(CallConv); }
3974
3975 X86MachineFunctionInfo *FuncInfo;
3976 const SDLoc &DL;
3977 SelectionDAG &DAG;
3978 const X86Subtarget &Subtarget;
3979 MachineFunction &TheMachineFunction;
3980 const Function &TheFunction;
3981 MachineFrameInfo &FrameInfo;
3982 const TargetFrameLowering &FrameLowering;
3983 const TargetLowering &TargLowering;
3984 CallingConv::ID CallConv;
3985 CCState &CCInfo;
3986};
3987} // namespace
3988
3989void VarArgsLoweringHelper::createVarArgAreaAndStoreRegisters(
3990 SDValue &Chain, unsigned StackSize) {
3991 // If the function takes a variable number of arguments, make a frame index
3992 // for the start of the first vararg value, for expansion of llvm.va_start.
3993 // We can skip this if there are no va_start calls.
3994 if (is64Bit() || (CallConv != CallingConv::X86_FastCall &&
3995 CallConv != CallingConv::X86_ThisCall)) {
3996 FuncInfo->setVarArgsFrameIndex(
3997 FrameInfo.CreateFixedObject(1, StackSize, true));
3998 }
3999
4000 // 64-bit calling conventions support varargs and register parameters, so we
4001 // have to do extra work to spill them in the prologue.
4002 if (is64Bit()) {
4003 // Find the first unallocated argument registers.
4004 ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
4005 ArrayRef<MCPhysReg> ArgXMMs =
4006 get64BitArgumentXMMs(TheMachineFunction, CallConv, Subtarget);
4007 unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
4008 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
4009
4010 assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
4011 "SSE register cannot be used when SSE is disabled!");
4012
4013 if (isWin64()) {
4014 // Get to the caller-allocated home save location. Add 8 to account
4015 // for the return address.
4016 int HomeOffset = FrameLowering.getOffsetOfLocalArea() + 8;
4017 FuncInfo->setRegSaveFrameIndex(
4018 FrameInfo.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
4019 // Fixup to set vararg frame on shadow area (4 x i64).
4020 if (NumIntRegs < 4)
4021 FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
4022 } else {
4023 // For X86-64, if there are vararg parameters that are passed via
4024 // registers, then we must store them to their spots on the stack so
4025 // they may be loaded by dereferencing the result of va_next.
4026 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
4027 FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
4028 FuncInfo->setRegSaveFrameIndex(FrameInfo.CreateStackObject(
4029 ArgGPRs.size() * 8 + ArgXMMs.size() * 16, Align(16), false));
4030 }
4031
4032 SmallVector<SDValue, 6>
4033 LiveGPRs; // list of SDValue for GPR registers keeping live input value
4034 SmallVector<SDValue, 8> LiveXMMRegs; // list of SDValue for XMM registers
4035 // keeping live input value
4036 SDValue ALVal; // if applicable keeps SDValue for %al register
4037
4038 // Gather all the live in physical registers.
4039 for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
4040 Register GPR = TheMachineFunction.addLiveIn(Reg, &X86::GR64RegClass);
4041 LiveGPRs.push_back(DAG.getCopyFromReg(Chain, DL, GPR, MVT::i64));
4042 }
4043 const auto &AvailableXmms = ArgXMMs.slice(NumXMMRegs);
4044 if (!AvailableXmms.empty()) {
4045 Register AL = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
4046 ALVal = DAG.getCopyFromReg(Chain, DL, AL, MVT::i8);
4047 for (MCPhysReg Reg : AvailableXmms) {
4048 // FastRegisterAllocator spills virtual registers at basic
4049 // block boundaries. That leads to uses of XMM registers
4050 // outside of the check for %al. Pass physical registers to
4051 // VASTART_SAVE_XMM_REGS to avoid unnecessary spilling.
4052 TheMachineFunction.getRegInfo().addLiveIn(Reg);
4053 LiveXMMRegs.push_back(DAG.getRegister(Reg, MVT::v4f32));
4054 }
4055 }
4056
4057 // Store the integer parameter registers.
4058 SmallVector<SDValue, 8> MemOps;
4059 SDValue RSFIN =
4060 DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
4061 TargLowering.getPointerTy(DAG.getDataLayout()));
4062 unsigned Offset = FuncInfo->getVarArgsGPOffset();
4063 for (SDValue Val : LiveGPRs) {
4064 SDValue FIN = DAG.getNode(ISD::ADD, DL,
4065 TargLowering.getPointerTy(DAG.getDataLayout()),
4066 RSFIN, DAG.getIntPtrConstant(Offset, DL));
4067 SDValue Store =
4068 DAG.getStore(Val.getValue(1), DL, Val, FIN,
4069 MachinePointerInfo::getFixedStack(
4070 DAG.getMachineFunction(),
4071 FuncInfo->getRegSaveFrameIndex(), Offset));
4072 MemOps.push_back(Store);
4073 Offset += 8;
4074 }
4075
4076 // Now store the XMM (fp + vector) parameter registers.
4077 if (!LiveXMMRegs.empty()) {
4078 SmallVector<SDValue, 12> SaveXMMOps;
4079 SaveXMMOps.push_back(Chain);
4080 SaveXMMOps.push_back(ALVal);
4081 SaveXMMOps.push_back(RSFIN);
4082 SaveXMMOps.push_back(
4083 DAG.getTargetConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32));
4084 llvm::append_range(SaveXMMOps, LiveXMMRegs);
4085 MachineMemOperand *StoreMMO =
4086 DAG.getMachineFunction().getMachineMemOperand(
4087 MachinePointerInfo::getFixedStack(
4088 DAG.getMachineFunction(), FuncInfo->getRegSaveFrameIndex(),
4089 Offset),
4090 MachineMemOperand::MOStore, 128, Align(16));
4091 MemOps.push_back(DAG.getMemIntrinsicNode(X86ISD::VASTART_SAVE_XMM_REGS,
4092 DL, DAG.getVTList(MVT::Other),
4093 SaveXMMOps, MVT::i8, StoreMMO));
4094 }
4095
4096 if (!MemOps.empty())
4097 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
4098 }
4099}
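[Editor's note: illustrative layout, not part of X86ISelLowering.cpp.] On non-Windows x86-64 the register save area created above has the shape required by the SysV ABI. For example, if one GPR and one XMM register were already consumed by named parameters (NumIntRegs == 1, NumXMMRegs == 1):

    // Register save area (176 bytes, 16-byte aligned):
    //   bytes   0..47  : RDI, RSI, RDX, RCX, R8, R9   (6 x 8 bytes)
    //   bytes  48..175 : XMM0..XMM7                   (8 x 16 bytes)
    // VarArgsGPOffset = 1 * 8       = 8   (first GPR slot va_arg has not read)
    // VarArgsFPOffset = 48 + 1 * 16 = 64  (first XMM slot va_arg has not read)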
4100
4101void VarArgsLoweringHelper::forwardMustTailParameters(SDValue &Chain) {
4102 // Find the largest legal vector type.
4103 MVT VecVT = MVT::Other;
4104 // FIXME: Only some x86_32 calling conventions support AVX512.
4105 if (Subtarget.useAVX512Regs() &&
4106 (is64Bit() || (CallConv == CallingConv::X86_VectorCall ||
4107 CallConv == CallingConv::Intel_OCL_BI)))
4108 VecVT = MVT::v16f32;
4109 else if (Subtarget.hasAVX())
4110 VecVT = MVT::v8f32;
4111 else if (Subtarget.hasSSE2())
4112 VecVT = MVT::v4f32;
4113
4114 // We forward some GPRs and some vector types.
4115 SmallVector<MVT, 2> RegParmTypes;
4116 MVT IntVT = is64Bit() ? MVT::i64 : MVT::i32;
4117 RegParmTypes.push_back(IntVT);
4118 if (VecVT != MVT::Other)
4119 RegParmTypes.push_back(VecVT);
4120
4121 // Compute the set of forwarded registers. The rest are scratch.
4122 SmallVectorImpl<ForwardedRegister> &Forwards =
4123 FuncInfo->getForwardedMustTailRegParms();
4124 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
4125
4126 // Forward AL for SysV x86_64 targets, since it is used for varargs.
4127 if (is64Bit() && !isWin64() && !CCInfo.isAllocated(X86::AL)) {
4128 Register ALVReg = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
4129 Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
4130 }
4131
4132 // Copy all forwards from physical to virtual registers.
4133 for (ForwardedRegister &FR : Forwards) {
4134 // FIXME: Can we use a less constrained schedule?
4135 SDValue RegVal = DAG.getCopyFromReg(Chain, DL, FR.VReg, FR.VT);
4136 FR.VReg = TheMachineFunction.getRegInfo().createVirtualRegister(
4137 TargLowering.getRegClassFor(FR.VT));
4138 Chain = DAG.getCopyToReg(Chain, DL, FR.VReg, RegVal);
4139 }
4140}
4141
4142void VarArgsLoweringHelper::lowerVarArgsParameters(SDValue &Chain,
4143 unsigned StackSize) {
4144 // Set FrameIndex to the 0xAAAAAAA value to mark the unset state.
4145 // If necessary, it will be set to the correct value later.
4146 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
4147 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
4148
4149 if (FrameInfo.hasVAStart())
4150 createVarArgAreaAndStoreRegisters(Chain, StackSize);
4151
4152 if (FrameInfo.hasMustTailInVarArgFunc())
4153 forwardMustTailParameters(Chain);
4154}
4155
4156SDValue X86TargetLowering::LowerFormalArguments(
4157 SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
4158 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
4159 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
4160 MachineFunction &MF = DAG.getMachineFunction();
4161 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
4162
4163 const Function &F = MF.getFunction();
4164 if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() &&
4165 F.getName() == "main")
4166 FuncInfo->setForceFramePointer(true);
4167
4168 MachineFrameInfo &MFI = MF.getFrameInfo();
4169 bool Is64Bit = Subtarget.is64Bit();
4170 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
4171
4172 assert(
4173 !(IsVarArg && canGuaranteeTCO(CallConv)) &&
4174 "Var args not supported with calling conv' regcall, fastcc, ghc or hipe");
4175
4176 // Assign locations to all of the incoming arguments.
4177 SmallVector<CCValAssign, 16> ArgLocs;
4178 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
4179
4180 // Allocate shadow area for Win64.
4181 if (IsWin64)
4182 CCInfo.AllocateStack(32, Align(8));
4183
4184 CCInfo.AnalyzeArguments(Ins, CC_X86);
4185
4186 // In vectorcall calling convention a second pass is required for the HVA
4187 // types.
4188 if (CallingConv::X86_VectorCall == CallConv) {
4189 CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
4190 }
4191
4192 // The next loop assumes that the locations are in the same order as the
4193 // input arguments.
4194 assert(isSortedByValueNo(ArgLocs) &&
4195 "Argument Location list must be sorted before lowering");
4196
4197 SDValue ArgValue;
4198 for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
4199 ++I, ++InsIndex) {
4200 assert(InsIndex < Ins.size() && "Invalid Ins index");
4201 CCValAssign &VA = ArgLocs[I];
4202
4203 if (VA.isRegLoc()) {
4204 EVT RegVT = VA.getLocVT();
4205 if (VA.needsCustom()) {
4206 assert(
4207 VA.getValVT() == MVT::v64i1 &&
4208 "Currently the only custom case is when we split v64i1 to 2 regs");
4209
4210 // In the regcall calling convention, v64i1 values compiled
4211 // for a 32-bit arch are split up into two registers.
4212 ArgValue =
4213 getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
4214 } else {
4215 const TargetRegisterClass *RC;
4216 if (RegVT == MVT::i8)
4217 RC = &X86::GR8RegClass;
4218 else if (RegVT == MVT::i16)
4219 RC = &X86::GR16RegClass;
4220 else if (RegVT == MVT::i32)
4221 RC = &X86::GR32RegClass;
4222 else if (Is64Bit && RegVT == MVT::i64)
4223 RC = &X86::GR64RegClass;
4224 else if (RegVT == MVT::f16)
4225 RC = Subtarget.hasAVX512() ? &X86::FR16XRegClass : &X86::FR16RegClass;
4226 else if (RegVT == MVT::f32)
4227 RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
4228 else if (RegVT == MVT::f64)
4229 RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
4230 else if (RegVT == MVT::f80)
4231 RC = &X86::RFP80RegClass;
4232 else if (RegVT == MVT::f128)
4233 RC = &X86::VR128RegClass;
4234 else if (RegVT.is512BitVector())
4235 RC = &X86::VR512RegClass;
4236 else if (RegVT.is256BitVector())
4237 RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
4238 else if (RegVT.is128BitVector())
4239 RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
4240 else if (RegVT == MVT::x86mmx)
4241 RC = &X86::VR64RegClass;
4242 else if (RegVT == MVT::v1i1)
4243 RC = &X86::VK1RegClass;
4244 else if (RegVT == MVT::v8i1)
4245 RC = &X86::VK8RegClass;
4246 else if (RegVT == MVT::v16i1)
4247 RC = &X86::VK16RegClass;
4248 else if (RegVT == MVT::v32i1)
4249 RC = &X86::VK32RegClass;
4250 else if (RegVT == MVT::v64i1)
4251 RC = &X86::VK64RegClass;
4252 else
4253 llvm_unreachable("Unknown argument type!");
4254
4255 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
4256 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
4257 }
4258
4259 // If this is an 8 or 16-bit value, it is really passed promoted to 32
4260 // bits. Insert an assert[sz]ext to capture this, then truncate to the
4261 // right size.
4262 if (VA.getLocInfo() == CCValAssign::SExt)
4263 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
4264 DAG.getValueType(VA.getValVT()));
4265 else if (VA.getLocInfo() == CCValAssign::ZExt)
4266 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
4267 DAG.getValueType(VA.getValVT()));
4268 else if (VA.getLocInfo() == CCValAssign::BCvt)
4269 ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
4270
4271 if (VA.isExtInLoc()) {
4272 // Handle MMX values passed in XMM regs.
4273 if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
4274 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
4275 else if (VA.getValVT().isVector() &&
4276 VA.getValVT().getScalarType() == MVT::i1 &&
4277 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
4278 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
4279 // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
4280 ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
4281 } else
4282 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
4283 }
4284 } else {
4285 assert(VA.isMemLoc());
4286 ArgValue =
4287 LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
4288 }
4289
4290 // If value is passed via pointer - do a load.
4291 if (VA.getLocInfo() == CCValAssign::Indirect && !Ins[I].Flags.isByVal())
4292 ArgValue =
4293 DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
4294
4295 InVals.push_back(ArgValue);
4296 }
4297
4298 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
4299 if (Ins[I].Flags.isSwiftAsync()) {
4300 auto X86FI = MF.getInfo<X86MachineFunctionInfo>();
4301 if (Subtarget.is64Bit())
4302 X86FI->setHasSwiftAsyncContext(true);
4303 else {
4304 int FI = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
4305 X86FI->setSwiftAsyncContextFrameIdx(FI);
4306 SDValue St = DAG.getStore(DAG.getEntryNode(), dl, InVals[I],
4307 DAG.getFrameIndex(FI, MVT::i32),
4308 MachinePointerInfo::getFixedStack(MF, FI));
4309 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, St, Chain);
4310 }
4311 }
4312
4313 // Swift calling convention does not require we copy the sret argument
4314 // into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
4315 if (CallConv == CallingConv::Swift || CallConv == CallingConv::SwiftTail)
4316 continue;
4317
4318 // All x86 ABIs require that for returning structs by value we copy the
4319 // sret argument into %rax/%eax (depending on ABI) for the return. Save
4320 // the argument into a virtual register so that we can access it from the
4321 // return points.
4322 if (Ins[I].Flags.isSRet()) {
4323 assert(!FuncInfo->getSRetReturnReg() &&
4324 "SRet return has already been set");
4325 MVT PtrTy = getPointerTy(DAG.getDataLayout());
4326 Register Reg =
4327 MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
4328 FuncInfo->setSRetReturnReg(Reg);
4329 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
4330 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
4331 break;
4332 }
4333 }
4334
4335 unsigned StackSize = CCInfo.getNextStackOffset();
4336 // Align stack specially for tail calls.
4337 if (shouldGuaranteeTCO(CallConv,
4338 MF.getTarget().Options.GuaranteedTailCallOpt))
4339 StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
4340
4341 if (IsVarArg)
4342 VarArgsLoweringHelper(FuncInfo, dl, DAG, Subtarget, CallConv, CCInfo)
4343 .lowerVarArgsParameters(Chain, StackSize);
4344
4345 // Some CCs need callee pop.
4346 if (X86::isCalleePop(CallConv, Is64Bit, IsVarArg,
4347 MF.getTarget().Options.GuaranteedTailCallOpt)) {
4348 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
4349 } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
4350 // X86 interrupts must pop the error code (and the alignment padding) if
4351 // present.
4352 FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
4353 } else {
4354 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
4355 // If this is an sret function, the return should pop the hidden pointer.
4356 if (!canGuaranteeTCO(CallConv) && hasCalleePopSRet(Ins, Subtarget))
4357 FuncInfo->setBytesToPopOnReturn(4);
4358 }
4359
4360 if (!Is64Bit) {
4361 // RegSaveFrameIndex is X86-64 only.
4362 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
4363 }
4364
4365 FuncInfo->setArgumentStackSize(StackSize);
4366
4367 if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
4368 EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn());
4369 if (Personality == EHPersonality::CoreCLR) {
4370 assert(Is64Bit);
4371 // TODO: Add a mechanism to frame lowering that will allow us to indicate
4372 // that we'd prefer this slot be allocated towards the bottom of the frame
4373 // (i.e. near the stack pointer after allocating the frame). Every
4374 // funclet needs a copy of this slot in its (mostly empty) frame, and the
4375 // offset from the bottom of this and each funclet's frame must be the
4376 // same, so the size of funclets' (mostly empty) frames is dictated by
4377 // how far this slot is from the bottom (since they allocate just enough
4378 // space to accommodate holding this slot at the correct offset).
4379 int PSPSymFI = MFI.CreateStackObject(8, Align(8), /*isSpillSlot=*/false);
4380 EHInfo->PSPSymFrameIdx = PSPSymFI;
4381 }
4382 }
4383
4384 if (shouldDisableArgRegFromCSR(CallConv) ||
4385 F.hasFnAttribute("no_caller_saved_registers")) {
4386 MachineRegisterInfo &MRI = MF.getRegInfo();
4387 for (std::pair<Register, Register> Pair : MRI.liveins())
4388 MRI.disableCalleeSavedRegister(Pair.first);
4389 }
4390
4391 return Chain;
4392}
4393
4394SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
4395 SDValue Arg, const SDLoc &dl,
4396 SelectionDAG &DAG,
4397 const CCValAssign &VA,
4398 ISD::ArgFlagsTy Flags,
4399 bool isByVal) const {
4400 unsigned LocMemOffset = VA.getLocMemOffset();
4401 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
4402 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
4403 StackPtr, PtrOff);
4404 if (isByVal)
4405 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
4406
4407 MaybeAlign Alignment;
4408 if (Subtarget.isTargetWindowsMSVC() && !Subtarget.is64Bit() &&
4409 Arg.getSimpleValueType() != MVT::f80)
4410 Alignment = MaybeAlign(4);
4411 return DAG.getStore(
4412 Chain, dl, Arg, PtrOff,
4413 MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset),
4414 Alignment);
4415}
4416
4417 /// Emit a load of the return address if tail call
4418 /// optimization is performed and it is required.
4419SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
4420 SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
4421 bool Is64Bit, int FPDiff, const SDLoc &dl) const {
4422 // Adjust the Return address stack slot.
4423 EVT VT = getPointerTy(DAG.getDataLayout());
4424 OutRetAddr = getReturnAddressFrameIndex(DAG);
4425
4426 // Load the "old" Return address.
4427 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
4428 return SDValue(OutRetAddr.getNode(), 1);
4429}
4430
4431/// Emit a store of the return address if tail call
4432/// optimization is performed and it is required (FPDiff!=0).
4433static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
4434 SDValue Chain, SDValue RetAddrFrIdx,
4435 EVT PtrVT, unsigned SlotSize,
4436 int FPDiff, const SDLoc &dl) {
4437 // Store the return address to the appropriate stack slot.
4438 if (!FPDiff) return Chain;
4439 // Calculate the new stack slot for the return address.
4440 int NewReturnAddrFI =
4441 MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
4442 false);
4443 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
4444 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
4445 MachinePointerInfo::getFixedStack(
4446 DAG.getMachineFunction(), NewReturnAddrFI));
4447 return Chain;
4448}
4449
4450 /// Returns a vector_shuffle mask for a movs{s|d} or movd
4451 /// operation of the specified width.
4452static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
4453 SDValue V2) {
4454 unsigned NumElems = VT.getVectorNumElements();
4455 SmallVector<int, 8> Mask;
4456 Mask.push_back(NumElems);
4457 for (unsigned i = 1; i != NumElems; ++i)
4458 Mask.push_back(i);
4459 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
4460}
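[Editor's note: illustrative sketch, not part of X86ISelLowering.cpp; movlMask is a made-up name.] For a 4-element type the mask built above is {4, 1, 2, 3}: shuffle indices 0..NumElems-1 select from V1 and NumElems..2*NumElems-1 select from V2, so lane 0 comes from V2 and lanes 1..3 stay from V1, which is the MOVSS/MOVSD blend pattern. A standalone sketch of the same mask construction:

    #include <vector>
    static std::vector<int> movlMask(unsigned NumElems) {
      std::vector<int> Mask;
      Mask.push_back(NumElems);           // lane 0 <- V2[0]
      for (unsigned i = 1; i != NumElems; ++i)
        Mask.push_back(i);                // lane i <- V1[i]
      return Mask;                        // NumElems == 4  =>  {4, 1, 2, 3}
    }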
4461
4462SDValue
4463X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
4464 SmallVectorImpl<SDValue> &InVals) const {
4465 SelectionDAG &DAG = CLI.DAG;
4466 SDLoc &dl = CLI.DL;
4467 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
4468 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
4469 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
4470 SDValue Chain = CLI.Chain;
4471 SDValue Callee = CLI.Callee;
4472 CallingConv::ID CallConv = CLI.CallConv;
4473 bool &isTailCall = CLI.IsTailCall;
4474 bool isVarArg = CLI.IsVarArg;
4475 const auto *CB = CLI.CB;
4476
4477 MachineFunction &MF = DAG.getMachineFunction();
4478 bool Is64Bit = Subtarget.is64Bit();
4479 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
4480 bool IsSibcall = false;
4481 bool IsGuaranteeTCO = MF.getTarget().Options.GuaranteedTailCallOpt ||
4482 CallConv == CallingConv::Tail || CallConv == CallingConv::SwiftTail;
4483 bool IsCalleePopSRet = !IsGuaranteeTCO && hasCalleePopSRet(Outs, Subtarget);
4484 X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
4485 bool HasNCSR = (CB && isa<CallInst>(CB) &&
4486 CB->hasFnAttr("no_caller_saved_registers"));
4487 bool HasNoCfCheck = (CB && CB->doesNoCfCheck());
4488 bool IsIndirectCall = (CB && isa<CallInst>(CB) && CB->isIndirectCall());
4489 bool IsCFICall = IsIndirectCall && CLI.CFIType;
4490 const Module *M = MF.getMMI().getModule();
4491 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
4492
4493 MachineFunction::CallSiteInfo CSInfo;
4494 if (CallConv == CallingConv::X86_INTR)
4495 report_fatal_error("X86 interrupts may not be called directly");
4496
4497 bool IsMustTail = CLI.CB && CLI.CB->isMustTailCall();
4498 if (Subtarget.isPICStyleGOT() && !IsGuaranteeTCO && !IsMustTail) {
4499 // If we are using a GOT, disable tail calls to external symbols with
4500 // default visibility. Tail calling such a symbol requires using a GOT
4501 // relocation, which forces early binding of the symbol. This breaks code
4502 // that requires lazy function symbol resolution. Using musttail or
4503 // GuaranteedTailCallOpt will override this.
4504 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
4505 if (!G || (!G->getGlobal()->hasLocalLinkage() &&
4506 G->getGlobal()->hasDefaultVisibility()))
4507 isTailCall = false;
4508 }
4509
4510 if (isTailCall && !IsMustTail) {
4511 // Check if it's really possible to do a tail call.
4512 isTailCall = IsEligibleForTailCallOptimization(
4513 Callee, CallConv, IsCalleePopSRet, isVarArg, CLI.RetTy, Outs, OutVals,
4514 Ins, DAG);
4515
4516 // Sibcalls are automatically detected tailcalls which do not require
4517 // ABI changes.
4518 if (!IsGuaranteeTCO && isTailCall)
4519 IsSibcall = true;
4520
4521 if (isTailCall)
4522 ++NumTailCalls;
4523 }
4524
4525 if (IsMustTail && !isTailCall)
4526 report_fatal_error("failed to perform tail call elimination on a call "
4527 "site marked musttail");
4528
4529 assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
4530 "Var args not supported with calling convention fastcc, ghc or hipe");
4531
4532 // Analyze operands of the call, assigning locations to each operand.
4533 SmallVector<CCValAssign, 16> ArgLocs;
4534 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
4535
4536 // Allocate shadow area for Win64.
4537 if (IsWin64)
4538 CCInfo.AllocateStack(32, Align(8));
4539
4540 CCInfo.AnalyzeArguments(Outs, CC_X86);
4541
4542 // In vectorcall calling convention a second pass is required for the HVA
4543 // types.
4544 if (CallingConv::X86_VectorCall == CallConv) {
4545 CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
4546 }
4547
4548 // Get a count of how many bytes are to be pushed on the stack.
4549 unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
4550 if (IsSibcall)
4551 // This is a sibcall. The memory operands are available in caller's
4552 // own caller's stack.
4553 NumBytes = 0;
4554 else if (IsGuaranteeTCO && canGuaranteeTCO(CallConv))
4555 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
4556
4557 int FPDiff = 0;
4558 if (isTailCall &&
4559 shouldGuaranteeTCO(CallConv,
4560 MF.getTarget().Options.GuaranteedTailCallOpt)) {
4561 // Lower arguments at fp - stackoffset + fpdiff.
4562 unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
4563
4564 FPDiff = NumBytesCallerPushed - NumBytes;
4565
4566 // Set the delta of movement of the returnaddr stackslot.
4567 // But only set if delta is greater than previous delta.
4568 if (FPDiff < X86Info->getTCReturnAddrDelta())
4569 X86Info->setTCReturnAddrDelta(FPDiff);
4570 }
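[Editor's note: illustrative arithmetic, not part of X86ISelLowering.cpp.] FPDiff measures how far the outgoing argument area extends past the incoming one. For example, if the current function pops 16 bytes of its own incoming arguments on return (NumBytesCallerPushed == 16) and the tail callee needs 48 bytes of arguments (NumBytes == 48), then FPDiff = 16 - 48 = -32: the outgoing arguments reach 32 bytes beyond the incoming area, so the return address slot must be relocated by that delta, which is what EmitTailCallLoadRetAddr / EmitTailCallStoreRetAddr above handle (the new slot is created at offset FPDiff - SlotSize).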
4571
4572 unsigned NumBytesToPush = NumBytes;
4573 unsigned NumBytesToPop = NumBytes;
4574
4575 // If we have an inalloca argument, all stack space has already been allocated
4576 // for us and is right at the top of the stack. We don't support multiple
4577 // arguments passed in memory when using inalloca.
4578 if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
4579 NumBytesToPush = 0;
4580 if (!ArgLocs.back().isMemLoc())
4581 report_fatal_error("cannot use inalloca attribute on a register "
4582 "parameter");
4583 if (ArgLocs.back().getLocMemOffset() != 0)
4584 report_fatal_error("any parameter with the inalloca attribute must be "
4585 "the only memory argument");
4586 } else if (CLI.IsPreallocated) {
4587 assert(ArgLocs.back().isMemLoc() &&
4588 "cannot use preallocated attribute on a register "
4589 "parameter");
4590 SmallVector<size_t, 4> PreallocatedOffsets;
4591 for (size_t i = 0; i < CLI.OutVals.size(); ++i) {
4592 if (CLI.CB->paramHasAttr(i, Attribute::Preallocated)) {
4593 PreallocatedOffsets.push_back(ArgLocs[i].getLocMemOffset());
4594 }
4595 }
4596 auto *MFI = DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
4597 size_t PreallocatedId = MFI->getPreallocatedIdForCallSite(CLI.CB);
4598 MFI->setPreallocatedStackSize(PreallocatedId, NumBytes);
4599 MFI->setPreallocatedArgOffsets(PreallocatedId, PreallocatedOffsets);
4600 NumBytesToPush = 0;
4601 }
4602
4603 if (!IsSibcall && !IsMustTail)
4604 Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
4605 NumBytes - NumBytesToPush, dl);
4606
4607 SDValue RetAddrFrIdx;
4608 // Load return address for tail calls.
4609 if (isTailCall && FPDiff)
4610 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
4611 Is64Bit, FPDiff, dl);
4612
4613 SmallVector<std::pair<Register, SDValue>, 8> RegsToPass;
4614 SmallVector<SDValue, 8> MemOpChains;
4615 SDValue StackPtr;
4616
4617 // The next loop assumes that the locations are in the same order as the
4618 // input arguments.
4619 assert(isSortedByValueNo(ArgLocs) &&
4620 "Argument Location list must be sorted before lowering");
4621
4622 // Walk the register/memloc assignments, inserting copies/loads. In the case
4623 // of tail call optimization, arguments are handled later.
4624 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4625 for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
4626 ++I, ++OutIndex) {
4627 assert(OutIndex < Outs.size() && "Invalid Out index");
4628 // Skip inalloca/preallocated arguments, they have already been written.
4629 ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
4630 if (Flags.isInAlloca() || Flags.isPreallocated())
4631 continue;
4632
4633 CCValAssign &VA = ArgLocs[I];
4634 EVT RegVT = VA.getLocVT();
4635 SDValue Arg = OutVals[OutIndex];
4636 bool isByVal = Flags.isByVal();
4637
4638 // Promote the value if needed.
4639 switch (VA.getLocInfo()) {
4640 default: llvm_unreachable("Unknown loc info!");
4641 case CCValAssign::Full: break;
4642 case CCValAssign::SExt:
4643 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
4644 break;
4645 case CCValAssign::ZExt:
4646 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
4647 break;
4648 case CCValAssign::AExt:
4649 if (Arg.getValueType().isVector() &&
4650 Arg.getValueType().getVectorElementType() == MVT::i1)
4651 Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
4652 else if (RegVT.is128BitVector()) {
4653 // Special case: passing MMX values in XMM registers.
4654 Arg = DAG.getBitcast(MVT::i64, Arg);
4655 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
4656 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
4657 } else
4658 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
4659 break;
4660 case CCValAssign::BCvt:
4661 Arg = DAG.getBitcast(RegVT, Arg);
4662 break;
4663 case CCValAssign::Indirect: {
4664 if (isByVal) {
4665 // Memcpy the argument to a temporary stack slot to prevent
4666 // the caller from seeing any modifications the callee may make
4667 // as guaranteed by the `byval` attribute.
4668 int FrameIdx = MF.getFrameInfo().CreateStackObject(
4669 Flags.getByValSize(),
4670 std::max(Align(16), Flags.getNonZeroByValAlign()), false);
4671 SDValue StackSlot =
4672 DAG.getFrameIndex(FrameIdx, getPointerTy(DAG.getDataLayout()));
4673 Chain =
4674 CreateCopyOfByValArgument(Arg, StackSlot, Chain, Flags, DAG, dl);
4675 // From now on treat this as a regular pointer
4676 Arg = StackSlot;
4677 isByVal = false;
4678 } else {
4679 // Store the argument.
4680 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
4681 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
4682 Chain = DAG.getStore(
4683 Chain, dl, Arg, SpillSlot,
4684 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
4685 Arg = SpillSlot;
4686 }
4687 break;
4688 }
4689 }
4690
4691 if (VA.needsCustom()) {
4692 assert(VA.getValVT() == MVT::v64i1 &&
4693 "Currently the only custom case is when we split v64i1 to 2 regs");
4694 // Split v64i1 value into two registers
4695 Passv64i1ArgInRegs(dl, DAG, Arg, RegsToPass, VA, ArgLocs[++I], Subtarget);
4696 } else if (VA.isRegLoc()) {
4697 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
4698 const TargetOptions &Options = DAG.getTarget().Options;
4699 if (Options.EmitCallSiteInfo)
4700 CSInfo.emplace_back(VA.getLocReg(), I);
4701 if (isVarArg && IsWin64) {
4702 // Win64 ABI requires argument XMM reg to be copied to the corresponding
4703 // shadow reg if callee is a varargs function.
4704 Register ShadowReg;
4705 switch (VA.getLocReg()) {
4706 case X86::XMM0: ShadowReg = X86::RCX; break;
4707 case X86::XMM1: ShadowReg = X86::RDX; break;
4708 case X86::XMM2: ShadowReg = X86::R8; break;
4709 case X86::XMM3: ShadowReg = X86::R9; break;
4710 }
4711 if (ShadowReg)
4712 RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
4713 }
4714 } else if (!IsSibcall && (!isTailCall || isByVal)) {
4715 assert(VA.isMemLoc());
4716 if (!StackPtr.getNode())
4717 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
4718 getPointerTy(DAG.getDataLayout()));
4719 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
4720 dl, DAG, VA, Flags, isByVal));
4721 }
4722 }
4723
4724 if (!MemOpChains.empty())
4725 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
4726
4727 if (Subtarget.isPICStyleGOT()) {
4728 // ELF / PIC requires the GOT pointer to be in the EBX register before
4729 // function calls made via the PLT (except for regcall).
4730 if (!isTailCall) {
4731 // An indirect call with the RegCall calling convention may use up all
4732 // the general registers, so it is not suitable to bind the EBX register
4733 // to the GOT address; just let the register allocator handle it.
4734 if (CallConv != CallingConv::X86_RegCall)
4735 RegsToPass.push_back(std::make_pair(
4736 Register(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
4737 getPointerTy(DAG.getDataLayout()))));
4738 } else {
4739 // If we are tail calling and generating PIC/GOT style code load the
4740 // address of the callee into ECX. The value in ecx is used as target of
4741 // the tail jump. This is done to circumvent the ebx/callee-saved problem
4742 // for tail calls on PIC/GOT architectures. Normally we would just put the
4743 // address of GOT into ebx and then call target@PLT. But for tail calls
4744 // ebx would be restored (since ebx is callee saved) before jumping to the
4745 // target@PLT.
4746
4747 // Note: The actual moving to ECX is done further down.
4748 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
4749 if (G && !G->getGlobal()->hasLocalLinkage() &&
4750 G->getGlobal()->hasDefaultVisibility())
4751 Callee = LowerGlobalAddress(Callee, DAG);
4752 else if (isa<ExternalSymbolSDNode>(Callee))
4753 Callee = LowerExternalSymbol(Callee, DAG);
4754 }
4755 }
4756
4757 if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail &&
4758 (Subtarget.hasSSE1() || !M->getModuleFlag("SkipRaxSetup"))) {
4759 // From the AMD64 ABI document:
4760 // For calls that may call functions that use varargs or stdargs
4761 // (prototype-less calls or calls to functions containing ellipsis (...) in
4762 // the declaration) %al is used as a hidden argument to specify the number
4763 // of SSE registers used. The contents of %al do not need to match exactly
4764 // the number of registers, but must be an upper bound on the number of SSE
4765 // registers used and must be in the range 0 - 8 inclusive.
4766
4767 // Count the number of XMM registers allocated.
4768 static const MCPhysReg XMMArgRegs[] = {
4769 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
4770 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
4771 };
4772 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
4773 assert((Subtarget.hasSSE1() || !NumXMMRegs)
4774 && "SSE registers cannot be used when SSE is disabled");
4775 RegsToPass.push_back(std::make_pair(Register(X86::AL),
4776 DAG.getConstant(NumXMMRegs, dl,
4777 MVT::i8)));
4778 }
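[Editor's note: illustrative example, not part of X86ISelLowering.cpp.] A concrete instance of the %al convention handled above:

    // For a call like printf("%f\n", x) on SysV x86-64, x travels in XMM0, so
    // CCInfo reports NumXMMRegs == 1 and the pair (AL, constant 1) is appended
    // to RegsToPass. The copy-to-reg chain built further below then materializes
    // a move of the value 1 into AL right before the call; the callee's va_start
    // prologue tests AL to decide whether the XMM argument registers need to be
    // spilled to the register save area.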
4779
4780 if (isVarArg && IsMustTail) {
4781 const auto &Forwards = X86Info->getForwardedMustTailRegParms();
4782 for (const auto &F : Forwards) {
4783 SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
4784 RegsToPass.push_back(std::make_pair(F.PReg, Val));
4785 }
4786 }
4787
4788 // For tail calls lower the arguments to the 'real' stack slots. Sibcalls
4789 // don't need this because the eligibility check rejects calls that require
4790 // shuffling arguments passed in memory.
4791 if (!IsSibcall && isTailCall) {
4792 // Force all the incoming stack arguments to be loaded from the stack
4793 // before any new outgoing arguments are stored to the stack, because the
4794 // outgoing stack slots may alias the incoming argument stack slots, and
4795 // the alias isn't otherwise explicit. This is slightly more conservative
4796 // than necessary, because it means that each store effectively depends
4797 // on every argument instead of just those arguments it would clobber.
4798 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
4799
4800 SmallVector<SDValue, 8> MemOpChains2;
4801 SDValue FIN;
4802 int FI = 0;
4803 for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
4804 ++I, ++OutsIndex) {
4805 CCValAssign &VA = ArgLocs[I];
4806
4807 if (VA.isRegLoc()) {
4808 if (VA.needsCustom()) {
4809 assert((CallConv == CallingConv::X86_RegCall) &&
4810 "Expecting custom case only in regcall calling convention");
4811 // This means that we are in a special case where one argument was
4812 // passed through two register locations - skip the next location.
4813 ++I;
4814 }
4815
4816 continue;
4817 }
4818
4819 assert(VA.isMemLoc());
4820 SDValue Arg = OutVals[OutsIndex];
4821 ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
4822 // Skip inalloca/preallocated arguments. They don't require any work.
4823 if (Flags.isInAlloca() || Flags.isPreallocated())
4824 continue;
4825 // Create frame index.
4826 int32_t Offset = VA.getLocMemOffset()+FPDiff;
4827 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
4828 FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
4829 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
4830
4831 if (Flags.isByVal()) {
4832 // Copy relative to framepointer.
4833 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
4834 if (!StackPtr.getNode())
4835 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
4836 getPointerTy(DAG.getDataLayout()));
4837 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
4838 StackPtr, Source);
4839
4840 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
4841 ArgChain,
4842 Flags, DAG, dl));
4843 } else {
4844 // Store relative to framepointer.
4845 MemOpChains2.push_back(DAG.getStore(
4846 ArgChain, dl, Arg, FIN,
4847 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
4848 }
4849 }
4850
4851 if (!MemOpChains2.empty())
4852 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
4853
4854 // Store the return address to the appropriate stack slot.
4855 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
4856 getPointerTy(DAG.getDataLayout()),
4857 RegInfo->getSlotSize(), FPDiff, dl);
4858 }
4859
4860 // Build a sequence of copy-to-reg nodes chained together with token chain
4861 // and glue operands which copy the outgoing args into registers.
4862 SDValue InGlue;
4863 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
4864 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
4865 RegsToPass[i].second, InGlue);
4866 InGlue = Chain.getValue(1);
4867 }
4868
4869 if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
4870 assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
4871 // In the 64-bit large code model, we have to make all calls
4872 // through a register, since the call instruction's 32-bit
4873 // pc-relative offset may not be large enough to hold the whole
4874 // address.
4875 } else if (Callee->getOpcode() == ISD::GlobalAddress ||
4876 Callee->getOpcode() == ISD::ExternalSymbol) {
4877 // Lower direct calls to global addresses and external symbols. Setting
4878 // ForCall to true here has the effect of removing WrapperRIP when possible
4879 // to allow direct calls to be selected without first materializing the
4880 // address into a register.
4881 Callee = LowerGlobalOrExternal(Callee, DAG, /*ForCall=*/true);
4882 } else if (Subtarget.isTarget64BitILP32() &&
4883 Callee.getValueType() == MVT::i32) {
4884 // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
4885 Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
4886 }
4887
4888 // Returns a chain & a glue for retval copy to use.
4889 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
4890 SmallVector<SDValue, 8> Ops;
4891
4892 if (!IsSibcall && isTailCall && !IsMustTail) {
4893 Chain = DAG.getCALLSEQ_END(Chain, NumBytesToPop, 0, InGlue, dl);
4894 InGlue = Chain.getValue(1);
4895 }
4896
4897 Ops.push_back(Chain);
4898 Ops.push_back(Callee);
4899
4900 if (isTailCall)
4901 Ops.push_back(DAG.getTargetConstant(FPDiff, dl, MVT::i32));
4902
4903 // Add argument registers to the end of the list so that they are known live
4904 // into the call.
4905 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
4906 Ops.push_back(DAG.getRegister(RegsToPass[i].first,
4907 RegsToPass[i].second.getValueType()));
4908
4909 // Add a register mask operand representing the call-preserved registers.
4910 const uint32_t *Mask = [&]() {
4911 auto AdaptedCC = CallConv;
4912 // If HasNCSR is asserted (attribute NoCallerSavedRegisters exists),
4913 // use X86_INTR calling convention because it has the same CSR mask
4914 // (same preserved registers).
4915 if (HasNCSR)
4916 AdaptedCC = (CallingConv::ID)CallingConv::X86_INTR;
4917 // If NoCalleeSavedRegisters is requested, then use GHC since it happens
4918 // to use the CSR_NoRegs_RegMask.
4919 if (CB && CB->hasFnAttr("no_callee_saved_registers"))
4920 AdaptedCC = (CallingConv::ID)CallingConv::GHC;
4921 return RegInfo->getCallPreservedMask(MF, AdaptedCC);
4922 }();
4923 assert(Mask && "Missing call preserved mask for calling convention");
4924
4925 // If this is an invoke in a 32-bit function using a funclet-based
4926 // personality, assume the function clobbers all registers. If an exception
4927 // is thrown, the runtime will not restore CSRs.
4928 // FIXME: Model this more precisely so that we can register allocate across
4929 // the normal edge and spill and fill across the exceptional edge.
4930 if (!Is64Bit && CLI.CB && isa<InvokeInst>(CLI.CB)) {
4931 const Function &CallerFn = MF.getFunction();
4932 EHPersonality Pers =
4933 CallerFn.hasPersonalityFn()
4934 ? classifyEHPersonality(CallerFn.getPersonalityFn())
4935 : EHPersonality::Unknown;
4936 if (isFuncletEHPersonality(Pers))
4937 Mask = RegInfo->getNoPreservedMask();
4938 }
4939
4940 // Define a new register mask from the existing mask.
4941 uint32_t *RegMask = nullptr;
4942
4943 // In some calling conventions we need to remove the used physical registers
4944 // from the reg mask. Create a new RegMask for such calling conventions.
4945 // RegMask for calling conventions that disable only return registers (e.g.
4946 // preserve_most) will be modified later in LowerCallResult.
4947 bool ShouldDisableArgRegs = shouldDisableArgRegFromCSR(CallConv) || HasNCSR;
4948 if (ShouldDisableArgRegs || shouldDisableRetRegFromCSR(CallConv)) {
4949 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
4950
4951 // Allocate a new Reg Mask and copy Mask.
4952 RegMask = MF.allocateRegMask();
4953 unsigned RegMaskSize = MachineOperand::getRegMaskSize(TRI->getNumRegs());
4954 memcpy(RegMask, Mask, sizeof(RegMask[0]) * RegMaskSize);
4955
4956 // Make sure all sub registers of the argument registers are reset
4957 // in the RegMask.
4958 if (ShouldDisableArgRegs) {
4959 for (auto const &RegPair : RegsToPass)
4960 for (MCPhysReg SubReg : TRI->subregs_inclusive(RegPair.first))
4961 RegMask[SubReg / 32] &= ~(1u << (SubReg % 32));
4962 }
4963
4964 // Create the RegMask Operand according to our updated mask.
4965 Ops.push_back(DAG.getRegisterMask(RegMask));
4966 } else {
4967 // Create the RegMask Operand according to the static mask.
4968 Ops.push_back(DAG.getRegisterMask(Mask));
4969 }
4970
4971 if (InGlue.getNode())
4972 Ops.push_back(InGlue);
4973
4974 if (isTailCall) {
4975 // We used to do:
4976 //// If this is the first return lowered for this function, add the regs
4977 //// to the liveout set for the function.
4978 // This isn't right, although it's probably harmless on x86; liveouts
4979 // should be computed from returns not tail calls. Consider a void
4980 // function making a tail call to a function returning int.
4981 MF.getFrameInfo().setHasTailCall();
4982 SDValue Ret = DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
4983
4984 if (IsCFICall)
4985 Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue());
4986
4987 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
4988 return Ret;
4989 }
4990
4991 if (HasNoCfCheck && IsCFProtectionSupported && IsIndirectCall) {
4992 Chain = DAG.getNode(X86ISD::NT_CALL, dl, NodeTys, Ops);
4993 } else if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
4994 // Calls with a "clang.arc.attachedcall" bundle are special. They should be
4995 // expanded to the call, directly followed by a special marker sequence and
4996 // a call to an ObjC library function. Use the CALL_RVMARKER to do that.
4997 assert(!isTailCall &&
4998 "tail calls cannot be marked with clang.arc.attachedcall");
4999 assert(Is64Bit && "clang.arc.attachedcall is only supported in 64bit mode");
5000
5001 // Add a target global address for the retainRV/claimRV runtime function
5002 // just before the call target.
5003 Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB);
5004 auto PtrVT = getPointerTy(DAG.getDataLayout());
5005 auto GA = DAG.getTargetGlobalAddress(ARCFn, dl, PtrVT);
5006 Ops.insert(Ops.begin() + 1, GA);
5007 Chain = DAG.getNode(X86ISD::CALL_RVMARKER, dl, NodeTys, Ops);
5008 } else {
5009 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
5010 }
5011
5012 if (IsCFICall)
5013 Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());
5014
5015 InGlue = Chain.getValue(1);
5016 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
5017 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
5018
5019 // Save heapallocsite metadata.
5020 if (CLI.CB)
5021 if (MDNode *HeapAlloc = CLI.CB->getMetadata("heapallocsite"))
5022 DAG.addHeapAllocSite(Chain.getNode(), HeapAlloc);
5023
5024 // Create the CALLSEQ_END node.
5025 unsigned NumBytesForCalleeToPop = 0; // Callee pops nothing.
5026 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
5027 DAG.getTarget().Options.GuaranteedTailCallOpt))
5028 NumBytesForCalleeToPop = NumBytes; // Callee pops everything
5029 else if (!canGuaranteeTCO(CallConv) && IsCalleePopSRet)
5030 // If this call passes a struct-return pointer, the callee
5031 // pops that struct pointer.
5032 NumBytesForCalleeToPop = 4;
5033
5034 // Returns a glue for retval copy to use.
5035 if (!IsSibcall) {
5036 Chain = DAG.getCALLSEQ_END(Chain, NumBytesToPop, NumBytesForCalleeToPop,
5037 InGlue, dl);
5038 InGlue = Chain.getValue(1);
5039 }
5040
5041 // Handle result values, copying them out of physregs into vregs that we
5042 // return.
5043 return LowerCallResult(Chain, InGlue, CallConv, isVarArg, Ins, dl, DAG,
5044 InVals, RegMask);
5045}
5046
5047//===----------------------------------------------------------------------===//
5048// Fast Calling Convention (tail call) implementation
5049//===----------------------------------------------------------------------===//
5050
5051 // Like stdcall (the callee cleans up the arguments), except that ECX is
5052 // reserved for storing the tail-called function address. Only 2 registers are
5053// free for argument passing (inreg). Tail call optimization is performed
5054// provided:
5055// * tailcallopt is enabled
5056// * caller/callee are fastcc
5057 // On the x86-64 architecture with GOT-style position-independent code, only
5058 // local (within-module) calls are supported at the moment.
5059 // To keep the stack aligned according to the platform ABI, the function
5060 // GetAlignedArgumentStackSize ensures that the argument delta is always a
5061 // multiple of the stack alignment. (Dynamic linkers need this - Darwin's dyld,
5062 // for example.) If the tail-called callee has more arguments than the caller,
5063 // the caller needs to make sure that there is room to move the RETADDR to. This
5064 // is achieved by reserving an area the size of the argument delta right after
5065 // the original RETADDR, but before the saved frame pointer or the spilled
5066 // registers, e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4)
5067// stack layout:
5068// arg1
5069// arg2
5070// RETADDR
5071// [ new RETADDR
5072// move area ]
5073// (possible EBP)
5074// ESI
5075// EDI
5076// local1 ..
5077
5078 /// Align the stack size to, e.g., 16n + 12 for a 16-byte alignment
5079 /// requirement.
5080unsigned
5081X86TargetLowering::GetAlignedArgumentStackSize(const unsigned StackSize,
5082 SelectionDAG &DAG) const {
5083 const Align StackAlignment = Subtarget.getFrameLowering()->getStackAlign();
5084 const uint64_t SlotSize = Subtarget.getRegisterInfo()->getSlotSize();
5085 assert(StackSize % SlotSize == 0 &&
5086 "StackSize must be a multiple of SlotSize");
5087 return alignTo(StackSize + SlotSize, StackAlignment) - SlotSize;
5088}
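A quick worked example of the computation above, a sketch with assumed values rather than anything taken from this report: with 4-byte slots and a 16-byte stack alignment, the padding produces the "16n + 12" shape mentioned in the doc comment, so that pushing the return address restores 16-byte alignment.

#include <cassert>
#include <cstdint>

// Standalone restatement of the formula, for illustration only.
static uint64_t alignedArgSize(uint64_t StackSize, uint64_t SlotSize,
                               uint64_t StackAlignment) {
  auto alignTo = [](uint64_t V, uint64_t A) { return (V + A - 1) / A * A; };
  return alignTo(StackSize + SlotSize, StackAlignment) - SlotSize;
}

int main() {
  assert(alignedArgSize(20, 4, 16) == 28); // 28 == 16*1 + 12; 28 + 4 == 32
  assert(alignedArgSize(28, 4, 16) == 28); // already the right shape: unchanged
  return 0;
}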
5089
5090/// Return true if the given stack call argument is already available in the
5091/// same position (relatively) of the caller's incoming argument stack.
5092static
5093bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
5094 MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
5095 const X86InstrInfo *TII, const CCValAssign &VA) {
5096 unsigned Bytes = Arg.getValueSizeInBits() / 8;
5097
5098 for (;;) {
5099 // Look through nodes that don't alter the bits of the incoming value.
5100 unsigned Op = Arg.getOpcode();
5101 if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
5102 Arg = Arg.getOperand(0);
5103 continue;
5104 }
5105 if (Op == ISD::TRUNCATE) {
5106 const SDValue &TruncInput = Arg.getOperand(0);
5107 if (TruncInput.getOpcode() == ISD::AssertZext &&
5108 cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
5109 Arg.getValueType()) {
5110 Arg = TruncInput.getOperand(0);
5111 continue;
5112 }
5113 }
5114 break;
5115 }
5116
5117 int FI = INT_MAX;
5118 if (Arg.getOpcode() == ISD::CopyFromReg) {
5119 Register VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
5120 if (!VR.isVirtual())
5121 return false;
5122 MachineInstr *Def = MRI->getVRegDef(VR);
5123 if (!Def)
5124 return false;
5125 if (!Flags.isByVal()) {
5126 if (!TII->isLoadFromStackSlot(*Def, FI))
5127 return false;
5128 } else {
5129 unsigned Opcode = Def->getOpcode();
5130 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
5131 Opcode == X86::LEA64_32r) &&
5132 Def->getOperand(1).isFI()) {
5133 FI = Def->getOperand(1).getIndex();
5134 Bytes = Flags.getByValSize();
5135 } else
5136 return false;
5137 }
5138 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
5139 if (Flags.isByVal())
5140 // A byval argument is passed in as a pointer but it's now being
5141 // dereferenced, e.g.
5142 // define @foo(%struct.X* %A) {
5143 // tail call @bar(%struct.X* byval %A)
5144 // }
5145 return false;
5146 SDValue Ptr = Ld->getBasePtr();
5147 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
5148 if (!FINode)
5149 return false;
5150 FI = FINode->getIndex();
5151 } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
5152 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
5153 FI = FINode->getIndex();
5154 Bytes = Flags.getByValSize();
5155 } else
5156 return false;
5157
5158 assert(FI != INT_MAX);
5159 if (!MFI.isFixedObjectIndex(FI))
5160 return false;
5161
5162 if (Offset != MFI.getObjectOffset(FI))
5163 return false;
5164
5165 // If this is not byval, check that the argument stack object is immutable.
5166 // inalloca and argument copy elision can create mutable argument stack
5167 // objects. Byval objects can be mutated, but a byval call intends to pass the
5168 // mutated memory.
5169 if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
5170 return false;
5171
5172 if (VA.getLocVT().getFixedSizeInBits() >
5173 Arg.getValueSizeInBits().getFixedValue()) {
5174 // If the argument location is wider than the argument type, check that any
5175 // extension flags match.
5176 if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
5177 Flags.isSExt() != MFI.isObjectSExt(FI)) {
5178 return false;
5179 }
5180 }
5181
5182 return Bytes == MFI.getObjectSize(FI);
5183}
5184
5185/// Check whether the call is eligible for tail call optimization. Targets
5186/// that want to do tail call optimization should implement this function.
5187bool X86TargetLowering::IsEligibleForTailCallOptimization(
5188 SDValue Callee, CallingConv::ID CalleeCC, bool IsCalleePopSRet,
5189 bool isVarArg, Type *RetTy, const SmallVectorImpl<ISD::OutputArg> &Outs,
5190 const SmallVectorImpl<SDValue> &OutVals,
5191 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
5192 if (!mayTailCallThisCC(CalleeCC))
5193 return false;
5194
5195 // If -tailcallopt is specified, make fastcc functions tail-callable.
5196 MachineFunction &MF = DAG.getMachineFunction();
5197 const Function &CallerF = MF.getFunction();
5198
5199 // If the function return type is x86_fp80 and the callee return type is not,
5200 // then the FP_EXTEND of the call result is not a nop. It's not safe to
5201 // perform a tailcall optimization here.
5202 if (CallerF.getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
5203 return false;
5204
5205 CallingConv::ID CallerCC = CallerF.getCallingConv();
5206 bool CCMatch = CallerCC == CalleeCC;
5207 bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
5208 bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
5209 bool IsGuaranteeTCO = DAG.getTarget().Options.GuaranteedTailCallOpt ||
5210 CalleeCC == CallingConv::Tail || CalleeCC == CallingConv::SwiftTail;
5211
5212 // Win64 functions have extra shadow space for argument homing. Don't do the
5213 // sibcall if the caller and callee have mismatched expectations for this
5214 // space.
5215 if (IsCalleeWin64 != IsCallerWin64)
5216 return false;
5217
5218 if (IsGuaranteeTCO) {
5219 if (canGuaranteeTCO(CalleeCC) && CCMatch)
5220 return true;
5221 return false;
5222 }
5223
5224 // Look for obvious safe cases to perform tail call optimization that do not
5225 // require ABI changes. This is what gcc calls sibcall.
5226
5227 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
5228 // emit a special epilogue.
5229 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
5230 if (RegInfo->hasStackRealignment(MF))
5231 return false;
5232
5233 // Also avoid sibcall optimization if we're an sret return fn and the callee
5234 // is incompatible. See comment in LowerReturn about why hasStructRetAttr is
5235 // insufficient.
5236 if (MF.getInfo<X86MachineFunctionInfo>()->getSRetReturnReg()) {
5237 // For a compatible tail call the callee must return our sret pointer. So it
5238 // needs to be (a) an sret function itself and (b) we pass our sret as its
5239 // sret. Condition #b is harder to determine.
5240 return false;
5241 } else if (IsCalleePopSRet)
5242 // The callee pops an sret, so we cannot tail-call, as our caller doesn't
5243 // expect that.
5244 return false;
5245
5246 // Do not sibcall optimize vararg calls unless all arguments are passed via
5247 // registers.
5248 LLVMContext &C = *DAG.getContext();
5249 if (isVarArg && !Outs.empty()) {
5250 // Optimizing for varargs on Win64 is unlikely to be safe without
5251 // additional testing.
5252 if (IsCalleeWin64 || IsCallerWin64)
5253 return false;
5254
5255 SmallVector<CCValAssign, 16> ArgLocs;
5256 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
5257
5258 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
5259 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
5260 if (!ArgLocs[i].isRegLoc())
5261 return false;
5262 }
5263
5264 // If the call result is in ST0 / ST1, it needs to be popped off the x87
5265 // stack. Therefore, if it's not used by the call it is not safe to optimize
5266 // this into a sibcall.
5267 bool Unused = false;
5268 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
5269 if (!Ins[i].Used) {
5270 Unused = true;
5271 break;
5272 }
5273 }
5274 if (Unused) {
5275 SmallVector<CCValAssign, 16> RVLocs;
5276 CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
5277 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
5278 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
5279 CCValAssign &VA = RVLocs[i];
5280 if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
5281 return false;
5282 }
5283 }
5284
5285 // Check that the call results are passed in the same way.
5286 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
5287 RetCC_X86, RetCC_X86))
5288 return false;
5289 // The callee has to preserve all registers the caller needs to preserve.
5290 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
5291 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
5292 if (!CCMatch) {
5293 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
5294 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
5295 return false;
5296 }
5297
5298 unsigned StackArgsSize = 0;
5299
5300 // If the callee takes no arguments then go on to check the results of the
5301 // call.
5302 if (!Outs.empty()) {
5303 // Check if stack adjustment is needed. For now, do not do this if any
5304 // argument is passed on the stack.
5305 SmallVector<CCValAssign, 16> ArgLocs;
5306 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
5307
5308 // Allocate shadow area for Win64
5309 if (IsCalleeWin64)
5310 CCInfo.AllocateStack(32, Align(8));
5311
5312 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
5313 StackArgsSize = CCInfo.getNextStackOffset();
5314
5315 if (CCInfo.getNextStackOffset()) {
5316 // Check if the arguments are already laid out in the right way as
5317 // the caller's fixed stack objects.
5318 MachineFrameInfo &MFI = MF.getFrameInfo();
5319 const MachineRegisterInfo *MRI = &MF.getRegInfo();
5320 const X86InstrInfo *TII = Subtarget.getInstrInfo();
5321 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
5322 CCValAssign &VA = ArgLocs[i];
5323 SDValue Arg = OutVals[i];
5324 ISD::ArgFlagsTy Flags = Outs[i].Flags;
5325 if (VA.getLocInfo() == CCValAssign::Indirect)
5326 return false;
5327 if (!VA.isRegLoc()) {
5328 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
5329 MFI, MRI, TII, VA))
5330 return false;
5331 }
5332 }
5333 }
5334
5335 bool PositionIndependent = isPositionIndependent();
5336 // If the tailcall address may be in a register, then make sure it's
5337 // possible to register allocate for it. In 32-bit, the call address can
5338 // only target EAX, EDX, or ECX since the tail call must be scheduled after
5339 // callee-saved registers are restored. These happen to be the same
5340 // registers used to pass 'inreg' arguments so watch out for those.
5341 if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
5342 !isa<ExternalSymbolSDNode>(Callee)) ||
5343 PositionIndependent)) {
5344 unsigned NumInRegs = 0;
5345 // In PIC we need an extra register to formulate the address computation
5346 // for the callee.
5347 unsigned MaxInRegs = PositionIndependent ? 2 : 3;
5348
5349 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
5350 CCValAssign &VA = ArgLocs[i];
5351 if (!VA.isRegLoc())
5352 continue;
5353 Register Reg = VA.getLocReg();
5354 switch (Reg) {
5355 default: break;
5356 case X86::EAX: case X86::EDX: case X86::ECX:
5357 if (++NumInRegs == MaxInRegs)
5358 return false;
5359 break;
5360 }
5361 }
5362 }
5363
5364 const MachineRegisterInfo &MRI = MF.getRegInfo();
5365 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
5366 return false;
5367 }
5368
5369 bool CalleeWillPop =
5370 X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
5371 MF.getTarget().Options.GuaranteedTailCallOpt);
5372
5373 if (unsigned BytesToPop =
5374 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
5375 // If we have bytes to pop, the callee must pop them.
5376 bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
5377 if (!CalleePopMatches)
5378 return false;
5379 } else if (CalleeWillPop && StackArgsSize > 0) {
5380 // If we don't have bytes to pop, make sure the callee doesn't pop any.
5381 return false;
5382 }
5383
5384 return true;
5385}
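As a rough source-level illustration of the eligibility checks above (my example, not part of this report): a call in tail position whose arguments all fit in registers, with matching conventions and no sret, is the easy sibcall case, while a callee that needs new stack arguments will typically fail the stack-offset checks.

// Illustration only; whether a jmp is actually emitted depends on the target,
// the optimization level, and the checks implemented above.
int callee2(int a, int b);
int callee6(int a, int b, int c, int d, int e, int f);

int caller(int a, int b) {
  return callee2(a, b);             // typically lowered as a sibcall (jmp)
}

int caller_more_args(int a, int b) {
  return callee6(a, b, 1, 2, 3, 4); // extra stack arguments can block it
}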
5386
5387FastISel *
5388X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
5389 const TargetLibraryInfo *libInfo) const {
5390 return X86::createFastISel(funcInfo, libInfo);
5391}
5392
5393//===----------------------------------------------------------------------===//
5394// Other Lowering Hooks
5395//===----------------------------------------------------------------------===//
5396
5397bool X86::mayFoldLoad(SDValue Op, const X86Subtarget &Subtarget,
5398 bool AssumeSingleUse) {
5399 if (!AssumeSingleUse && !Op.hasOneUse())
5400 return false;
5401 if (!ISD::isNormalLoad(Op.getNode()))
5402 return false;
5403
5404 // If this is an unaligned vector, make sure the target supports folding it.
5405 auto *Ld = cast<LoadSDNode>(Op.getNode());
5406 if (!Subtarget.hasAVX() && !Subtarget.hasSSEUnalignedMem() &&
5407 Ld->getValueSizeInBits(0) == 128 && Ld->getAlign() < Align(16))
5408 return false;
5409
5410 // TODO: If this is a non-temporal load and the target has an instruction
5411 // for it, it should not be folded. See "useNonTemporalLoad()".
5412
5413 return true;
5414}
5415
5416bool X86::mayFoldLoadIntoBroadcastFromMem(SDValue Op, MVT EltVT,
5417 const X86Subtarget &Subtarget,
5418 bool AssumeSingleUse) {
5419 assert(Subtarget.hasAVX() && "Expected AVX for broadcast from memory");
5420 if (!X86::mayFoldLoad(Op, Subtarget, AssumeSingleUse))
5421 return false;
5422
5423 // We cannot replace a wide volatile load with a broadcast-from-memory,
5424 // because that would narrow the load, which isn't legal for volatiles.
5425 auto *Ld = cast<LoadSDNode>(Op.getNode());
5426 return !Ld->isVolatile() ||
5427 Ld->getValueSizeInBits(0) == EltVT.getScalarSizeInBits();
5428}
5429
5430bool X86::mayFoldIntoStore(SDValue Op) {
5431 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
5432}
5433
5434bool X86::mayFoldIntoZeroExtend(SDValue Op) {
5435 if (Op.hasOneUse()) {
5436 unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
5437 return (ISD::ZERO_EXTEND == Opcode);
5438 }
5439 return false;
5440}
5441
5442static bool isTargetShuffle(unsigned Opcode) {
5443 switch(Opcode) {
5444 default: return false;
5445 case X86ISD::BLENDI:
5446 case X86ISD::PSHUFB:
5447 case X86ISD::PSHUFD:
5448 case X86ISD::PSHUFHW:
5449 case X86ISD::PSHUFLW:
5450 case X86ISD::SHUFP:
5451 case X86ISD::INSERTPS:
5452 case X86ISD::EXTRQI:
5453 case X86ISD::INSERTQI:
5454 case X86ISD::VALIGN:
5455 case X86ISD::PALIGNR:
5456 case X86ISD::VSHLDQ:
5457 case X86ISD::VSRLDQ:
5458 case X86ISD::MOVLHPS:
5459 case X86ISD::MOVHLPS:
5460 case X86ISD::MOVSHDUP:
5461 case X86ISD::MOVSLDUP:
5462 case X86ISD::MOVDDUP:
5463 case X86ISD::MOVSS:
5464 case X86ISD::MOVSD:
5465 case X86ISD::MOVSH:
5466 case X86ISD::UNPCKL:
5467 case X86ISD::UNPCKH:
5468 case X86ISD::VBROADCAST:
5469 case X86ISD::VPERMILPI:
5470 case X86ISD::VPERMILPV:
5471 case X86ISD::VPERM2X128:
5472 case X86ISD::SHUF128:
5473 case X86ISD::VPERMIL2:
5474 case X86ISD::VPERMI:
5475 case X86ISD::VPPERM:
5476 case X86ISD::VPERMV:
5477 case X86ISD::VPERMV3:
5478 case X86ISD::VZEXT_MOVL:
5479 return true;
5480 }
5481}
5482
5483static bool isTargetShuffleVariableMask(unsigned Opcode) {
5484 switch (Opcode) {
5485 default: return false;
5486 // Target Shuffles.
5487 case X86ISD::PSHUFB:
5488 case X86ISD::VPERMILPV:
5489 case X86ISD::VPERMIL2:
5490 case X86ISD::VPPERM:
5491 case X86ISD::VPERMV:
5492 case X86ISD::VPERMV3:
5493 return true;
5494 // 'Faux' Target Shuffles.
5495 case ISD::OR:
5496 case ISD::AND:
5497 case X86ISD::ANDNP:
5498 return true;
5499 }
5500}
5501
5502SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
5503 MachineFunction &MF = DAG.getMachineFunction();
5504 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
5505 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
5506 int ReturnAddrIndex = FuncInfo->getRAIndex();
5507
5508 if (ReturnAddrIndex == 0) {
5509 // Set up a frame object for the return address.
5510 unsigned SlotSize = RegInfo->getSlotSize();
5511 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
5512 -(int64_t)SlotSize,
5513 false);
5514 FuncInfo->setRAIndex(ReturnAddrIndex);
5515 }
5516
5517 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
5518}
5519
5520bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
5521 bool hasSymbolicDisplacement) {
5522 // Offset should fit into 32 bit immediate field.
5523 if (!isInt<32>(Offset))
5524 return false;
5525
5526 // If we don't have a symbolic displacement - we don't have any extra
5527 // restrictions.
5528 if (!hasSymbolicDisplacement)
5529 return true;
5530
5531 // FIXME: Some tweaks might be needed for medium code model.
5532 if (M != CodeModel::Small && M != CodeModel::Kernel)
5533 return false;
5534
5535 // For the small code model we assume that the last object is within 16MB of
5536 // the end of the 31-bit boundary. We may also accept pretty large negative
5537 // constants, knowing that all objects are in the positive half of the address space.
5538 if (M == CodeModel::Small && Offset < 16*1024*1024)
5539 return true;
5540
5541 // For the kernel code model we know that all objects reside in the negative
5542 // half of the 32-bit address space. We may not accept negative offsets, since
5543 // they may be just out of range, but we may accept pretty large positive ones.
5544 if (M == CodeModel::Kernel && Offset >= 0)
5545 return true;
5546
5547 return false;
5548}
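A few illustrative evaluations of the rules above, with assumed inputs rather than values taken from this report:

// With a symbolic displacement:
//   isOffsetSuitableForCodeModel(1 << 20, CodeModel::Small,  true)  -> true   (< 16 MiB)
//   isOffsetSuitableForCodeModel(1 << 25, CodeModel::Small,  true)  -> false  (>= 16 MiB)
//   isOffsetSuitableForCodeModel(-8,      CodeModel::Kernel, true)  -> false  (negative)
// Without one, only the signed 32-bit immediate check applies:
//   isOffsetSuitableForCodeModel(INT64_C(1) << 40, CodeModel::Small, false) -> false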
5549
5550/// Determines whether the callee is required to pop its own arguments.
5551/// Callee pop is necessary to support tail calls.
5552bool X86::isCalleePop(CallingConv::ID CallingConv,
5553 bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
5554 // If GuaranteeTCO is true, we force some calls to be callee pop so that we
5555 // can guarantee TCO.
5556 if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
5557 return true;
5558
5559 switch (CallingConv) {
5560 default:
5561 return false;
5562 case CallingConv::X86_StdCall:
5563 case CallingConv::X86_FastCall:
5564 case CallingConv::X86_ThisCall:
5565 case CallingConv::X86_VectorCall:
5566 return !is64Bit;
5567 }
5568}
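A couple of hedged examples of what the switch above implies (assumed arguments, not from this report): the stdcall family pops its own arguments only in 32-bit mode.

//   X86::isCalleePop(CallingConv::X86_StdCall, /*is64Bit=*/false,
//                    /*IsVarArg=*/false, /*GuaranteeTCO=*/false) -> true
//   X86::isCalleePop(CallingConv::X86_StdCall, /*is64Bit=*/true,
//                    /*IsVarArg=*/false, /*GuaranteeTCO=*/false) -> false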
5569
5570 /// Return true if the condition is a signed comparison operation.
5571static bool isX86CCSigned(unsigned X86CC) {
5572 switch (X86CC) {
5573 default:
5574 llvm_unreachable("Invalid integer condition!");
5575 case X86::COND_E:
5576 case X86::COND_NE:
5577 case X86::COND_B:
5578 case X86::COND_A:
5579 case X86::COND_BE:
5580 case X86::COND_AE:
5581 return false;
5582 case X86::COND_G:
5583 case X86::COND_GE:
5584 case X86::COND_L:
5585 case X86::COND_LE:
5586 return true;
5587 }
5588}
5589
5590static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
5591 switch (SetCCOpcode) {
5592 default: llvm_unreachable("Invalid integer condition!");
5593 case ISD::SETEQ: return X86::COND_E;
5594 case ISD::SETGT: return X86::COND_G;
5595 case ISD::SETGE: return X86::COND_GE;
5596 case ISD::SETLT: return X86::COND_L;
5597 case ISD::SETLE: return X86::COND_LE;
5598 case ISD::SETNE: return X86::COND_NE;
5599 case ISD::SETULT: return X86::COND_B;
5600 case ISD::SETUGT: return X86::COND_A;
5601 case ISD::SETULE: return X86::COND_BE;
5602 case ISD::SETUGE: return X86::COND_AE;
5603 }
5604}
5605
5606/// Do a one-to-one translation of a ISD::CondCode to the X86-specific
5607/// condition code, returning the condition code and the LHS/RHS of the
5608/// comparison to make.
5609static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
5610 bool isFP, SDValue &LHS, SDValue &RHS,
5611 SelectionDAG &DAG) {
5612 if (!isFP) {
5613 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
5614 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnes()) {
5615 // X > -1 -> X == 0, jump !sign.
5616 RHS = DAG.getConstant(0, DL, RHS.getValueType());
5617 return X86::COND_NS;
5618 }
5619 if (SetCCOpcode == ISD::SETLT && RHSC->isZero()) {
5620 // X < 0 -> X == 0, jump on sign.
5621 return X86::COND_S;
5622 }
5623 if (SetCCOpcode == ISD::SETGE && RHSC->isZero()) {
5624 // X >= 0 -> X == 0, jump on !sign.
5625 return X86::COND_NS;
5626 }
5627 if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
5628 // X < 1 -> X <= 0
5629 RHS = DAG.getConstant(0, DL, RHS.getValueType());
5630 return X86::COND_LE;
5631 }
5632 }
5633
5634 return TranslateIntegerX86CC(SetCCOpcode);
5635 }
5636
5637 // First determine if it is required or is profitable to flip the operands.
5638
5639 // If LHS is a foldable load, but RHS is not, flip the condition.
5640 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
5641 !ISD::isNON_EXTLoad(RHS.getNode())) {
5642 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
5643 std::swap(LHS, RHS);
5644 }
5645
5646 switch (SetCCOpcode) {
5647 default: break;
5648 case ISD::SETOLT:
5649 case ISD::SETOLE:
5650 case ISD::SETUGT:
5651 case ISD::SETUGE:
5652 std::swap(LHS, RHS);
5653 break;
5654 }
5655
5656 // On a floating point condition, the flags are set as follows:
5657 // ZF PF CF op
5658 // 0 | 0 | 0 | X > Y
5659 // 0 | 0 | 1 | X < Y
5660 // 1 | 0 | 0 | X == Y
5661 // 1 | 1 | 1 | unordered
5662 switch (SetCCOpcode) {
5663 default: llvm_unreachable("Condcode should be pre-legalized away");
5664 case ISD::SETUEQ:
5665 case ISD::SETEQ: return X86::COND_E;
5666 case ISD::SETOLT: // flipped
5667 case ISD::SETOGT:
5668 case ISD::SETGT: return X86::COND_A;
5669 case ISD::SETOLE: // flipped
5670 case ISD::SETOGE:
5671 case ISD::SETGE: return X86::COND_AE;
5672 case ISD::SETUGT: // flipped
5673 case ISD::SETULT:
5674 case ISD::SETLT: return X86::COND_B;
5675 case ISD::SETUGE: // flipped
5676 case ISD::SETULE:
5677 case ISD::SETLE: return X86::COND_BE;
5678 case ISD::SETONE:
5679 case ISD::SETNE: return X86::COND_NE;
5680 case ISD::SETUO: return X86::COND_P;
5681 case ISD::SETO: return X86::COND_NP;
5682 case ISD::SETOEQ:
5683 case ISD::SETUNE: return X86::COND_INVALID;
5684 }
5685}
5686
5687/// Is there a floating point cmov for the specific X86 condition code?
5688 /// The current x86 ISA includes the following FP cmov instructions:
5689 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
5690static bool hasFPCMov(unsigned X86CC) {
5691 switch (X86CC) {
5692 default:
5693 return false;
5694 case X86::COND_B:
5695 case X86::COND_BE:
5696 case X86::COND_E:
5697 case X86::COND_P:
5698 case X86::COND_A:
5699 case X86::COND_AE:
5700 case X86::COND_NE:
5701 case X86::COND_NP:
5702 return true;
5703 }
5704}
5705
5706static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT) {
5707 return Subtarget.hasVLX() || Subtarget.canExtendTo512DQ() ||
5708 VT.is512BitVector();
5709}
5710
5711bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
5712 const CallInst &I,
5713 MachineFunction &MF,
5714 unsigned Intrinsic) const {
5715 Info.flags = MachineMemOperand::MONone;
5716 Info.offset = 0;
5717
5718 const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
5719 if (!IntrData) {
5720 switch (Intrinsic) {
5721 case Intrinsic::x86_aesenc128kl:
5722 case Intrinsic::x86_aesdec128kl:
5723 Info.opc = ISD::INTRINSIC_W_CHAIN;
5724 Info.ptrVal = I.getArgOperand(1);
5725 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
5726 Info.align = Align(1);
5727 Info.flags |= MachineMemOperand::MOLoad;
5728 return true;
5729 case Intrinsic::x86_aesenc256kl:
5730 case Intrinsic::x86_aesdec256kl:
5731 Info.opc = ISD::INTRINSIC_W_CHAIN;
5732 Info.ptrVal = I.getArgOperand(1);
5733 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
5734 Info.align = Align(1);
5735 Info.flags |= MachineMemOperand::MOLoad;
5736 return true;
5737 case Intrinsic::x86_aesencwide128kl:
5738 case Intrinsic::x86_aesdecwide128kl:
5739 Info.opc = ISD::INTRINSIC_W_CHAIN;
5740 Info.ptrVal = I.getArgOperand(0);
5741 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
5742 Info.align = Align(1);
5743 Info.flags |= MachineMemOperand::MOLoad;
5744 return true;
5745 case Intrinsic::x86_aesencwide256kl:
5746 case Intrinsic::x86_aesdecwide256kl:
5747 Info.opc = ISD::INTRINSIC_W_CHAIN;
5748 Info.ptrVal = I.getArgOperand(0);
5749 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
5750 Info.align = Align(1);
5751 Info.flags |= MachineMemOperand::MOLoad;
5752 return true;
5753 case Intrinsic::x86_cmpccxadd32:
5754 case Intrinsic::x86_cmpccxadd64:
5755 case Intrinsic::x86_atomic_bts:
5756 case Intrinsic::x86_atomic_btc:
5757 case Intrinsic::x86_atomic_btr: {
5758 Info.opc = ISD::INTRINSIC_W_CHAIN;
5759 Info.ptrVal = I.getArgOperand(0);
5760 unsigned Size = I.getType()->getScalarSizeInBits();
5761 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
5762 Info.align = Align(Size);
5763 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
5764 MachineMemOperand::MOVolatile;
5765 return true;
5766 }
5767 case Intrinsic::x86_atomic_bts_rm:
5768 case Intrinsic::x86_atomic_btc_rm:
5769 case Intrinsic::x86_atomic_btr_rm: {
5770 Info.opc = ISD::INTRINSIC_W_CHAIN;
5771 Info.ptrVal = I.getArgOperand(0);
5772 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
5773 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
5774 Info.align = Align(Size);
5775 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
5776 MachineMemOperand::MOVolatile;
5777 return true;
5778 }
5779 case Intrinsic::x86_aadd32:
5780 case Intrinsic::x86_aadd64:
5781 case Intrinsic::x86_aand32:
5782 case Intrinsic::x86_aand64:
5783 case Intrinsic::x86_aor32:
5784 case Intrinsic::x86_aor64:
5785 case Intrinsic::x86_axor32:
5786 case Intrinsic::x86_axor64:
5787 case Intrinsic::x86_atomic_add_cc:
5788 case Intrinsic::x86_atomic_sub_cc:
5789 case Intrinsic::x86_atomic_or_cc:
5790 case Intrinsic::x86_atomic_and_cc:
5791 case Intrinsic::x86_atomic_xor_cc: {
5792 Info.opc = ISD::INTRINSIC_W_CHAIN;
5793 Info.ptrVal = I.getArgOperand(0);
5794 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
5795 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
5796 Info.align = Align(Size);
5797 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
5798 MachineMemOperand::MOVolatile;
5799 return true;
5800 }
5801 }
5802 return false;
5803 }
5804
5805 switch (IntrData->Type) {
5806 case TRUNCATE_TO_MEM_VI8:
5807 case TRUNCATE_TO_MEM_VI16:
5808 case TRUNCATE_TO_MEM_VI32: {
5809 Info.opc = ISD::INTRINSIC_VOID;
5810 Info.ptrVal = I.getArgOperand(0);
5811 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
5812 MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
5813 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
5814 ScalarVT = MVT::i8;
5815 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
5816 ScalarVT = MVT::i16;
5817 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
5818 ScalarVT = MVT::i32;
5819
5820 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
5821 Info.align = Align(1);
5822 Info.flags |= MachineMemOperand::MOStore;
5823 break;
5824 }
5825 case GATHER:
5826 case GATHER_AVX2: {
5827 Info.opc = ISD::INTRINSIC_W_CHAIN;
5828 Info.ptrVal = nullptr;
5829 MVT DataVT = MVT::getVT(I.getType());
5830 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
5831 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
5832 IndexVT.getVectorNumElements());
5833 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
5834 Info.align = Align(1);
5835 Info.flags |= MachineMemOperand::MOLoad;
5836 break;
5837 }
5838 case SCATTER: {
5839 Info.opc = ISD::INTRINSIC_VOID;
5840 Info.ptrVal = nullptr;
5841 MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
5842 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
5843 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
5844 IndexVT.getVectorNumElements());
5845 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
5846 Info.align = Align(1);
5847 Info.flags |= MachineMemOperand::MOStore;
5848 break;
5849 }
5850 default:
5851 return false;
5852 }
5853
5854 return true;
5855}
5856
5857/// Returns true if the target can instruction select the
5858/// specified FP immediate natively. If false, the legalizer will
5859/// materialize the FP immediate as a load from a constant pool.
5860bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
5861 bool ForCodeSize) const {
5862 for (const APFloat &FPImm : LegalFPImmediates)
5863 if (Imm.bitwiseIsEqual(FPImm))
5864 return true;
5865 return false;
5866}
5867
5868bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
5869 ISD::LoadExtType ExtTy,
5870 EVT NewVT) const {
5871 assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");
5872
5873 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
5874 // relocations target a movq or addq instruction: don't let the load shrink.
5875 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
5876 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
5877 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
5878 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
5879
5880 // If this is (1) an AVX vector load with (2) multiple uses and (3) all of
5881 // those uses are extracted directly into a store, then the extract + store
5882 // can be store-folded. Therefore, it's probably not worth splitting the load.
5883 EVT VT = Load->getValueType(0);
5884 if ((VT.is256BitVector() || VT.is512BitVector()) && !Load->hasOneUse()) {
5885 for (auto UI = Load->use_begin(), UE = Load->use_end(); UI != UE; ++UI) {
5886 // Skip uses of the chain value. Result 0 of the node is the load value.
5887 if (UI.getUse().getResNo() != 0)
5888 continue;
5889
5890 // If this use is not an extract + store, it's probably worth splitting.
5891 if (UI->getOpcode() != ISD::EXTRACT_SUBVECTOR || !UI->hasOneUse() ||
5892 UI->use_begin()->getOpcode() != ISD::STORE)
5893 return true;
5894 }
5895 // All non-chain uses are extract + store.
5896 return false;
5897 }
5898
5899 return true;
5900}
5901
5902/// Returns true if it is beneficial to convert a load of a constant
5903/// to just the constant itself.
5904bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
5905 Type *Ty) const {
5906 assert(Ty->isIntegerTy());
5907
5908 unsigned BitSize = Ty->getPrimitiveSizeInBits();
5909 if (BitSize == 0 || BitSize > 64)
5910 return false;
5911 return true;
5912}
5913
5914bool X86TargetLowering::reduceSelectOfFPConstantLoads(EVT CmpOpVT) const {
5915 // If we are using XMM registers in the ABI and the condition of the select is
5916 // a floating-point compare and we have blendv or conditional move, then it is
5917 // cheaper to select instead of doing a cross-register move and creating a
5918 // load that depends on the compare result.
5919 bool IsFPSetCC = CmpOpVT.isFloatingPoint() && CmpOpVT != MVT::f128;
5920 return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX();
5921}
5922
5923bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const {
5924 // TODO: It might be a win to ease or lift this restriction, but the generic
5925 // folds in DAGCombiner conflict with vector folds for an AVX512 target.
5926 if (VT.isVector() && Subtarget.hasAVX512())
5927 return false;
5928
5929 return true;
5930}
5931
5932bool X86TargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
5933 SDValue C) const {
5934 // TODO: We handle scalars using custom code, but generic combining could make
5935 // that unnecessary.
5936 APInt MulC;
5937 if (!ISD::isConstantSplatVector(C.getNode(), MulC))
5938 return false;
5939
5940 // Find the type this will be legalized to. Otherwise we might prematurely
5941 // convert this to shl+add/sub and then still have to type legalize those ops.
5942 // Another choice would be to defer the decision for illegal types until
5943 // after type legalization. But constant splat vectors of i64 can't make it
5944 // through type legalization on 32-bit targets so we would need to special
5945 // case vXi64.
5946 while (getTypeAction(Context, VT) != TypeLegal)
5947 VT = getTypeToTransformTo(Context, VT);
5948
5949 // If vector multiply is legal, assume that's faster than shl + add/sub.
5950 // Multiply is a complex op with higher latency and lower throughput in
5951 // most implementations; sub-vXi32 vector multiplies are always fast,
5952 // vXi32 must not have a SlowMULLD implementation, and anything larger (vXi64)
5953 // is always going to be slow.
5954 unsigned EltSizeInBits = VT.getScalarSizeInBits();
5955 if (isOperationLegal(ISD::MUL, VT) && EltSizeInBits <= 32 &&
5956 (EltSizeInBits != 32 || !Subtarget.isPMULLDSlow()))
5957 return false;
5958
5959 // shl+add, shl+sub, shl+add+neg
5960 return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
5961 (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
5962}
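For concreteness, the four power-of-two tests in the return expression correspond to decompositions like these (my examples, not from the source):

//   x * 9  == (x << 3) + x      // (MulC - 1) is a power of two: shl+add
//   x * 7  == (x << 3) - x      // (MulC + 1) is a power of two: shl+sub
//   x * -3 == x - (x << 2)      // (1 - MulC) is a power of two
//   x * -9 == -((x << 3) + x)   // -(MulC + 1) is a power of two: shl+add+neg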
5963
5964bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
5965 unsigned Index) const {
5966 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
5967 return false;
5968
5969 // Mask vectors support all subregister combinations and operations that
5970 // extract half of a vector.
5971 if (ResVT.getVectorElementType() == MVT::i1)
5972 return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
5973 (Index == ResVT.getVectorNumElements()));
5974
5975 return (Index % ResVT.getVectorNumElements()) == 0;
5976}
5977
5978bool X86TargetLowering::shouldScalarizeBinop(SDValue VecOp) const {
5979 unsigned Opc = VecOp.getOpcode();
5980
5981 // Assume target opcodes can't be scalarized.
5982 // TODO - do we have any exceptions?
5983 if (Opc >= ISD::BUILTIN_OP_END)
5984 return false;
5985
5986 // If the vector op is not supported, try to convert to scalar.
5987 EVT VecVT = VecOp.getValueType();
5988 if (!isOperationLegalOrCustomOrPromote(Opc, VecVT))
5989 return true;
5990
5991 // If the vector op is supported, but the scalar op is not, the transform may
5992 // not be worthwhile.
5993 EVT ScalarVT = VecVT.getScalarType();
5994 return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
5995}
5996
5997bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT,
5998 bool) const {
5999 // TODO: Allow vectors?
6000 if (VT.isVector())
6001 return false;
6002 return VT.isSimple() || !isOperationExpand(Opcode, VT);
6003}
6004
6005bool X86TargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
6006 // Speculate cttz only if we can directly use TZCNT or can promote to i32.
6007 return Subtarget.hasBMI() ||
6008 (!Ty->isVectorTy() && Ty->getScalarSizeInBits() < 32);
6009}
6010
6011bool X86TargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
6012 // Speculate ctlz only if we can directly use LZCNT.
6013 return Subtarget.hasLZCNT();
6014}
6015
6016bool X86TargetLowering::ShouldShrinkFPConstant(EVT VT) const {
6017 // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
6018 // expensive than a straight movsd. On the other hand, it's important to
6019 // shrink long double fp constant since fldt is very slow.
6020 return !Subtarget.hasSSE2() || VT == MVT::f80;
6021}
6022
6023bool X86TargetLowering::isScalarFPTypeInSSEReg(EVT VT) const {
6024 return (VT == MVT::f64 && Subtarget.hasSSE2()) ||
6025 (VT == MVT::f32 && Subtarget.hasSSE1()) || VT == MVT::f16;
6026}
6027
6028bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
6029 const SelectionDAG &DAG,
6030 const MachineMemOperand &MMO) const {
6031 if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
6032 BitcastVT.getVectorElementType() == MVT::i1)
6033 return false;
6034
6035 if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
6036 return false;
6037
6038 // If both types are legal vectors, it's always ok to convert them.
6039 if (LoadVT.isVector() && BitcastVT.isVector() &&
6040 isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
6041 return true;
6042
6043 return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
6044}
6045
6046bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
6047 const MachineFunction &MF) const {
6048 // Do not merge to float value size (128 bits) if no implicit
6049 // float attribute is set.
6050 bool NoFloat = MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
6051
6052 if (NoFloat) {
6053 unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
6054 return (MemVT.getSizeInBits() <= MaxIntSize);
6055 }
6056 // Make sure we don't merge greater than our preferred vector
6057 // width.
6058 if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
6059 return false;
6060
6061 return true;
6062}
6063
6064bool X86TargetLowering::isCtlzFast() const {
6065 return Subtarget.hasFastLZCNT();
6066}
6067
6068bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
6069 const Instruction &AndI) const {
6070 return true;
6071}
6072
6073bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
6074 EVT VT = Y.getValueType();
6075
6076 if (VT.isVector())
6077 return false;
6078
6079 if (!Subtarget.hasBMI())
6080 return false;
6081
6082 // There are only 32-bit and 64-bit forms for 'andn'.
6083 if (VT != MVT::i32 && VT != MVT::i64)
6084 return false;
6085
6086 return !isa<ConstantSDNode>(Y);
6087}
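A minimal sketch (mine, not from this file) of the shape hasAndNotCompare is after: a not-then-and feeding a compare against zero, which BMI's ANDN can do in one instruction while also setting flags.

#include <cstdint>

// Candidate for andn + jcc on BMI targets.
bool allMaskedBitsSet(uint32_t X, uint32_t Mask) {
  return (~X & Mask) == 0;
}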
6088
6089bool X86TargetLowering::hasAndNot(SDValue Y) const {
6090 EVT VT = Y.getValueType();
6091
6092 if (!VT.isVector())
6093 return hasAndNotCompare(Y);
6094
6095 // Vector.
6096
6097 if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
6098 return false;
6099
6100 if (VT == MVT::v4i32)
6101 return true;
6102
6103 return Subtarget.hasSSE2();
6104}
6105
6106bool X86TargetLowering::hasBitTest(SDValue X, SDValue Y) const {
6107 return X.getValueType().isScalarInteger(); // 'bt'
6108}
6109
6110bool X86TargetLowering::
6111 shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
6112 SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
6113 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
6114 SelectionDAG &DAG) const {
6115 // Does baseline recommend not to perform the fold by default?
6116 if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
6117 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
6118 return false;
6119 // For scalars this transform is always beneficial.
6120 if (X.getValueType().isScalarInteger())
6121 return true;
6122 // If all the shift amounts are identical, then transform is beneficial even
6123 // with rudimentary SSE2 shifts.
6124 if (DAG.isSplatValue(Y, /*AllowUndefs=*/true))
6125 return true;
6126 // If we have AVX2 with its powerful shift operations, then it's also good.
6127 if (Subtarget.hasAVX2())
6128 return true;
6129 // Pre-AVX2 vector codegen for this pattern is best for the variant with 'shl'.
6130 return NewShiftOpcode == ISD::SHL;
6131}
6132
6133bool X86TargetLowering::preferScalarizeSplat(SDNode *N) const {
6134 return N->getOpcode() != ISD::FP_EXTEND;
6135}
6136
6137bool X86TargetLowering::shouldFoldConstantShiftPairToMask(
6138 const SDNode *N, CombineLevel Level) const {
6139 assert(((N->getOpcode() == ISD::SHL &&
6140 N->getOperand(0).getOpcode() == ISD::SRL) ||
6141 (N->getOpcode() == ISD::SRL &&
6142 N->getOperand(0).getOpcode() == ISD::SHL)) &&
6143 "Expected shift-shift mask");
6144 // TODO: Should we always create i64 masks? Or only folded immediates?
6145 EVT VT = N->getValueType(0);
6146 if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
6147 (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
6148 // Only fold if the shift values are equal - so it folds to AND.
6149 // TODO - we should fold if either is a non-uniform vector but we don't do
6150 // the fold for non-splats yet.
6151 return N->getOperand(1) == N->getOperand(0).getOperand(1);
6152 }
6153 return TargetLoweringBase::shouldFoldConstantShiftPairToMask(N, Level);
6154}
6155
6156bool X86TargetLowering::shouldFoldMaskToVariableShiftPair(SDValue Y) const {
6157 EVT VT = Y.getValueType();
6158
6159 // For vectors, we don't have a preference, but we probably want a mask.
6160 if (VT.isVector())
6161 return false;
6162
6163 // 64-bit shifts on 32-bit targets produce really bad bloated code.
6164 if (VT == MVT::i64 && !Subtarget.is64Bit())
6165 return false;
6166
6167 return true;
6168}
6169
6170TargetLowering::ShiftLegalizationStrategy
6171X86TargetLowering::preferredShiftLegalizationStrategy(
6172 SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const {
6173 if (DAG.getMachineFunction().getFunction().hasMinSize() &&
6174 !Subtarget.isOSWindows())
6175 return ShiftLegalizationStrategy::LowerToLibcall;
6176 return TargetLowering::preferredShiftLegalizationStrategy(DAG, N,
6177 ExpansionFactor);
6178}
6179
6180bool X86TargetLowering::shouldSplatInsEltVarIndex(EVT VT) const {
6181 // Any legal vector type can be splatted more efficiently than
6182 // loading/spilling from memory.
6183 return isTypeLegal(VT);
6184}
6185
6186MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
6187 MVT VT = MVT::getIntegerVT(NumBits);
6188 if (isTypeLegal(VT))
6189 return VT;
6190
6191 // PMOVMSKB can handle this.
6192 if (NumBits == 128 && isTypeLegal(MVT::v16i8))
6193 return MVT::v16i8;
6194
6195 // VPMOVMSKB can handle this.
6196 if (NumBits == 256 && isTypeLegal(MVT::v32i8))
6197 return MVT::v32i8;
6198
6199 // TODO: Allow 64-bit type for 32-bit target.
6200 // TODO: 512-bit types should be allowed, but make sure that those
6201 // cases are handled in combineVectorSizedSetCCEquality().
6202
6203 return MVT::INVALID_SIMPLE_VALUE_TYPE;
6204}
6205
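As an illustration of why the 128-bit case reports a fast equality compare as v16i8: with SSE2, a whole 16-byte comparison collapses to PCMPEQB + PMOVMSKB and a single scalar test. The snippet below is a standalone intrinsics sketch (assuming an SSE2-capable host), not code from this file.

  #include <emmintrin.h>

  // Compare two 16-byte buffers for equality using v16i8 + PMOVMSKB.
  static bool equal16Bytes(const void *A, const void *B) {
    __m128i VA = _mm_loadu_si128(static_cast<const __m128i *>(A));
    __m128i VB = _mm_loadu_si128(static_cast<const __m128i *>(B));
    __m128i Eq = _mm_cmpeq_epi8(VA, VB);    // 0xFF in every byte lane that matches
    return _mm_movemask_epi8(Eq) == 0xFFFF; // all 16 lanes matched
  }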
6206/// Val is the undef sentinel value or equal to the specified value.
6207static bool isUndefOrEqual(int Val, int CmpVal) {
6208 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
6209}
6210
6211/// Return true if every element in Mask is the undef sentinel value or equal to
6212 /// the specified value.
6213static bool isUndefOrEqual(ArrayRef<int> Mask, int CmpVal) {
6214 return llvm::all_of(Mask, [CmpVal](int M) {
6215 return (M == SM_SentinelUndef) || (M == CmpVal);
6216 });
6217}
6218
6219/// Val is either the undef or zero sentinel value.
6220static bool isUndefOrZero(int Val) {
6221 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
6222}
6223
6224 /// Return true if every element in Mask, beginning from position Pos and ending
6225 /// before Pos+Size, is the undef sentinel value.
6226static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
6227 return llvm::all_of(Mask.slice(Pos, Size),
6228 [](int M) { return M == SM_SentinelUndef; });
6229}
6230
6231/// Return true if the mask creates a vector whose lower half is undefined.
6232static bool isUndefLowerHalf(ArrayRef<int> Mask) {
6233 unsigned NumElts = Mask.size();
6234 return isUndefInRange(Mask, 0, NumElts / 2);
6235}
6236
6237/// Return true if the mask creates a vector whose upper half is undefined.
6238static bool isUndefUpperHalf(ArrayRef<int> Mask) {
6239 unsigned NumElts = Mask.size();
6240 return isUndefInRange(Mask, NumElts / 2, NumElts / 2);
6241}
6242
6243 /// Return true if Val falls within the specified half-open range [Low, Hi).
6244static bool isInRange(int Val, int Low, int Hi) {
6245 return (Val >= Low && Val < Hi);
6246}
6247
6248/// Return true if the value of any element in Mask falls within the specified
6249 /// half-open range [Low, Hi).
6250static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
6251 return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); });
6252}
6253
6254/// Return true if the value of any element in Mask is the zero sentinel value.
6255static bool isAnyZero(ArrayRef<int> Mask) {
6256 return llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
6257}
6258
6259/// Return true if the value of any element in Mask is the zero or undef
6260/// sentinel values.
6261static bool isAnyZeroOrUndef(ArrayRef<int> Mask) {
6262 return llvm::any_of(Mask, [](int M) {
6263 return M == SM_SentinelZero || M == SM_SentinelUndef;
6264 });
6265}
6266
6267/// Return true if Val is undef or if its value falls within the
6268 /// specified half-open range [Low, Hi).
6269static bool isUndefOrInRange(int Val, int Low, int Hi) {
6270 return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
6271}
6272
6273/// Return true if every element in Mask is undef or if its value
6274 /// falls within the specified half-open range [Low, Hi).
6275static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
6276 return llvm::all_of(
6277 Mask, [Low, Hi](int M) { return isUndefOrInRange(M, Low, Hi); });
6278}
6279
6280/// Return true if Val is undef, zero or if its value falls within the
6281 /// specified half-open range [Low, Hi).
6282static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
6283 return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
6284}
6285
6286/// Return true if every element in Mask is undef, zero or if its value
6287 /// falls within the specified half-open range [Low, Hi).
6288static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
6289 return llvm::all_of(
6290 Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); });
6291}
6292
6293/// Return true if every element in Mask, beginning
6294 /// from position Pos and ending before Pos + Size, falls within the specified
6295/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
6296static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
6297 unsigned Size, int Low, int Step = 1) {
6298 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
6299 if (!isUndefOrEqual(Mask[i], Low))
6300 return false;
6301 return true;
6302}
6303
6304/// Return true if every element in Mask, beginning
6305 /// from position Pos and ending before Pos+Size, falls within the specified
6306 /// sequential range [Low, Low+Size), or is undef or is zero.
6307static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
6308 unsigned Size, int Low,
6309 int Step = 1) {
6310 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
6311 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
6312 return false;
6313 return true;
6314}
6315
6316/// Return true if every element in Mask, beginning
6317 /// from position Pos and ending before Pos+Size, is undef or is zero.
6318static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
6319 unsigned Size) {
6320 return llvm::all_of(Mask.slice(Pos, Size), isUndefOrZero);
6321}
6322
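A worked example of these sentinel predicates, as a standalone sketch that assumes the usual X86 shuffle conventions of -1 for the undef sentinel and -2 for the zero sentinel; the helper names below are local to the sketch, not the functions above.

  #include <cassert>
  #include <vector>

  constexpr int SentinelUndef = -1; // stands in for SM_SentinelUndef
  constexpr int SentinelZero = -2;  // stands in for SM_SentinelZero

  static bool undefOrEqual(int V, int Cmp) { return V == SentinelUndef || V == Cmp; }
  static bool sequentialOrUndef(const std::vector<int> &M, unsigned Pos,
                                unsigned Size, int Low, int Step = 1) {
    for (unsigned I = Pos, E = Pos + Size; I != E; ++I, Low += Step)
      if (!undefOrEqual(M[I], Low))
        return false;
    return true;
  }

  int main() {
    std::vector<int> Mask = {0, SentinelUndef, 2, SentinelZero};
    assert(undefOrEqual(Mask[1], 5));          // undef matches anything
    assert(sequentialOrUndef(Mask, 0, 3, 0));  // {0, undef, 2} matches 0, 1, 2
    assert(!sequentialOrUndef(Mask, 0, 4, 0)); // the zero sentinel breaks the run
    return 0;
  }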
6323/// Helper function to test whether a shuffle mask could be
6324/// simplified by widening the elements being shuffled.
6325///
6326/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
6327/// leaves it in an unspecified state.
6328///
6329/// NOTE: This must handle normal vector shuffle masks and *target* vector
6330/// shuffle masks. The latter have the special property of a '-2' representing
6331/// a zero-ed lane of a vector.
6332static bool canWidenShuffleElements(ArrayRef<int> Mask,
6333 SmallVectorImpl<int> &WidenedMask) {
6334 WidenedMask.assign(Mask.size() / 2, 0);
6335 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
6336 int M0 = Mask[i];
6337 int M1 = Mask[i + 1];
6338
6339 // If both elements are undef, it's trivial.
6340 if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
6341 WidenedMask[i / 2] = SM_SentinelUndef;
6342 continue;
6343 }
6344
6345 // Check for an undef mask and a mask value properly aligned to fit with
6346 // a pair of values. If we find such a case, use the non-undef mask's value.
6347 if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
6348 WidenedMask[i / 2] = M1 / 2;
6349 continue;
6350 }
6351 if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
6352 WidenedMask[i / 2] = M0 / 2;
6353 continue;
6354 }
6355
6356 // When zeroing, we need to spread the zeroing across both lanes to widen.
6357 if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
6358 if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
6359 (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
6360 WidenedMask[i / 2] = SM_SentinelZero;
6361 continue;
6362 }
6363 return false;
6364 }
6365
6366 // Finally check if the two mask values are adjacent and aligned with
6367 // a pair.
6368 if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
6369 WidenedMask[i / 2] = M0 / 2;
6370 continue;
6371 }
6372
6373 // Otherwise we can't safely widen the elements used in this shuffle.
6374 return false;
6375 }
6376 assert(WidenedMask.size() == Mask.size() / 2 &&
6377 "Incorrect size of mask after widening the elements!");
6378
6379 return true;
6380}
6381
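A standalone sketch of the per-pair rule that canWidenShuffleElements applies, again assuming -1 = undef sentinel and -2 = zero sentinel; widenPair is a name invented for this sketch and returns the wide mask element, or nothing if the pair cannot be widened.

  #include <cassert>
  #include <optional>

  static std::optional<int> widenPair(int M0, int M1) {
    if (M0 == -1 && M1 == -1) return -1;                      // both undef
    if (M0 == -1 && M1 >= 0 && (M1 % 2) == 1) return M1 / 2;  // undef + odd half
    if (M1 == -1 && M0 >= 0 && (M0 % 2) == 0) return M0 / 2;  // even half + undef
    if (M0 == -2 || M1 == -2) {                               // zeroing must cover both halves
      if ((M0 == -2 || M0 == -1) && (M1 == -2 || M1 == -1)) return -2;
      return std::nullopt;
    }
    if (M0 >= 0 && (M0 % 2) == 0 && M0 + 1 == M1) return M0 / 2; // adjacent, aligned pair
    return std::nullopt;
  }

  int main() {
    assert(widenPair(0, 1) == 0);         // {0,1} widens to element 0
    assert(widenPair(6, 7) == 3);
    assert(widenPair(-1, 5) == 2);        // undef pairs with the odd half of element 2
    assert(!widenPair(1, 2).has_value()); // misaligned pair cannot be widened
    return 0;
  }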
6382static bool canWidenShuffleElements(ArrayRef<int> Mask,
6383 const APInt &Zeroable,
6384 bool V2IsZero,
6385 SmallVectorImpl<int> &WidenedMask) {
6386 // Create an alternative mask with info about zeroable elements.
6387 // Here we do not set undef elements as zeroable.
6388 SmallVector<int, 64> ZeroableMask(Mask);
6389 if (V2IsZero) {
6390 assert(!Zeroable.isZero() && "V2's non-undef elements are used?!");
6391 for (int i = 0, Size = Mask.size(); i != Size; ++i)
6392 if (Mask[i] != SM_SentinelUndef && Zeroable[i])
6393 ZeroableMask[i] = SM_SentinelZero;
6394 }
6395 return canWidenShuffleElements(ZeroableMask, WidenedMask);
6396}
6397
6398static bool canWidenShuffleElements(ArrayRef<int> Mask) {
6399 SmallVector<int, 32> WidenedMask;
6400 return canWidenShuffleElements(Mask, WidenedMask);
6401}
6402
6403// Attempt to narrow/widen shuffle mask until it matches the target number of
6404// elements.
6405static bool scaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts,
6406 SmallVectorImpl<int> &ScaledMask) {
6407 unsigned NumSrcElts = Mask.size();
6408 assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&
6409 "Illegal shuffle scale factor");
6410
6411 // Narrowing is guaranteed to work.
6412 if (NumDstElts >= NumSrcElts) {
6413 int Scale = NumDstElts / NumSrcElts;
6414 llvm::narrowShuffleMaskElts(Scale, Mask, ScaledMask);
6415 return true;
6416 }
6417
6418 // We have to repeat the widening until we reach the target size, but we can
6419 // split out the first widening as it sets up ScaledMask for us.
6420 if (canWidenShuffleElements(Mask, ScaledMask)) {
6421 while (ScaledMask.size() > NumDstElts) {
6422 SmallVector<int, 16> WidenedMask;
6423 if (!canWidenShuffleElements(ScaledMask, WidenedMask))
6424 return false;
6425 ScaledMask = std::move(WidenedMask);
6426 }
6427 return true;
6428 }
6429
6430 return false;
6431}
6432
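For the narrowing direction that scaleShuffleElements delegates to llvm::narrowShuffleMaskElts, the sketch below shows the intended expansion on plain integer masks; it only models the undef sentinel (-1) and is illustrative, not the library routine itself.

  #include <cassert>
  #include <vector>

  // Every wide mask element M expands to Scale consecutive narrow elements
  // starting at M * Scale; undef stays undef in every expanded slot.
  static std::vector<int> narrowMask(unsigned Scale, const std::vector<int> &Mask) {
    std::vector<int> Out;
    for (int M : Mask)
      for (unsigned J = 0; J != Scale; ++J)
        Out.push_back(M < 0 ? -1 : int(M * Scale + J));
    return Out;
  }

  int main() {
    // A v2 mask {1, 0} viewed as a v4 mask becomes {2, 3, 0, 1}.
    assert(narrowMask(2, {1, 0}) == (std::vector<int>{2, 3, 0, 1}));
    assert(narrowMask(2, {-1, 1}) == (std::vector<int>{-1, -1, 2, 3}));
    return 0;
  }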
6433/// Returns true if Elt is a constant zero or a floating point constant +0.0.
6434bool X86::isZeroNode(SDValue Elt) {
6435 return isNullConstant(Elt) || isNullFPConstant(Elt);
6436}
6437
6438// Build a vector of constants.
6439// Use an UNDEF node if MaskElt == -1.
6440 // Split 64-bit constants in 32-bit mode.
6441static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
6442 const SDLoc &dl, bool IsMask = false) {
6443
6444 SmallVector<SDValue, 32> Ops;
6445 bool Split = false;
6446
6447 MVT ConstVecVT = VT;
6448 unsigned NumElts = VT.getVectorNumElements();
6449 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
6450 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
6451 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
6452 Split = true;
6453 }
6454
6455 MVT EltVT = ConstVecVT.getVectorElementType();
6456 for (unsigned i = 0; i < NumElts; ++i) {
6457 bool IsUndef = Values[i] < 0 && IsMask;
6458 SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
6459 DAG.getConstant(Values[i], dl, EltVT);
6460 Ops.push_back(OpNode);
6461 if (Split)
6462 Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
6463 DAG.getConstant(0, dl, EltVT));
6464 }
6465 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
6466 if (Split)
6467 ConstsNode = DAG.getBitcast(VT, ConstsNode);
6468 return ConstsNode;
6469}
6470
6471static SDValue getConstVector(ArrayRef<APInt> Bits, const APInt &Undefs,
6472 MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
6473 assert(Bits.size() == Undefs.getBitWidth() &&
6474 "Unequal constant and undef arrays");
6475 SmallVector<SDValue, 32> Ops;
6476 bool Split = false;
6477
6478 MVT ConstVecVT = VT;
6479 unsigned NumElts = VT.getVectorNumElements();
6480 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
6481 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
6482 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
6483 Split = true;
6484 }
6485
6486 MVT EltVT = ConstVecVT.getVectorElementType();
6487 for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
6488 if (Undefs[i]) {
6489 Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
6490 continue;
6491 }
6492 const APInt &V = Bits[i];
6493 assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
6494 if (Split) {
6495 Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
6496 Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
6497 } else if (EltVT == MVT::f32) {
6498 APFloat FV(APFloat::IEEEsingle(), V);
6499 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
6500 } else if (EltVT == MVT::f64) {
6501 APFloat FV(APFloat::IEEEdouble(), V);
6502 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
6503 } else {
6504 Ops.push_back(DAG.getConstant(V, dl, EltVT));
6505 }
6506 }
6507
6508 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
6509 return DAG.getBitcast(VT, ConstsNode);
6510}
6511
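The 32-bit split used by both getConstVector overloads relies on the fact that, on a little-endian target, storing each 64-bit constant as its low then high 32-bit half and reinterpreting the buffer as 64-bit lanes reproduces the original values. A standalone check of that assumption (little-endian host assumed):

  #include <cassert>
  #include <cstdint>
  #include <cstring>

  int main() {
    uint64_t Wide[2] = {0x1122334455667788ULL, 0x99aabbccddeeff00ULL};
    uint32_t Narrow[4];
    for (int I = 0; I != 2; ++I) {
      Narrow[2 * I + 0] = uint32_t(Wide[I]);        // low half first
      Narrow[2 * I + 1] = uint32_t(Wide[I] >> 32);  // then the high half
    }
    uint64_t Roundtrip[2];
    std::memcpy(Roundtrip, Narrow, sizeof(Roundtrip)); // the "bitcast"
    assert(Roundtrip[0] == Wide[0] && Roundtrip[1] == Wide[1]);
    return 0;
  }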
6512static SDValue getConstVector(ArrayRef<APInt> Bits, MVT VT,
6513 SelectionDAG &DAG, const SDLoc &dl) {
6514 APInt Undefs = APInt::getZero(Bits.size());
6515 return getConstVector(Bits, Undefs, VT, DAG, dl);
6516}
6517
6518/// Returns a vector of specified type with all zero elements.
6519static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
6520 SelectionDAG &DAG, const SDLoc &dl) {
6521 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
6522 VT.getVectorElementType() == MVT::i1) &&
6523 "Unexpected vector type");
6524
6525 // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
6526 // type. This ensures they get CSE'd. But if the integer type is not
6527 // available, use a floating-point +0.0 instead.
6528 SDValue Vec;
6529 if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
6530 Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
6531 } else if (VT.isFloatingPoint()) {
6532 Vec = DAG.getConstantFP(+0.0, dl, VT);
6533 } else if (VT.getVectorElementType() == MVT::i1) {
6534 assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
6535 "Unexpected vector type");
6536 Vec = DAG.getConstant(0, dl, VT);
6537 } else {
6538 unsigned Num32BitElts = VT.getSizeInBits() / 32;
6539 Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
6540 }
6541 return DAG.getBitcast(VT, Vec);
6542}
6543
6544 // Helper to determine if the ops are all subvectors extracted from a
6545// single source. If we allow commute they don't have to be in order (Lo/Hi).
6546static SDValue getSplitVectorSrc(SDValue LHS, SDValue RHS, bool AllowCommute) {
6547 if (LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
6548 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
6549 LHS.getValueType() != RHS.getValueType() ||
6550 LHS.getOperand(0) != RHS.getOperand(0))
6551 return SDValue();
6552
6553 SDValue Src = LHS.getOperand(0);
6554 if (Src.getValueSizeInBits() != (LHS.getValueSizeInBits() * 2))
6555 return SDValue();
6556
6557 unsigned NumElts = LHS.getValueType().getVectorNumElements();
6558 if ((LHS.getConstantOperandAPInt(1) == 0 &&
6559 RHS.getConstantOperandAPInt(1) == NumElts) ||
6560 (AllowCommute && RHS.getConstantOperandAPInt(1) == 0 &&
6561 LHS.getConstantOperandAPInt(1) == NumElts))
6562 return Src;
6563
6564 return SDValue();
6565}
6566
6567static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
6568 const SDLoc &dl, unsigned vectorWidth) {
6569 EVT VT = Vec.getValueType();
6570 EVT ElVT = VT.getVectorElementType();
6571 unsigned Factor = VT.getSizeInBits() / vectorWidth;
6572 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
6573 VT.getVectorNumElements() / Factor);
6574
6575 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
6576 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
6577 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
6578
6579 // This is the index of the first element of the vectorWidth-bit chunk
6580 // we want. Since ElemsPerChunk is a power of 2 we just need to clear bits.
6581 IdxVal &= ~(ElemsPerChunk - 1);
6582
6583 // If the input is a buildvector just emit a smaller one.
6584 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
6585 return DAG.getBuildVector(ResultVT, dl,
6586 Vec->ops().slice(IdxVal, ElemsPerChunk));
6587
6588 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
6589 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
6590}
6591
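The `IdxVal &= ~(ElemsPerChunk - 1)` step above is the usual power-of-two alignment trick: clearing the low bits rounds the element index down to the start of its chunk. A tiny standalone check:

  #include <cassert>

  int main() {
    unsigned ElemsPerChunk = 4; // e.g. a v8i32 split into 128-bit v4i32 chunks
    assert((5u & ~(ElemsPerChunk - 1)) == 4u); // element 5 lives in the chunk starting at 4
    assert((3u & ~(ElemsPerChunk - 1)) == 0u);
    assert((4u & ~(ElemsPerChunk - 1)) == 4u);
    return 0;
  }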
6592/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
6593/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
6594/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
6595/// instructions or a simple subregister reference. Idx is an index in the
6596/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
6597/// lowering EXTRACT_VECTOR_ELT operations easier.
6598static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
6599 SelectionDAG &DAG, const SDLoc &dl) {
6600 assert((Vec.getValueType().is256BitVector() ||
6601 Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
6602 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
6603}
6604
6605/// Generate a DAG to grab 256-bits from a 512-bit vector.
6606static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
6607 SelectionDAG &DAG, const SDLoc &dl) {
6608 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
6609 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
6610}
6611
6612static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
6613 SelectionDAG &DAG, const SDLoc &dl,
6614 unsigned vectorWidth) {
6615 assert((vectorWidth == 128 || vectorWidth == 256) &&
6616 "Unsupported vector width");
6617 // Inserting UNDEF is a no-op; just return Result.
6618 if (Vec.isUndef())
6619 return Result;
6620 EVT VT = Vec.getValueType();
6621 EVT ElVT = VT.getVectorElementType();
6622 EVT ResultVT = Result.getValueType();
6623
6624 // Insert the relevant vectorWidth bits.
6625 unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
6626 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
6627
6628 // This is the index of the first element of the vectorWidth-bit chunk
6629 // we want. Since ElemsPerChunk is a power of 2 we just need to clear bits.
6630 IdxVal &= ~(ElemsPerChunk - 1);
6631
6632 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
6633 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
6634}
6635
6636/// Generate a DAG to put 128-bits into a vector > 128 bits. This
6637/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
6638/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
6639/// simple superregister reference. Idx is an index in the 128 bits
6640/// we want. It need not be aligned to a 128-bit boundary. That makes
6641/// lowering INSERT_VECTOR_ELT operations easier.
6642static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
6643 SelectionDAG &DAG, const SDLoc &dl) {
6644 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
6645 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
6646}
6647
6648/// Widen a vector to a larger size with the same scalar type, with the new
6649/// elements either zero or undef.
6650static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
6651 const X86Subtarget &Subtarget, SelectionDAG &DAG,
6652 const SDLoc &dl) {
6653 assert(Vec.getValueSizeInBits().getFixedValue() < VT.getFixedSizeInBits() &&
6654 Vec.getValueType().getScalarType() == VT.getScalarType() &&
6655 "Unsupported vector widening type");
6656 SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
6657 : DAG.getUNDEF(VT);
6658 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, Vec,
6659 DAG.getIntPtrConstant(0, dl));
6660}
6661
6662/// Widen a vector to a larger size with the same scalar type, with the new
6663/// elements either zero or undef.
6664static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
6665 const X86Subtarget &Subtarget, SelectionDAG &DAG,
6666 const SDLoc &dl, unsigned WideSizeInBits) {
6667 assert(Vec.getValueSizeInBits() < WideSizeInBits &&
6668 (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 &&
6669 "Unsupported vector widening type");
6670 unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits();
6671 MVT SVT = Vec.getSimpleValueType().getScalarType();
6672 MVT VT = MVT::getVectorVT(SVT, WideNumElts);
6673 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
6674}
6675
6676// Helper function to collect subvector ops that are concatenated together,
6677 // either by ISD::CONCAT_VECTORS or an ISD::INSERT_SUBVECTOR series.
6678// The subvectors in Ops are guaranteed to be the same type.
6679static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops,
6680 SelectionDAG &DAG) {
6681 assert(Ops.empty() && "Expected an empty ops vector");
6682
6683 if (N->getOpcode() == ISD::CONCAT_VECTORS) {
6684 Ops.append(N->op_begin(), N->op_end());
6685 return true;
6686 }
6687
6688 if (N->getOpcode() == ISD::INSERT_SUBVECTOR) {
6689 SDValue Src = N->getOperand(0);
6690 SDValue Sub = N->getOperand(1);
6691 const APInt &Idx = N->getConstantOperandAPInt(2);
6692 EVT VT = Src.getValueType();
6693 EVT SubVT = Sub.getValueType();
6694
6695 // TODO - Handle more general insert_subvector chains.
6696 if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2)) {
6697 // insert_subvector(undef, x, lo)
6698 if (Idx == 0 && Src.isUndef()) {
6699 Ops.push_back(Sub);
6700 Ops.push_back(DAG.getUNDEF(SubVT));
6701 return true;
6702 }
6703 if (Idx == (VT.getVectorNumElements() / 2)) {
6704 // insert_subvector(insert_subvector(undef, x, lo), y, hi)
6705 if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
6706 Src.getOperand(1).getValueType() == SubVT &&
6707 isNullConstant(Src.getOperand(2))) {
6708 Ops.push_back(Src.getOperand(1));
6709 Ops.push_back(Sub);
6710 return true;
6711 }
6712 // insert_subvector(x, extract_subvector(x, lo), hi)
6713 if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
6714 Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) {
6715 Ops.append(2, Sub);
6716 return true;
6717 }
6718 // insert_subvector(undef, x, hi)
6719 if (Src.isUndef()) {
6720 Ops.push_back(DAG.getUNDEF(SubVT));
6721 Ops.push_back(Sub);
6722 return true;
6723 }
6724 }
6725 }
6726 }
6727
6728 return false;
6729}
6730
6731static std::pair<SDValue, SDValue> splitVector(SDValue Op, SelectionDAG &DAG,
6732 const SDLoc &dl) {
6733 EVT VT = Op.getValueType();
6734 unsigned NumElems = VT.getVectorNumElements();
6735 unsigned SizeInBits = VT.getSizeInBits();
6736 assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 &&
6737 "Can't split odd sized vector");
6738
6739 // If this is a splat value (with no-undefs) then use the lower subvector,
6740 // which should be a free extraction.
6741 SDValue Lo = extractSubVector(Op, 0, DAG, dl, SizeInBits / 2);
6742 if (DAG.isSplatValue(Op, /*AllowUndefs*/ false))
6743 return std::make_pair(Lo, Lo);
6744
6745 SDValue Hi = extractSubVector(Op, NumElems / 2, DAG, dl, SizeInBits / 2);
6746 return std::make_pair(Lo, Hi);
6747}
6748
6749/// Break an operation into 2 half sized ops and then concatenate the results.
6750static SDValue splitVectorOp(SDValue Op, SelectionDAG &DAG) {
6751 unsigned NumOps = Op.getNumOperands();
6752 EVT VT = Op.getValueType();
6753 SDLoc dl(Op);
6754
6755 // Extract the LHS Lo/Hi vectors
6756 SmallVector<SDValue> LoOps(NumOps, SDValue());
6757 SmallVector<SDValue> HiOps(NumOps, SDValue());
6758 for (unsigned I = 0; I != NumOps; ++I) {
6759 SDValue SrcOp = Op.getOperand(I);
6760 if (!SrcOp.getValueType().isVector()) {
6761 LoOps[I] = HiOps[I] = SrcOp;
6762 continue;
6763 }
6764 std::tie(LoOps[I], HiOps[I]) = splitVector(SrcOp, DAG, dl);
6765 }
6766
6767 EVT LoVT, HiVT;
6768 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
6769 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
6770 DAG.getNode(Op.getOpcode(), dl, LoVT, LoOps),
6771 DAG.getNode(Op.getOpcode(), dl, HiVT, HiOps));
6772}
6773
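splitVectorOp relies on the fact that, for element-wise operations, applying the op to the low and high halves separately and concatenating the results equals applying it to the whole vector. A standalone sketch with plain integer vectors (addVec and Half are names local to the sketch):

  #include <cassert>
  #include <vector>

  static std::vector<int> addVec(const std::vector<int> &A, const std::vector<int> &B) {
    std::vector<int> R(A.size());
    for (size_t I = 0; I != A.size(); ++I)
      R[I] = A[I] + B[I];
    return R;
  }

  int main() {
    std::vector<int> A = {1, 2, 3, 4}, B = {10, 20, 30, 40};
    std::vector<int> Whole = addVec(A, B);

    auto Half = [](const std::vector<int> &V, bool Hi) {
      return std::vector<int>(V.begin() + (Hi ? V.size() / 2 : 0),
                              Hi ? V.end() : V.begin() + V.size() / 2);
    };
    std::vector<int> Lo = addVec(Half(A, false), Half(B, false));
    std::vector<int> Hi = addVec(Half(A, true), Half(B, true));
    Lo.insert(Lo.end(), Hi.begin(), Hi.end()); // "concat_vectors"
    assert(Lo == Whole);
    return 0;
  }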
6774 /// Break a unary integer operation into 2 half sized ops and then
6775/// concatenate the result back.
6776static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
6777 // Make sure we only try to split 256/512-bit types to avoid creating
6778 // narrow vectors.
6779 EVT VT = Op.getValueType();
6780 (void)VT;
6781 assert((Op.getOperand(0).getValueType().is256BitVector() ||
6782 Op.getOperand(0).getValueType().is512BitVector()) &&
6783 (VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
6784 assert(Op.getOperand(0).getValueType().getVectorNumElements() ==
6785 VT.getVectorNumElements() &&
6786 "Unexpected VTs!");
6787 return splitVectorOp(Op, DAG);
6788}
6789
6790/// Break a binary integer operation into 2 half sized ops and then
6791/// concatenate the result back.
6792static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG) {
6793 // Assert that all the types match.
6794 EVT VT = Op.getValueType();
6795 (void)VT;
6796 assert(Op.getOperand(0).getValueType() == VT &&
6797 Op.getOperand(1).getValueType() == VT && "Unexpected VTs!");
6798 assert((VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
6799 return splitVectorOp(Op, DAG);
6800}
6801
6802// Helper for splitting operands of an operation to legal target size and
6803 // applying a function on each part.
6804// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
6805// 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
6806// deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
6807// The argument Builder is a function that will be applied on each split part:
6808// SDValue Builder(SelectionDAG&G, SDLoc, ArrayRef<SDValue>)
6809template <typename F>
6810SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
6811 const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
6812 F Builder, bool CheckBWI = true) {
6813 assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
6814 unsigned NumSubs = 1;
6815 if ((CheckBWI && Subtarget.useBWIRegs()) ||
6816 (!CheckBWI && Subtarget.useAVX512Regs())) {
6817 if (VT.getSizeInBits() > 512) {
6818 NumSubs = VT.getSizeInBits() / 512;
6819 assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
6820 }
6821 } else if (Subtarget.hasAVX2()) {
6822 if (VT.getSizeInBits() > 256) {
6823 NumSubs = VT.getSizeInBits() / 256;
6824 assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
6825 }
6826 } else {
6827 if (VT.getSizeInBits() > 128) {
6828 NumSubs = VT.getSizeInBits() / 128;
6829 assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");
6830 }
6831 }
6832
6833 if (NumSubs == 1)
6834 return Builder(DAG, DL, Ops);
6835
6836 SmallVector<SDValue, 4> Subs;
6837 for (unsigned i = 0; i != NumSubs; ++i) {
6838 SmallVector<SDValue, 2> SubOps;
6839 for (SDValue Op : Ops) {
6840 EVT OpVT = Op.getValueType();
6841 unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
6842 unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
6843 SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
6844 }
6845 Subs.push_back(Builder(DAG, DL, SubOps));
6846 }
6847 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
6848}
6849
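The shape of SplitOpsAndApply, modelled on plain vectors rather than the SelectionDAG API: the caller supplies a Builder callback that emits the operation for one legal-sized piece, and the helper slices the operands and concatenates the per-piece results. All names below are invented for this sketch.

  #include <cassert>
  #include <functional>
  #include <vector>

  using Vec = std::vector<int>;

  static Vec splitOpsAndApply(const Vec &A, const Vec &B, size_t PieceElts,
                              const std::function<Vec(const Vec &, const Vec &)> &Builder) {
    Vec Result;
    for (size_t I = 0; I < A.size(); I += PieceElts) {
      Vec SubA(A.begin() + I, A.begin() + I + PieceElts); // slice one piece of each operand
      Vec SubB(B.begin() + I, B.begin() + I + PieceElts);
      Vec Piece = Builder(SubA, SubB);                    // build the op on this piece
      Result.insert(Result.end(), Piece.begin(), Piece.end()); // concatenate the results
    }
    return Result;
  }

  int main() {
    auto MulBuilder = [](const Vec &X, const Vec &Y) {
      Vec R(X.size());
      for (size_t I = 0; I != X.size(); ++I)
        R[I] = X[I] * Y[I];
      return R;
    };
    Vec A = {1, 2, 3, 4, 5, 6, 7, 8}, B = {2, 2, 2, 2, 3, 3, 3, 3};
    assert(splitOpsAndApply(A, B, 4, MulBuilder) ==
           (Vec{2, 4, 6, 8, 15, 18, 21, 24}));
    return 0;
  }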
6850// Helper function that extends a non-512-bit vector op to 512-bits on non-VLX
6851// targets.
6852static SDValue getAVX512Node(unsigned Opcode, const SDLoc &DL, MVT VT,
6853 ArrayRef<SDValue> Ops, SelectionDAG &DAG,
6854 const X86Subtarget &Subtarget) {
6855 assert(Subtarget.hasAVX512() && "AVX512 target expected");
6856 MVT SVT = VT.getScalarType();
6857
6858 // If we have a 32/64 splatted constant, splat it to DstTy to
6859 // encourage a foldable broadcast'd operand.
6860 auto MakeBroadcastOp = [&](SDValue Op, MVT OpVT, MVT DstVT) {
6861 unsigned OpEltSizeInBits = OpVT.getScalarSizeInBits();
6862 // AVX512 broadcasts 32/64-bit operands.
6863 // TODO: Support float once getAVX512Node is used by fp-ops.
6864 if (!OpVT.isInteger() || OpEltSizeInBits < 32 ||
6865 !DAG.getTargetLoweringInfo().isTypeLegal(SVT))
6866 return SDValue();
6867 // If we're not widening, don't bother if we're not bitcasting.
6868 if (OpVT == DstVT && Op.getOpcode() != ISD::BITCAST)
6869 return SDValue();
6870 if (auto *BV = dyn_cast<BuildVectorSDNode>(peekThroughBitcasts(Op))) {
6871 APInt SplatValue, SplatUndef;
6872 unsigned SplatBitSize;
6873 bool HasAnyUndefs;
6874 if (BV->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
6875 HasAnyUndefs, OpEltSizeInBits) &&
6876 !HasAnyUndefs && SplatValue.getBitWidth() == OpEltSizeInBits)
6877 return DAG.getConstant(SplatValue, DL, DstVT);
6878 }
6879 return SDValue();
6880 };
6881
6882 bool Widen = !(Subtarget.hasVLX() || VT.is512BitVector());
6883
6884 MVT DstVT = VT;
6885 if (Widen)
6886 DstVT = MVT::getVectorVT(SVT, 512 / SVT.getSizeInBits());
6887
6888 // Canonicalize src operands.
6889 SmallVector<SDValue> SrcOps(Ops.begin(), Ops.end());
6890 for (SDValue &Op : SrcOps) {
6891 MVT OpVT = Op.getSimpleValueType();
6892 // Just pass through scalar operands.
6893 if (!OpVT.isVector())
6894 continue;
6895 assert(OpVT == VT && "Vector type mismatch");
6896
6897 if (SDValue BroadcastOp = MakeBroadcastOp(Op, OpVT, DstVT)) {
6898 Op = BroadcastOp;
6899 continue;
6900 }
6901
6902 // Just widen the subvector by inserting into an undef wide vector.
6903 if (Widen)
6904 Op = widenSubVector(Op, false, Subtarget, DAG, DL, 512);
6905 }
6906
6907 SDValue Res = DAG.getNode(Opcode, DL, DstVT, SrcOps);
6908
6909 // Perform the 512-bit op then extract the bottom subvector.
6910 if (Widen)
6911 Res = extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
6912 return Res;
6913}
6914
6915/// Insert i1-subvector to i1-vector.
6916static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
6917 const X86Subtarget &Subtarget) {
6918
6919 SDLoc dl(Op);
6920 SDValue Vec = Op.getOperand(0);
6921 SDValue SubVec = Op.getOperand(1);
6922 SDValue Idx = Op.getOperand(2);
6923 unsigned IdxVal = Op.getConstantOperandVal(2);
6924
6925 // Inserting undef is a nop. We can just return the original vector.
6926 if (SubVec.isUndef())
6927 return Vec;
6928
6929 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
6930 return Op;
6931
6932 MVT OpVT = Op.getSimpleValueType();
6933 unsigned NumElems = OpVT.getVectorNumElements();
6934 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
6935
6936 // Extend to natively supported kshift.
6937 MVT WideOpVT = OpVT;
6938 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
6939 WideOpVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
6940
6941 // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
6942 // if necessary.
6943 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
6944 // May need to promote to a legal type.
6945 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6946 DAG.getConstant(0, dl, WideOpVT),
6947 SubVec, Idx);
6948 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
6949 }
6950
6951 MVT SubVecVT = SubVec.getSimpleValueType();
6952 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
6953 assert(IdxVal + SubVecNumElems <= NumElems &&
6954 IdxVal % SubVecVT.getSizeInBits() == 0 &&
6955 "Unexpected index value in INSERT_SUBVECTOR");
6956
6957 SDValue Undef = DAG.getUNDEF(WideOpVT);
6958
6959 if (IdxVal == 0) {
6960 // Zero lower bits of the Vec
6961 SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8);
6962 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
6963 ZeroIdx);
6964 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
6965 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
6966 // Merge them together, SubVec should be zero extended.
6967 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6968 DAG.getConstant(0, dl, WideOpVT),
6969 SubVec, ZeroIdx);
6970 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
6971 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
6972 }
6973
6974 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6975 Undef, SubVec, ZeroIdx);
6976
6977 if (Vec.isUndef()) {
6978 assert(IdxVal != 0 && "Unexpected index");
6979 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6980 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
6981 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
6982 }
6983
6984 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
6985 assert(IdxVal != 0 && "Unexpected index");
6986 // If upper elements of Vec are known undef, then just shift into place.
6987 if (llvm::all_of(Vec->ops().slice(IdxVal + SubVecNumElems),
6988 [](SDValue V) { return V.isUndef(); })) {
6989 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6990 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
6991 } else {
6992 NumElems = WideOpVT.getVectorNumElements();
6993 unsigned ShiftLeft = NumElems - SubVecNumElems;
6994 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
6995 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6996 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
6997 if (ShiftRight != 0)
6998 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
6999 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
7000 }
7001 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
7002 }
7003
7004 // Simple case when we put subvector in the upper part
7005 if (IdxVal + SubVecNumElems == NumElems) {
7006 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
7007 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
7008 if (SubVecNumElems * 2 == NumElems) {
7009 // Special case, use legal zero extending insert_subvector. This allows
7010 // isel to optimize when bits are known zero.
7011 Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
7012 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
7013 DAG.getConstant(0, dl, WideOpVT),
7014 Vec, ZeroIdx);
7015 } else {
7016 // Otherwise use explicit shifts to zero the bits.
7017 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
7018 Undef, Vec, ZeroIdx);
7019 NumElems = WideOpVT.getVectorNumElements();
7020 SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
7021 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
7022 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
7023 }
7024 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
7025 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
7026 }
7027
7028 // Inserting into the middle is more complicated.
7029
7030 NumElems = WideOpVT.getVectorNumElements();
7031
7032 // Widen the vector if needed.
7033 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
7034
7035 unsigned ShiftLeft = NumElems - SubVecNumElems;
7036 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
7037
7038 // Do an optimization for the most frequently used types.
7039 if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
7040 APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems);
7041 Mask0.flipAllBits();
7042 SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems));
7043 SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0);
7044 Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0);
7045 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
7046 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
7047 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
7048 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
7049 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
7050
7051 // Reduce to original width if needed.
7052 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
7053 }
7054
7055 // Clear the upper bits of the subvector and move it to its insert position.
7056 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
7057 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
7058 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
7059 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
7060
7061 // Isolate the bits below the insertion point.
7062 unsigned LowShift = NumElems - IdxVal;
7063 SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec,
7064 DAG.getTargetConstant(LowShift, dl, MVT::i8));
7065 Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low,
7066 DAG.getTargetConstant(LowShift, dl, MVT::i8));
7067
7068 // Isolate the bits after the last inserted bit.
7069 unsigned HighShift = IdxVal + SubVecNumElems;
7070 SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
7071 DAG.getTargetConstant(HighShift, dl, MVT::i8));
7072 High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
7073 DAG.getTargetConstant(HighShift, dl, MVT::i8));
7074
7075 // Now OR all 3 pieces together.
7076 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
7077 SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec);
7078
7079 // Reduce to original width if needed.
7080 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
7081}
7082
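Since a k-register of N i1 elements behaves like an N-bit integer, the KSHIFTL/KSHIFTR/AND/OR sequences above amount to an ordinary bit-field insert: clear the target bit range, then OR in the shifted subvector. A standalone bit-level analogue (insertBits is a name local to this sketch):

  #include <cassert>
  #include <cstdint>

  static uint16_t insertBits(uint16_t Vec, uint16_t Sub, unsigned SubBits,
                             unsigned Idx) {
    uint16_t FieldMask = uint16_t(((1u << SubBits) - 1) << Idx);
    return uint16_t((Vec & ~FieldMask) | (uint16_t(Sub << Idx) & FieldMask));
  }

  int main() {
    // Insert the 4-element subvector 0b1010 into a v16i1-like value at index 4.
    uint16_t Vec = 0b1111'0000'1111'0000;
    uint16_t Sub = 0b1010;
    assert(insertBits(Vec, Sub, 4, 4) == 0b1111'0000'1010'0000);
    return 0;
  }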
7083static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG,
7084 const SDLoc &dl) {
7085 assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch");
7086 EVT SubVT = V1.getValueType();
7087 EVT SubSVT = SubVT.getScalarType();
7088 unsigned SubNumElts = SubVT.getVectorNumElements();
7089 unsigned SubVectorWidth = SubVT.getSizeInBits();
7090 EVT VT = EVT::getVectorVT(*DAG.getContext(), SubSVT, 2 * SubNumElts);
7091 SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, SubVectorWidth);
7092 return insertSubVector(V, V2, SubNumElts, DAG, dl, SubVectorWidth);
7093}
7094
7095/// Returns a vector of specified type with all bits set.
7096/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
7097/// Then bitcast to their original type, ensuring they get CSE'd.
7098static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
7099 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
7100 "Expected a 128/256/512-bit vector type");
7101
7102 APInt Ones = APInt::getAllOnes(32);
7103 unsigned NumElts = VT.getSizeInBits() / 32;
7104 SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
7105 return DAG.getBitcast(VT, Vec);
7106}
7107
7108static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT,
7109 SDValue In, SelectionDAG &DAG) {
7110 EVT InVT = In.getValueType();
7111 assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.");
7112 assert((ISD::ANY_EXTEND == Opcode || ISD::SIGN_EXTEND == Opcode ||
7113 ISD::ZERO_EXTEND == Opcode) &&
7114 "Unknown extension opcode");
7115
7116 // For 256-bit vectors, we only need the lower (128-bit) input half.
7117 // For 512-bit vectors, we only need the lower input half or quarter.
7118 if (InVT.getSizeInBits() > 128) {
7119 assert(VT.getSizeInBits() == InVT.getSizeInBits() &&
7120 "Expected VTs to be the same size!");
7121 unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
7122 In = extractSubVector(In, 0, DAG, DL,
7123 std::max(128U, (unsigned)VT.getSizeInBits() / Scale));
7124 InVT = In.getValueType();
7125 }
7126
7127 if (VT.getVectorNumElements() != InVT.getVectorNumElements())
7128 Opcode = DAG.getOpcode_EXTEND_VECTOR_INREG(Opcode);
7129
7130 return DAG.getNode(Opcode, DL, VT, In);
7131}
7132
7133// Match (xor X, -1) -> X.
7134// Match extract_subvector(xor X, -1) -> extract_subvector(X).
7135// Match concat_vectors(xor X, -1, xor Y, -1) -> concat_vectors(X, Y).
7136static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
7137 V = peekThroughBitcasts(V);
7138 if (V.getOpcode() == ISD::XOR &&
7139 (ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()) ||
7140 isAllOnesConstant(V.getOperand(1))))
7141 return V.getOperand(0);
7142 if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
7143 (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
7144 if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
7145 Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
7146 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), V.getValueType(),
7147 Not, V.getOperand(1));
7148 }
7149 }
7150 SmallVector<SDValue, 2> CatOps;
7151 if (collectConcatOps(V.getNode(), CatOps, DAG)) {
7152 for (SDValue &CatOp : CatOps) {
7153 SDValue NotCat = IsNOT(CatOp, DAG);
7154 if (!NotCat) return SDValue();
7155 CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
7156 }
7157 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), V.getValueType(), CatOps);
7158 }
7159 return SDValue();
7160}
7161
7162void llvm::createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask,
7163 bool Lo, bool Unary) {
7164 assert(VT.getScalarType().isSimple() && (VT.getSizeInBits() % 128) == 0 &&
7165 "Illegal vector type to unpack");
7166 assert(Mask.empty() && "Expected an empty shuffle mask vector");
7167 int NumElts = VT.getVectorNumElements();
7168 int NumEltsInLane = 128 / VT.getScalarSizeInBits();
7169 for (int i = 0; i < NumElts; ++i) {
7170 unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
7171 int Pos = (i % NumEltsInLane) / 2 + LaneStart;
7172 Pos += (Unary ? 0 : NumElts * (i % 2));
7173 Pos += (Lo ? 0 : NumEltsInLane / 2);
7174 Mask.push_back(Pos);
7175 }
7176}
7177
7178/// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
7179/// imposed by AVX and specific to the unary pattern. Example:
7180/// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
7181/// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
7182void llvm::createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
7183 bool Lo) {
7184 assert(Mask.empty() && "Expected an empty shuffle mask vector");
7185 int NumElts = VT.getVectorNumElements();
7186 for (int i = 0; i < NumElts; ++i) {
7187 int Pos = i / 2;
7188 Pos += (Lo ? 0 : NumElts / 2);
7189 Mask.push_back(Pos);
7190 }
7191}
7192
7193// Attempt to constant fold, else just create a VECTOR_SHUFFLE.
7194static SDValue getVectorShuffle(SelectionDAG &DAG, EVT VT, const SDLoc &dl,
7195 SDValue V1, SDValue V2, ArrayRef<int> Mask) {
7196 if ((ISD::isBuildVectorOfConstantSDNodes(V1.getNode()) || V1.isUndef()) &&
7197 (ISD::isBuildVectorOfConstantSDNodes(V2.getNode()) || V2.isUndef())) {
7198 SmallVector<SDValue> Ops(Mask.size(), DAG.getUNDEF(VT.getScalarType()));
7199 for (int I = 0, NumElts = Mask.size(); I != NumElts; ++I) {
7200 int M = Mask[I];
7201 if (M < 0)
7202 continue;
7203 SDValue V = (M < NumElts) ? V1 : V2;
7204 if (V.isUndef())
7205 continue;
7206 Ops[I] = V.getOperand(M % NumElts);
7207 }
7208 return DAG.getBuildVector(VT, dl, Ops);
7209 }
7210
7211 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
7212}
7213
7214/// Returns a vector_shuffle node for an unpackl operation.
7215static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
7216 SDValue V1, SDValue V2) {
7217 SmallVector<int, 8> Mask;
7218 createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
7219 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
7220}
7221
7222/// Returns a vector_shuffle node for an unpackh operation.
7223static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
7224 SDValue V1, SDValue V2) {
7225 SmallVector<int, 8> Mask;
7226 createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
7227 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
7228}
7229
7230/// Returns a node that packs the LHS + RHS nodes together at half width.
7231/// May return X86ISD::PACKSS/PACKUS, packing the top/bottom half.
7232/// TODO: Add subvector splitting if/when we have a need for it.
7233static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget,
7234 const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS,
7235 bool PackHiHalf = false) {
7236 MVT OpVT = LHS.getSimpleValueType();
7237 unsigned EltSizeInBits = VT.getScalarSizeInBits();
7238 bool UsePackUS = Subtarget.hasSSE41() || EltSizeInBits == 8;
7239 assert(OpVT == RHS.getSimpleValueType() &&
7240 VT.getSizeInBits() == OpVT.getSizeInBits() &&
7241 (EltSizeInBits * 2) == OpVT.getScalarSizeInBits() &&
7242 "Unexpected PACK operand types");
7243 assert((EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) &&
7244 "Unexpected PACK result type");
7245
7246 // Rely on vector shuffles for vXi64 -> vXi32 packing.
7247 if (EltSizeInBits == 32) {
7248 SmallVector<int> PackMask;
7249 int Offset = PackHiHalf ? 1 : 0;
7250 int NumElts = VT.getVectorNumElements();
7251 for (int I = 0; I != NumElts; I += 4) {
7252 PackMask.push_back(I + Offset);
7253 PackMask.push_back(I + Offset + 2);
7254 PackMask.push_back(I + Offset + NumElts);
7255 PackMask.push_back(I + Offset + NumElts + 2);
7256 }
7257 return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, LHS),
7258 DAG.getBitcast(VT, RHS), PackMask);
7259 }
7260
7261 // See if we already have sufficient leading bits for PACKSS/PACKUS.
7262 if (!PackHiHalf) {
7263 if (UsePackUS &&
7264 DAG.computeKnownBits(LHS).countMaxActiveBits() <= EltSizeInBits &&
7265 DAG.computeKnownBits(RHS).countMaxActiveBits() <= EltSizeInBits)
7266 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
7267
7268 if (DAG.ComputeMaxSignificantBits(LHS) <= EltSizeInBits &&
7269 DAG.ComputeMaxSignificantBits(RHS) <= EltSizeInBits)
7270 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
7271 }
7272
7273 // Fallback to sign/zero extending the requested half and pack.
7274 SDValue Amt = DAG.getTargetConstant(EltSizeInBits, dl, MVT::i8);
7275 if (UsePackUS) {
7276 if (PackHiHalf) {
7277 LHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, LHS, Amt);
7278 RHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, RHS, Amt);
7279 } else {
7280 SDValue Mask = DAG.getConstant((1ULL << EltSizeInBits) - 1, dl, OpVT);
7281 LHS = DAG.getNode(ISD::AND, dl, OpVT, LHS, Mask);
7282 RHS = DAG.getNode(ISD::AND, dl, OpVT, RHS, Mask);
7283 };
7284 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
7285 };
7286
7287 if (!PackHiHalf) {
7288 LHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, LHS, Amt);
7289 RHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, RHS, Amt);
7290 }
7291 LHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, LHS, Amt);
7292 RHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, RHS, Amt);
7293 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
7294}
7295
7296 /// Return a vector_shuffle of the specified vector and a zero or undef vector.
7297/// This produces a shuffle where the low element of V2 is swizzled into the
7298/// zero/undef vector, landing at element Idx.
7299/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
7300static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
7301 bool IsZero,
7302 const X86Subtarget &Subtarget,
7303 SelectionDAG &DAG) {
7304 MVT VT = V2.getSimpleValueType();
7305 SDValue V1 = IsZero
7306 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
7307 int NumElems = VT.getVectorNumElements();
7308 SmallVector<int, 16> MaskVec(NumElems);
7309 for (int i = 0; i != NumElems; ++i)
7310 // If this is the insertion idx, put the low elt of V2 here.
7311 MaskVec[i] = (i == Idx) ? NumElems : i;
7312 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
7313}
7314
7315static const Constant *getTargetConstantFromBasePtr(SDValue Ptr) {
7316 if (Ptr.getOpcode() == X86ISD::Wrapper ||
7317 Ptr.getOpcode() == X86ISD::WrapperRIP)
7318 Ptr = Ptr.getOperand(0);
7319
7320 auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
7321 if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
7322 return nullptr;
7323
7324 return CNode->getConstVal();
7325}
7326
7327static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
7328 if (!Load || !ISD::isNormalLoad(Load))
7329 return nullptr;
7330 return getTargetConstantFromBasePtr(Load->getBasePtr());
7331}
7332
7333static const Constant *getTargetConstantFromNode(SDValue Op) {
7334 Op = peekThroughBitcasts(Op);
7335 return getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op));
7336}
7337
7338const Constant *
7339X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const {
7340 assert(LD && "Unexpected null LoadSDNode");
7341 return getTargetConstantFromNode(LD);
7342}
7343
7344// Extract raw constant bits from constant pools.
7345static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
7346 APInt &UndefElts,
7347 SmallVectorImpl<APInt> &EltBits,
7348 bool AllowWholeUndefs = true,
7349 bool AllowPartialUndefs = true) {
7350 assert(EltBits.empty() && "Expected an empty EltBits vector");
7351
7352 Op = peekThroughBitcasts(Op);
7353
7354 EVT VT = Op.getValueType();
7355 unsigned SizeInBits = VT.getSizeInBits();
7356 assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
7357 unsigned NumElts = SizeInBits / EltSizeInBits;
7358
7359 // Bitcast a source array of element bits to the target size.
7360 auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
7361 unsigned NumSrcElts = UndefSrcElts.getBitWidth();
7362 unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
7363 assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
7364 "Constant bit sizes don't match");
7365
7366 // Don't split if we don't allow undef bits.
7367 bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
7368 if (UndefSrcElts.getBoolValue() && !AllowUndefs)
7369 return false;
7370
7371 // If we're already the right size, don't bother bitcasting.
7372 if (NumSrcElts == NumElts) {
7373 UndefElts = UndefSrcElts;
7374 EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
7375 return true;
7376 }
7377
7378 // Extract all the undef/constant element data and pack into single bitsets.
7379 APInt UndefBits(SizeInBits, 0);
7380 APInt MaskBits(SizeInBits, 0);
7381
7382 for (unsigned i = 0; i != NumSrcElts; ++i) {
7383 unsigned BitOffset = i * SrcEltSizeInBits;
7384 if (UndefSrcElts[i])
7385 UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
7386 MaskBits.insertBits(SrcEltBits[i], BitOffset);
7387 }
7388
7389 // Split the undef/constant single bitset data into the target elements.
7390 UndefElts = APInt(NumElts, 0);
7391 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
7392
7393 for (unsigned i = 0; i != NumElts; ++i) {
7394 unsigned BitOffset = i * EltSizeInBits;
7395 APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
7396
7397 // Only treat an element as UNDEF if all bits are UNDEF.
7398 if (UndefEltBits.isAllOnes()) {
7399 if (!AllowWholeUndefs)
7400 return false;
7401 UndefElts.setBit(i);
7402 continue;
7403 }
7404
7405 // If only some bits are UNDEF then treat them as zero (or bail if not
7406 // supported).
7407 if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
7408 return false;
7409
7410 EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
7411 }
7412 return true;
7413 };
7414
7415 // Collect constant bits and insert into mask/undef bit masks.
7416 auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
7417 unsigned UndefBitIndex) {
7418 if (!Cst)
7419 return false;
7420 if (isa<UndefValue>(Cst)) {
7421 Undefs.setBit(UndefBitIndex);
7422 return true;
7423 }
7424 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
7425 Mask = CInt->getValue();
7426 return true;
7427 }
7428 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
7429 Mask = CFP->getValueAPF().bitcastToAPInt();
7430 return true;
7431 }
7432 return false;
7433 };
7434
7435 // Handle UNDEFs.
7436 if (Op.isUndef()) {
7437 APInt UndefSrcElts = APInt::getAllOnes(NumElts);
7438 SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
7439 return CastBitData(UndefSrcElts, SrcEltBits);
7440 }
7441
7442 // Extract scalar constant bits.
7443 if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
7444 APInt UndefSrcElts = APInt::getZero(1);
7445 SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
7446 return CastBitData(UndefSrcElts, SrcEltBits);
7447 }
7448 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
7449 APInt UndefSrcElts = APInt::getZero(1);
7450 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
7451 SmallVector<APInt, 64> SrcEltBits(1, RawBits);
7452 return CastBitData(UndefSrcElts, SrcEltBits);
7453 }
7454
7455 // Extract constant bits from build vector.
7456 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op)) {
7457 BitVector Undefs;
7458 SmallVector<APInt> SrcEltBits;
7459 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
7460 if (BV->getConstantRawBits(true, SrcEltSizeInBits, SrcEltBits, Undefs)) {
7461 APInt UndefSrcElts = APInt::getZero(SrcEltBits.size());
7462 for (unsigned I = 0, E = SrcEltBits.size(); I != E; ++I)
7463 if (Undefs[I])
7464 UndefSrcElts.setBit(I);
7465 return CastBitData(UndefSrcElts, SrcEltBits);
7466 }
7467 }
7468
7469 // Extract constant bits from constant pool vector.
7470 if (auto *Cst = getTargetConstantFromNode(Op)) {
7471 Type *CstTy = Cst->getType();
7472 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
7473 if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
7474 return false;
7475
7476 unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
7477 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
7478
7479 APInt UndefSrcElts(NumSrcElts, 0);
7480 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
7481 for (unsigned i = 0; i != NumSrcElts; ++i)
7482 if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
7483 UndefSrcElts, i))
7484 return false;
7485
7486 return CastBitData(UndefSrcElts, SrcEltBits);
7487 }
7488
7489 // Extract constant bits from a broadcasted constant pool scalar.
7490 if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
7491 EltSizeInBits <= VT.getScalarSizeInBits()) {
7492 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
7493 if (MemIntr->getMemoryVT().getScalarSizeInBits() != VT.getScalarSizeInBits())
7494 return false;
7495
7496 SDValue Ptr = MemIntr->getBasePtr();
7497 if (const Constant *C = getTargetConstantFromBasePtr(Ptr)) {
7498 unsigned SrcEltSizeInBits = C->getType()->getScalarSizeInBits();
7499 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
7500
7501 APInt UndefSrcElts(NumSrcElts, 0);
7502 SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
7503 if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) {
7504 if (UndefSrcElts[0])
7505 UndefSrcElts.setBits(0, NumSrcElts);
7506 SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
7507 return CastBitData(UndefSrcElts, SrcEltBits);
7508 }
7509 }
7510 }
7511
7512 // Extract constant bits from a subvector broadcast.
7513 if (Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
7514 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
7515 SDValue Ptr = MemIntr->getBasePtr();
7516 // The source constant may be larger than the subvector broadcast, so
7517 // ensure we extract the correct subvector constants.
7518 if (const Constant *Cst = getTargetConstantFromBasePtr(Ptr)) {
7519 Type *CstTy = Cst->getType();
7520 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
7521 unsigned SubVecSizeInBits = MemIntr->getMemoryVT().getStoreSizeInBits();
7522 if (!CstTy->isVectorTy() || (CstSizeInBits % SubVecSizeInBits) != 0 ||
7523 (SizeInBits % SubVecSizeInBits) != 0)
7524 return false;
7525 unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
7526 unsigned NumSubElts = SubVecSizeInBits / CstEltSizeInBits;
7527 unsigned NumSubVecs = SizeInBits / SubVecSizeInBits;
7528 APInt UndefSubElts(NumSubElts, 0);
7529 SmallVector<APInt, 64> SubEltBits(NumSubElts * NumSubVecs,
7530 APInt(CstEltSizeInBits, 0));
7531 for (unsigned i = 0; i != NumSubElts; ++i) {
7532 if (!CollectConstantBits(Cst->getAggregateElement(i), SubEltBits[i],
7533 UndefSubElts, i))
7534 return false;
7535 for (unsigned j = 1; j != NumSubVecs; ++j)
7536 SubEltBits[i + (j * NumSubElts)] = SubEltBits[i];
7537 }
7538 UndefSubElts = APInt::getSplat(NumSubVecs * UndefSubElts.getBitWidth(),
7539 UndefSubElts);
7540 return CastBitData(UndefSubElts, SubEltBits);
7541 }
7542 }
7543
7544 // Extract a rematerialized scalar constant insertion.
7545 if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
7546 Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
7547 isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
7548 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
7549 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
7550
7551 APInt UndefSrcElts(NumSrcElts, 0);
7552 SmallVector<APInt, 64> SrcEltBits;
7553 auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
7554 SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits));
7555 SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
7556 return CastBitData(UndefSrcElts, SrcEltBits);
7557 }
7558
7559 // Insert constant bits from a base and sub vector sources.
7560 if (Op.getOpcode() == ISD::INSERT_SUBVECTOR) {
7561 // If we bitcast to larger elements we might lose track of undefs - to be
7562 // safe, don't allow any.
7563 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
7564 bool AllowUndefs = EltSizeInBits >= SrcEltSizeInBits;
7565
7566 APInt UndefSrcElts, UndefSubElts;
7567 SmallVector<APInt, 32> EltSrcBits, EltSubBits;
7568 if (getTargetConstantBitsFromNode(Op.getOperand(1), SrcEltSizeInBits,
7569 UndefSubElts, EltSubBits,
7570 AllowWholeUndefs && AllowUndefs,
7571 AllowPartialUndefs && AllowUndefs) &&
7572 getTargetConstantBitsFromNode(Op.getOperand(0), SrcEltSizeInBits,
7573 UndefSrcElts, EltSrcBits,
7574 AllowWholeUndefs && AllowUndefs,
7575 AllowPartialUndefs && AllowUndefs)) {
7576 unsigned BaseIdx = Op.getConstantOperandVal(2);
7577 UndefSrcElts.insertBits(UndefSubElts, BaseIdx);
7578 for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
7579 EltSrcBits[BaseIdx + i] = EltSubBits[i];
7580 return CastBitData(UndefSrcElts, EltSrcBits);
7581 }
7582 }
7583
7584 // Extract constant bits from a subvector's source.
7585 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
7586 // TODO - support extract_subvector through bitcasts.
7587 if (EltSizeInBits != VT.getScalarSizeInBits())
7588 return false;
7589
7590 if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
7591 UndefElts, EltBits, AllowWholeUndefs,
7592 AllowPartialUndefs)) {
7593 EVT SrcVT = Op.getOperand(0).getValueType();
7594 unsigned NumSrcElts = SrcVT.getVectorNumElements();
7595 unsigned NumSubElts = VT.getVectorNumElements();
7596 unsigned BaseIdx = Op.getConstantOperandVal(1);
7597 UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
7598 if ((BaseIdx + NumSubElts) != NumSrcElts)
7599 EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
7600 if (BaseIdx != 0)
7601 EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx);
7602 return true;
7603 }
7604 }
7605
7606 // Extract constant bits from shuffle node sources.
7607 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op)) {
7608 // TODO - support shuffle through bitcasts.
7609 if (EltSizeInBits != VT.getScalarSizeInBits())
7610 return false;
7611
7612 ArrayRef<int> Mask = SVN->getMask();
7613 if ((!AllowWholeUndefs || !AllowPartialUndefs) &&
7614 llvm::any_of(Mask, [](int M) { return M < 0; }))
7615 return false;
7616
7617 APInt UndefElts0, UndefElts1;
7618 SmallVector<APInt, 32> EltBits0, EltBits1;
7619 if (isAnyInRange(Mask, 0, NumElts) &&
7620 !getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
7621 UndefElts0, EltBits0, AllowWholeUndefs,
7622 AllowPartialUndefs))
7623 return false;
7624 if (isAnyInRange(Mask, NumElts, 2 * NumElts) &&
7625 !getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
7626 UndefElts1, EltBits1, AllowWholeUndefs,
7627 AllowPartialUndefs))
7628 return false;
7629
7630 UndefElts = APInt::getZero(NumElts);
7631 for (int i = 0; i != (int)NumElts; ++i) {
7632 int M = Mask[i];
7633 if (M < 0) {
7634 UndefElts.setBit(i);
7635 EltBits.push_back(APInt::getZero(EltSizeInBits));
7636 } else if (M < (int)NumElts) {
7637 if (UndefElts0[M])
7638 UndefElts.setBit(i);
7639 EltBits.push_back(EltBits0[M]);
7640 } else {
7641 if (UndefElts1[M - NumElts])
7642 UndefElts.setBit(i);
7643 EltBits.push_back(EltBits1[M - NumElts]);
7644 }
7645 }
7646 return true;
7647 }
7648
7649 return false;
7650}
7651
7652namespace llvm {
7653namespace X86 {
7654bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs) {
7655 APInt UndefElts;
7656 SmallVector<APInt, 16> EltBits;
7657 if (getTargetConstantBitsFromNode(Op, Op.getScalarValueSizeInBits(),
7658 UndefElts, EltBits, true,
7659 AllowPartialUndefs)) {
7660 int SplatIndex = -1;
7661 for (int i = 0, e = EltBits.size(); i != e; ++i) {
7662 if (UndefElts[i])
7663 continue;
7664 if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {
7665 SplatIndex = -1;
7666 break;
7667 }
7668 SplatIndex = i;
7669 }
7670 if (0 <= SplatIndex) {
7671 SplatVal = EltBits[SplatIndex];
7672 return true;
7673 }
7674 }
7675
7676 return false;
7677}
7678} // namespace X86
7679} // namespace llvm
7680
7681static bool getTargetShuffleMaskIndices(SDValue MaskNode,
7682 unsigned MaskEltSizeInBits,
7683 SmallVectorImpl<uint64_t> &RawMask,
7684 APInt &UndefElts) {
7685 // Extract the raw target constant bits.
7686 SmallVector<APInt, 64> EltBits;
7687 if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
7688 EltBits, /* AllowWholeUndefs */ true,
7689 /* AllowPartialUndefs */ false))
7690 return false;
7691
7692 // Insert the extracted elements into the mask.
7693 for (const APInt &Elt : EltBits)
7694 RawMask.push_back(Elt.getZExtValue());
7695
7696 return true;
7697}
7698
7699/// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
7700/// A multi-stage pack shuffle mask is created by specifying NumStages > 1.
7701/// Note: This ignores saturation, so inputs must be checked first.
7702static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
7703 bool Unary, unsigned NumStages = 1) {
7704 assert(Mask.empty() && "Expected an empty shuffle mask vector");
7705 unsigned NumElts = VT.getVectorNumElements();
7706 unsigned NumLanes = VT.getSizeInBits() / 128;
7707 unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
7708 unsigned Offset = Unary ? 0 : NumElts;
7709 unsigned Repetitions = 1u << (NumStages - 1);
7710 unsigned Increment = 1u << NumStages;
7711 assert((NumEltsPerLane >> NumStages) > 0 && "Illegal packing compaction");
7712
7713 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
7714 for (unsigned Stage = 0; Stage != Repetitions; ++Stage) {
7715 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
7716 Mask.push_back(Elt + (Lane * NumEltsPerLane));
7717 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
7718 Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
7719 }
7720 }
7721}
7722
7723// Split the demanded elts of a PACKSS/PACKUS node between its operands.
7724static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
7725 APInt &DemandedLHS, APInt &DemandedRHS) {
7726 int NumLanes = VT.getSizeInBits() / 128;
7727 int NumElts = DemandedElts.getBitWidth();
7728 int NumInnerElts = NumElts / 2;
7729 int NumEltsPerLane = NumElts / NumLanes;
7730 int NumInnerEltsPerLane = NumInnerElts / NumLanes;
7731
7732 DemandedLHS = APInt::getZero(NumInnerElts);
7733 DemandedRHS = APInt::getZero(NumInnerElts);
7734
7735 // Map DemandedElts to the packed operands.
7736 for (int Lane = 0; Lane != NumLanes; ++Lane) {
7737 for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
7738 int OuterIdx = (Lane * NumEltsPerLane) + Elt;
7739 int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
7740 if (DemandedElts[OuterIdx])
7741 DemandedLHS.setBit(InnerIdx);
7742 if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
7743 DemandedRHS.setBit(InnerIdx);
7744 }
7745 }
7746}
7747
7748// Split the demanded elts of a HADD/HSUB node between its operands.
7749static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts,
7750 APInt &DemandedLHS, APInt &DemandedRHS) {
7751 int NumLanes = VT.getSizeInBits() / 128;
7752 int NumElts = DemandedElts.getBitWidth();
7753 int NumEltsPerLane = NumElts / NumLanes;
7754 int HalfEltsPerLane = NumEltsPerLane / 2;
7755
7756 DemandedLHS = APInt::getZero(NumElts);
7757 DemandedRHS = APInt::getZero(NumElts);
7758
7759 // Map DemandedElts to the horizontal operands.
7760 for (int Idx = 0; Idx != NumElts; ++Idx) {
7761 if (!DemandedElts[Idx])
7762 continue;
7763 int LaneIdx = (Idx / NumEltsPerLane) * NumEltsPerLane;
7764 int LocalIdx = Idx % NumEltsPerLane;
7765 if (LocalIdx < HalfEltsPerLane) {
7766 DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 0);
7767 DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 1);
7768 } else {
7769 LocalIdx -= HalfEltsPerLane;
7770 DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 0);
7771 DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 1);
7772 }
7773 }
7774}
7775
7776/// Calculates the shuffle mask corresponding to the target-specific opcode.
7777/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
7778/// operands in \p Ops, and returns true.
7779/// Sets \p IsUnary to true if only one source is used. Note that this will set
7780/// IsUnary for shuffles which use a single input multiple times, and in those
7781/// cases it will adjust the mask to only have indices within that single input.
7782/// It is an error to call this with non-empty Mask/Ops vectors.
7783static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
7784 SmallVectorImpl<SDValue> &Ops,
7785 SmallVectorImpl<int> &Mask, bool &IsUnary) {
7786 unsigned NumElems = VT.getVectorNumElements();
7787 unsigned MaskEltSize = VT.getScalarSizeInBits();
7788 SmallVector<uint64_t, 32> RawMask;
7789 APInt RawUndefs;
7790 uint64_t ImmN;
7791
7792 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
7793 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
7794
7795 IsUnary = false;
7796 bool IsFakeUnary = false;
7797 switch (N->getOpcode()) {
7798 case X86ISD::BLENDI:
7799 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7800 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7801 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7802 DecodeBLENDMask(NumElems, ImmN, Mask);
7803 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7804 break;
7805 case X86ISD::SHUFP:
7806 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7807 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7808 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7809 DecodeSHUFPMask(NumElems, MaskEltSize, ImmN, Mask);
7810 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7811 break;
7812 case X86ISD::INSERTPS:
7813 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7814 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7815 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7816 DecodeINSERTPSMask(ImmN, Mask);
7817 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7818 break;
7819 case X86ISD::EXTRQI:
7820 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7821 if (isa<ConstantSDNode>(N->getOperand(1)) &&
7822 isa<ConstantSDNode>(N->getOperand(2))) {
7823 int BitLen = N->getConstantOperandVal(1);
7824 int BitIdx = N->getConstantOperandVal(2);
7825 DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
7826 IsUnary = true;
7827 }
7828 break;
7829 case X86ISD::INSERTQI:
7830 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7831 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7832 if (isa<ConstantSDNode>(N->getOperand(2)) &&
7833 isa<ConstantSDNode>(N->getOperand(3))) {
7834 int BitLen = N->getConstantOperandVal(2);
7835 int BitIdx = N->getConstantOperandVal(3);
7836 DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
7837 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7838 }
7839 break;
7840 case X86ISD::UNPCKH:
7841 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7842 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7843 DecodeUNPCKHMask(NumElems, MaskEltSize, Mask);
7844 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7845 break;
7846 case X86ISD::UNPCKL:
7847 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7848 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7849 DecodeUNPCKLMask(NumElems, MaskEltSize, Mask);
7850 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7851 break;
7852 case X86ISD::MOVHLPS:
7853 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7854 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7855 DecodeMOVHLPSMask(NumElems, Mask);
7856 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7857 break;
7858 case X86ISD::MOVLHPS:
7859 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7860 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7861 DecodeMOVLHPSMask(NumElems, Mask);
7862 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7863 break;
7864 case X86ISD::VALIGN:
7865 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
7866 "Only 32-bit and 64-bit elements are supported!");
7867 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7868 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7869 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7870 DecodeVALIGNMask(NumElems, ImmN, Mask);
7871 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7872 Ops.push_back(N->getOperand(1));
7873 Ops.push_back(N->getOperand(0));
7874 break;
7875 case X86ISD::PALIGNR:
7876 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7877 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7878 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7879 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7880 DecodePALIGNRMask(NumElems, ImmN, Mask);
7881 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7882 Ops.push_back(N->getOperand(1));
7883 Ops.push_back(N->getOperand(0));
7884 break;
7885 case X86ISD::VSHLDQ:
7886 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7887 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7888 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7889 DecodePSLLDQMask(NumElems, ImmN, Mask);
7890 IsUnary = true;
7891 break;
7892 case X86ISD::VSRLDQ:
7893 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7894 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7895 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7896 DecodePSRLDQMask(NumElems, ImmN, Mask);
7897 IsUnary = true;
7898 break;
7899 case X86ISD::PSHUFD:
7900 case X86ISD::VPERMILPI:
7901 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7902 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7903 DecodePSHUFMask(NumElems, MaskEltSize, ImmN, Mask);
7904 IsUnary = true;
7905 break;
7906 case X86ISD::PSHUFHW:
7907 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7908 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7909 DecodePSHUFHWMask(NumElems, ImmN, Mask);
7910 IsUnary = true;
7911 break;
7912 case X86ISD::PSHUFLW:
7913 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7914 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7915 DecodePSHUFLWMask(NumElems, ImmN, Mask);
7916 IsUnary = true;
7917 break;
7918 case X86ISD::VZEXT_MOVL:
7919 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7920 DecodeZeroMoveLowMask(NumElems, Mask);
7921 IsUnary = true;
7922 break;
7923 case X86ISD::VBROADCAST:
7924 // We only decode broadcasts of same-sized vectors; peeking through to
7925 // extracted subvectors is likely to cause hasOneUse issues with
7926 // SimplifyDemandedBits etc.
7927 if (N->getOperand(0).getValueType() == VT) {
7928 DecodeVectorBroadcast(NumElems, Mask);
7929 IsUnary = true;
7930 break;
7931 }
7932 return false;
7933 case X86ISD::VPERMILPV: {
7934 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7935 IsUnary = true;
7936 SDValue MaskNode = N->getOperand(1);
7937 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
7938 RawUndefs)) {
7939 DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask);
7940 break;
7941 }
7942 return false;
7943 }
7944 case X86ISD::PSHUFB: {
7945 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7946 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7947 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7948 IsUnary = true;
7949 SDValue MaskNode = N->getOperand(1);
7950 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
7951 DecodePSHUFBMask(RawMask, RawUndefs, Mask);
7952 break;
7953 }
7954 return false;
7955 }
7956 case X86ISD::VPERMI:
7957 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7958 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7959 DecodeVPERMMask(NumElems, ImmN, Mask);
7960 IsUnary = true;
7961 break;
7962 case X86ISD::MOVSS:
7963 case X86ISD::MOVSD:
7964 case X86ISD::MOVSH:
7965 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7966 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7967 DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
7968 break;
7969 case X86ISD::VPERM2X128:
7970 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7971 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7972 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7973 DecodeVPERM2X128Mask(NumElems, ImmN, Mask);
7974 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7975 break;
7976 case X86ISD::SHUF128:
7977 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7978 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7979 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7980 decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, ImmN, Mask);
7981 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7982 break;
7983 case X86ISD::MOVSLDUP:
7984 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7985 DecodeMOVSLDUPMask(NumElems, Mask);
7986 IsUnary = true;
7987 break;
7988 case X86ISD::MOVSHDUP:
7989 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7990 DecodeMOVSHDUPMask(NumElems, Mask);
7991 IsUnary = true;
7992 break;
7993 case X86ISD::MOVDDUP:
7994 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7995 DecodeMOVDDUPMask(NumElems, Mask);
7996 IsUnary = true;
7997 break;
7998 case X86ISD::VPERMIL2: {
7999 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
8000 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
8001 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
8002 SDValue MaskNode = N->getOperand(2);
8003 SDValue CtrlNode = N->getOperand(3);
8004 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
8005 unsigned CtrlImm = CtrlOp->getZExtValue();
8006 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
8007 RawUndefs)) {
8008 DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs,
8009 Mask);
8010 break;
8011 }
8012 }
8013 return false;
8014 }
8015 case X86ISD::VPPERM: {
8016 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
8017 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
8018 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
8019 SDValue MaskNode = N->getOperand(2);
8020 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
8021 DecodeVPPERMMask(RawMask, RawUndefs, Mask);
8022 break;
8023 }
8024 return false;
8025 }
8026 case X86ISD::VPERMV: {
8027 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
8028 IsUnary = true;
8029 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
8030 Ops.push_back(N->getOperand(1));
8031 SDValue MaskNode = N->getOperand(0);
8032 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
8033 RawUndefs)) {
8034 DecodeVPERMVMask(RawMask, RawUndefs, Mask);
8035 break;
8036 }
8037 return false;
8038 }
8039 case X86ISD::VPERMV3: {
8040 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
8041 assert(N->getOperand(2).getValueType() == VT && "Unexpected value type");
8042 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
8043 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
8044 Ops.push_back(N->getOperand(0));
8045 Ops.push_back(N->getOperand(2));
8046 SDValue MaskNode = N->getOperand(1);
8047 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
8048 RawUndefs)) {
8049 DecodeVPERMV3Mask(RawMask, RawUndefs, Mask);
8050 break;
8051 }
8052 return false;
8053 }
8054 default: llvm_unreachable("unknown target shuffle node");
8055 }
8056
8057 // Empty mask indicates the decode failed.
8058 if (Mask.empty())
8059 return false;
8060
8061 // Check if we're getting a shuffle mask with zero'd elements.
8062 if (!AllowSentinelZero && isAnyZero(Mask))
8063 return false;
8064
8065 // If we have a fake unary shuffle, the shuffle mask is spread across two
8066 // inputs that are actually the same node. Re-map the mask to always point
8067 // into the first input.
8068 if (IsFakeUnary)
8069 for (int &M : Mask)
8070 if (M >= (int)Mask.size())
8071 M -= Mask.size();
8072
8073 // If we didn't already add operands in the opcode-specific code, default to
8074 // adding 1 or 2 operands starting at 0.
8075 if (Ops.empty()) {
8076 Ops.push_back(N->getOperand(0));
8077 if (!IsUnary || IsFakeUnary)
8078 Ops.push_back(N->getOperand(1));
8079 }
8080
8081 return true;
8082}
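
The IsFakeUnary remap above folds every mask index that points into the second input back onto the first input once both operands are known to be the same node. A minimal standalone sketch of that remap, using std::vector<int> instead of SmallVectorImpl<int> and an illustrative helper name:

#include <cassert>
#include <vector>

// Remap a two-input shuffle mask onto a single input: any index that points
// into the second input (>= Size) is shifted down by Size.
static void remapFakeUnaryMask(std::vector<int> &Mask) {
  int Size = (int)Mask.size();
  for (int &M : Mask)
    if (M >= Size)
      M -= Size;
}

int main() {
  // A v4 shuffle <0,5,2,7> with identical operands becomes <0,1,2,3>.
  std::vector<int> Mask = {0, 5, 2, 7};
  remapFakeUnaryMask(Mask);
  assert(Mask == std::vector<int>({0, 1, 2, 3}));
}
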
8083
8084// Wrapper for getTargetShuffleMask with IsUnary.
8085static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
8086 SmallVectorImpl<SDValue> &Ops,
8087 SmallVectorImpl<int> &Mask) {
8088 bool IsUnary;
8089 return getTargetShuffleMask(N, VT, AllowSentinelZero, Ops, Mask, IsUnary);
8090}
8091
8092/// Compute whether each element of a shuffle is zeroable.
8093///
8094/// A "zeroable" vector shuffle element is one which can be lowered to zero.
8095/// Either it is an undef element in the shuffle mask, the element of the input
8096/// referenced is undef, or the element of the input referenced is known to be
8097/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
8098/// as many lanes with this technique as possible to simplify the remaining
8099/// shuffle.
8100static void computeZeroableShuffleElements(ArrayRef<int> Mask,
8101 SDValue V1, SDValue V2,
8102 APInt &KnownUndef, APInt &KnownZero) {
8103 int Size = Mask.size();
8104 KnownUndef = KnownZero = APInt::getZero(Size);
8105
8106 V1 = peekThroughBitcasts(V1);
8107 V2 = peekThroughBitcasts(V2);
8108
8109 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
8110 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
8111
8112 int VectorSizeInBits = V1.getValueSizeInBits();
8113 int ScalarSizeInBits = VectorSizeInBits / Size;
8114 assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
8115
8116 for (int i = 0; i < Size; ++i) {
8117 int M = Mask[i];
8118 // Handle the easy cases.
8119 if (M < 0) {
8120 KnownUndef.setBit(i);
8121 continue;
8122 }
8123 if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
8124 KnownZero.setBit(i);
8125 continue;
8126 }
8127
8128 // Determine shuffle input and normalize the mask.
8129 SDValue V = M < Size ? V1 : V2;
8130 M %= Size;
8131
8132 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
8133 if (V.getOpcode() != ISD::BUILD_VECTOR)
8134 continue;
8135
8136 // If the BUILD_VECTOR has fewer elements than the mask, then the bitcasted
8137 // portion of the (larger) source element must be UNDEF/ZERO.
8138 if ((Size % V.getNumOperands()) == 0) {
8139 int Scale = Size / V->getNumOperands();
8140 SDValue Op = V.getOperand(M / Scale);
8141 if (Op.isUndef())
8142 KnownUndef.setBit(i);
8143 if (X86::isZeroNode(Op))
8144 KnownZero.setBit(i);
8145 else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
8146 APInt Val = Cst->getAPIntValue();
8147 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
8148 if (Val == 0)
8149 KnownZero.setBit(i);
8150 } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
8151 APInt Val = Cst->getValueAPF().bitcastToAPInt();
8152 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
8153 if (Val == 0)
8154 KnownZero.setBit(i);
8155 }
8156 continue;
8157 }
8158
8159 // If the BUILD_VECTOR has more elements than the mask, then all the (smaller)
8160 // source elements must be UNDEF or ZERO.
8161 if ((V.getNumOperands() % Size) == 0) {
8162 int Scale = V->getNumOperands() / Size;
8163 bool AllUndef = true;
8164 bool AllZero = true;
8165 for (int j = 0; j < Scale; ++j) {
8166 SDValue Op = V.getOperand((M * Scale) + j);
8167 AllUndef &= Op.isUndef();
8168 AllZero &= X86::isZeroNode(Op);
8169 }
8170 if (AllUndef)
8171 KnownUndef.setBit(i);
8172 if (AllZero)
8173 KnownZero.setBit(i);
8174 continue;
8175 }
8176 }
8177}
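
A simplified standalone model of the classification loop above, assuming the two inputs have already been flattened into per-element undef/zero flags of the same width as the mask (the real routine additionally peeks through bitcasts and rescales against BUILD_VECTORs of a different element count); the helper name and types are illustrative only:

#include <cstdint>
#include <vector>

struct Zeroable {
  uint64_t KnownUndef = 0, KnownZero = 0;
};

// Mask indices 0..Size-1 select input 0, Size..2*Size-1 select input 1;
// ElemIsUndef/ElemIsZero describe the two inputs concatenated element-wise.
static Zeroable classifyShuffleElements(const std::vector<int> &Mask,
                                        const std::vector<bool> &ElemIsUndef,
                                        const std::vector<bool> &ElemIsZero) {
  Zeroable Z;
  for (size_t i = 0; i != Mask.size(); ++i) {
    int M = Mask[i];
    if (M < 0 || ElemIsUndef[M])      // sentinel or undef source element
      Z.KnownUndef |= 1ull << i;
    else if (ElemIsZero[M])           // source element known to be zero
      Z.KnownZero |= 1ull << i;
  }
  return Z;
}

int main() {
  // Mask <0,5,2,7>, second input all zero: lanes 1 and 3 become known zero.
  Zeroable Z = classifyShuffleElements({0, 5, 2, 7},
                                       std::vector<bool>(8, false),
                                       {false, false, false, false,
                                        true, true, true, true});
  return Z.KnownZero == 0b1010 ? 0 : 1;
}
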
8178
8179/// Decode a target shuffle mask and inputs and see if any values are
8180/// known to be undef or zero from their inputs.
8181/// Returns true if the target shuffle mask was decoded.
8182/// FIXME: Merge this with computeZeroableShuffleElements?
8183static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
8184 SmallVectorImpl<SDValue> &Ops,
8185 APInt &KnownUndef, APInt &KnownZero) {
8186 bool IsUnary;
8187 if (!isTargetShuffle(N.getOpcode()))
8188 return false;
8189
8190 MVT VT = N.getSimpleValueType();
8191 if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
8192 return false;
8193
8194 int Size = Mask.size();
8195 SDValue V1 = Ops[0];
8196 SDValue V2 = IsUnary ? V1 : Ops[1];
8197 KnownUndef = KnownZero = APInt::getZero(Size);
8198
8199 V1 = peekThroughBitcasts(V1);
8200 V2 = peekThroughBitcasts(V2);
8201
8202 assert((VT.getSizeInBits() % Size) == 0 &&
8203        "Illegal split of shuffle value type");
8204 unsigned EltSizeInBits = VT.getSizeInBits() / Size;
8205
8206 // Extract known constant input data.
8207 APInt UndefSrcElts[2];
8208 SmallVector<APInt, 32> SrcEltBits[2];
8209 bool IsSrcConstant[2] = {
8210 getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
8211 SrcEltBits[0], true, false),
8212 getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
8213 SrcEltBits[1], true, false)};
8214
8215 for (int i = 0; i < Size; ++i) {
8216 int M = Mask[i];
8217
8218 // Already decoded as SM_SentinelZero / SM_SentinelUndef.
8219 if (M < 0) {
8220 assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!");
8221 if (SM_SentinelUndef == M)
8222 KnownUndef.setBit(i);
8223 if (SM_SentinelZero == M)
8224 KnownZero.setBit(i);
8225 continue;
8226 }
8227
8228 // Determine shuffle input and normalize the mask.
8229 unsigned SrcIdx = M / Size;
8230 SDValue V = M < Size ? V1 : V2;
8231 M %= Size;
8232
8233 // We are referencing an UNDEF input.
8234 if (V.isUndef()) {
8235 KnownUndef.setBit(i);
8236 continue;
8237 }
8238
8239 // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
8240 // TODO: We currently only set UNDEF for integer types - floats use the same
8241 // registers as vectors and many of the scalar folded loads rely on the
8242 // SCALAR_TO_VECTOR pattern.
8243 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
8244 (Size % V.getValueType().getVectorNumElements()) == 0) {
8245 int Scale = Size / V.getValueType().getVectorNumElements();
8246 int Idx = M / Scale;
8247 if (Idx != 0 && !VT.isFloatingPoint())
8248 KnownUndef.setBit(i);
8249 else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
8250 KnownZero.setBit(i);
8251 continue;
8252 }
8253
8254 // INSERT_SUBVECTOR - to widen vectors we often insert them into UNDEF
8255 // base vectors.
8256 if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
8257 SDValue Vec = V.getOperand(0);
8258 int NumVecElts = Vec.getValueType().getVectorNumElements();
8259 if (Vec.isUndef() && Size == NumVecElts) {
8260 int Idx = V.getConstantOperandVal(2);
8261 int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements();
8262 if (M < Idx || (Idx + NumSubElts) <= M)
8263 KnownUndef.setBit(i);
8264 }
8265 continue;
8266 }
8267
8268 // Attempt to extract from the source's constant bits.
8269 if (IsSrcConstant[SrcIdx]) {
8270 if (UndefSrcElts[SrcIdx][M])
8271 KnownUndef.setBit(i);
8272 else if (SrcEltBits[SrcIdx][M] == 0)
8273 KnownZero.setBit(i);
8274 }
8275 }
8276
8277 assert(VT.getVectorNumElements() == (unsigned)Size &&
8278        "Different mask size from vector size!");
8279 return true;
8280}
8281
8282// Replace target shuffle mask elements with known undef/zero sentinels.
8283static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
8284 const APInt &KnownUndef,
8285 const APInt &KnownZero,
8286 bool ResolveKnownZeros = true) {
8287 unsigned NumElts = Mask.size();
8288 assert(KnownUndef.getBitWidth() == NumElts &&
8289        KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch");
8290
8291 for (unsigned i = 0; i != NumElts; ++i) {
8292 if (KnownUndef[i])
8293 Mask[i] = SM_SentinelUndef;
8294 else if (ResolveKnownZeros && KnownZero[i])
8295 Mask[i] = SM_SentinelZero;
8296 }
8297}
8298
8299// Extract target shuffle mask sentinel elements to known undef/zero bitmasks.
8300static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask,
8301 APInt &KnownUndef,
8302 APInt &KnownZero) {
8303 unsigned NumElts = Mask.size();
8304 KnownUndef = KnownZero = APInt::getZero(NumElts);
8305
8306 for (unsigned i = 0; i != NumElts; ++i) {
8307 int M = Mask[i];
8308 if (SM_SentinelUndef == M)
8309 KnownUndef.setBit(i);
8310 if (SM_SentinelZero == M)
8311 KnownZero.setBit(i);
8312 }
8313}
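
A standalone sketch of the sentinel-to-bitmask direction above, assuming SM_SentinelUndef == -1 and SM_SentinelZero == -2 as in the X86 lowering headers; resolveTargetShuffleFromZeroables is simply the inverse mapping. The helper name is illustrative only:

#include <cassert>
#include <cstdint>
#include <vector>

constexpr int SM_SentinelUndef = -1; // assumed sentinel values
constexpr int SM_SentinelZero = -2;

// Mask sentinels -> bitmasks (the shape of resolveZeroablesFromTargetShuffle).
static void maskToBits(const std::vector<int> &Mask, uint64_t &Undef,
                       uint64_t &Zero) {
  Undef = Zero = 0;
  for (size_t i = 0; i != Mask.size(); ++i) {
    if (Mask[i] == SM_SentinelUndef) Undef |= 1ull << i;
    if (Mask[i] == SM_SentinelZero)  Zero  |= 1ull << i;
  }
}

int main() {
  uint64_t Undef, Zero;
  maskToBits({0, SM_SentinelUndef, SM_SentinelZero, 3}, Undef, Zero);
  assert(Undef == 0b0010 && Zero == 0b0100);
}
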
8314
8315// Attempt to create a shuffle mask from a VSELECT/BLENDV condition mask.
8316static bool createShuffleMaskFromVSELECT(SmallVectorImpl<int> &Mask,
8317 SDValue Cond, bool IsBLENDV = false) {
8318 EVT CondVT = Cond.getValueType();
8319 unsigned EltSizeInBits = CondVT.getScalarSizeInBits();
8320 unsigned NumElts = CondVT.getVectorNumElements();
8321
8322 APInt UndefElts;
8323 SmallVector<APInt, 32> EltBits;
8324 if (!getTargetConstantBitsFromNode(Cond, EltSizeInBits, UndefElts, EltBits,
8325 true, false))
8326 return false;
8327
8328 Mask.resize(NumElts, SM_SentinelUndef);
8329
8330 for (int i = 0; i != (int)NumElts; ++i) {
8331 Mask[i] = i;
8332 // Arbitrarily choose from the 2nd operand if the select condition element
8333 // is undef.
8334 // TODO: Can we do better by matching patterns such as even/odd?
8335 if (UndefElts[i] || (!IsBLENDV && EltBits[i].isZero()) ||
8336 (IsBLENDV && EltBits[i].isNonNegative()))
8337 Mask[i] += NumElts;
8338 }
8339
8340 return true;
8341}
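
As a concrete illustration of the mapping above: a constant condition element selects operand 1 (mask index i) when it is all-ones and operand 2 (mask index i + NumElts) when it is zero. A small standalone sketch with an illustrative helper name:

#include <cassert>
#include <vector>

// Build a blend mask for select(Cond, LHS, RHS): lane i takes LHS (index i)
// when the condition lane is set, otherwise RHS (index i + NumElts).
static std::vector<int> blendMaskFromCondition(const std::vector<bool> &CondSet) {
  int NumElts = (int)CondSet.size();
  std::vector<int> Mask(NumElts);
  for (int i = 0; i != NumElts; ++i)
    Mask[i] = CondSet[i] ? i : i + NumElts;
  return Mask;
}

int main() {
  // A v4i32 select with condition <-1, 0, -1, 0> becomes shuffle mask <0,5,2,7>.
  assert(blendMaskFromCondition({true, false, true, false}) ==
         std::vector<int>({0, 5, 2, 7}));
}
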
8342
8343// Forward declaration (for getFauxShuffleMask recursive check).
8344static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
8345 SmallVectorImpl<SDValue> &Inputs,
8346 SmallVectorImpl<int> &Mask,
8347 const SelectionDAG &DAG, unsigned Depth,
8348 bool ResolveKnownElts);
8349
8350// Attempt to decode ops that could be represented as a shuffle mask.
8351// The decoded shuffle mask may contain a different number of elements to the
8352// destination value type.
8353// TODO: Merge into getTargetShuffleInputs()
8354static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
8355 SmallVectorImpl<int> &Mask,
8356 SmallVectorImpl<SDValue> &Ops,
8357 const SelectionDAG &DAG, unsigned Depth,
8358 bool ResolveKnownElts) {
8359 Mask.clear();
8360 Ops.clear();
8361
8362 MVT VT = N.getSimpleValueType();
8363 unsigned NumElts = VT.getVectorNumElements();
8364 unsigned NumSizeInBits = VT.getSizeInBits();
8365 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
8366 if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
8367 return false;
8368 assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size");
8369 unsigned NumSizeInBytes = NumSizeInBits / 8;
8370 unsigned NumBytesPerElt = NumBitsPerElt / 8;
8371
8372 unsigned Opcode = N.getOpcode();
8373 switch (Opcode) {
8374 case ISD::VECTOR_SHUFFLE: {
8375 // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
8376 ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
8377 if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
8378 Mask.append(ShuffleMask.begin(), ShuffleMask.end());
8379 Ops.push_back(N.getOperand(0));
8380 Ops.push_back(N.getOperand(1));
8381 return true;
8382 }
8383 return false;
8384 }
8385 case ISD::AND:
8386 case X86ISD::ANDNP: {
8387 // Attempt to decode as a per-byte mask.
8388 APInt UndefElts;
8389 SmallVector<APInt, 32> EltBits;
8390 SDValue N0 = N.getOperand(0);
8391 SDValue N1 = N.getOperand(1);
8392 bool IsAndN = (X86ISD::ANDNP == Opcode);
8393 uint64_t ZeroMask = IsAndN ? 255 : 0;
8394 if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
8395 return false;
8396 // We can't assume an undef src element gives an undef dst - the other src
8397 // might be zero.
8398 if (!UndefElts.isZero())
8399 return false;
8400 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
8401 const APInt &ByteBits = EltBits[i];
8402 if (ByteBits != 0 && ByteBits != 255)
8403 return false;
8404 Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
8405 }
8406 Ops.push_back(IsAndN ? N1 : N0);
8407 return true;
8408 }
8409 case ISD::OR: {
8410 // Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
8411 // is a valid shuffle index.
8412 SDValue N0 = peekThroughBitcasts(N.getOperand(0));
8413 SDValue N1 = peekThroughBitcasts(N.getOperand(1));
8414 if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
8415 return false;
8416
8417 SmallVector<int, 64> SrcMask0, SrcMask1;
8418 SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
8419 APInt Demand0 = APInt::getAllOnes(N0.getValueType().getVectorNumElements());
8420 APInt Demand1 = APInt::getAllOnes(N1.getValueType().getVectorNumElements());
8421 if (!getTargetShuffleInputs(N0, Demand0, SrcInputs0, SrcMask0, DAG,
8422 Depth + 1, true) ||
8423 !getTargetShuffleInputs(N1, Demand1, SrcInputs1, SrcMask1, DAG,
8424 Depth + 1, true))
8425 return false;
8426
8427 size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
8428 SmallVector<int, 64> Mask0, Mask1;
8429 narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
8430 narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
8431 for (int i = 0; i != (int)MaskSize; ++i) {
8432 // NOTE: Don't handle SM_SentinelUndef, as we can end up in infinite
8433 // loops converting between OR and BLEND shuffles due to
8434 // canWidenShuffleElements merging away undef elements, meaning we
8435 // fail to recognise the OR as the undef element isn't known zero.
8436 if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
8437 Mask.push_back(SM_SentinelZero);
8438 else if (Mask1[i] == SM_SentinelZero)
8439 Mask.push_back(i);
8440 else if (Mask0[i] == SM_SentinelZero)
8441 Mask.push_back(i + MaskSize);
8442 else
8443 return false;
8444 }
8445 Ops.push_back(N0);
8446 Ops.push_back(N1);
8447 return true;
8448 }
8449 case ISD::INSERT_SUBVECTOR: {
8450 SDValue Src = N.getOperand(0);
8451 SDValue Sub = N.getOperand(1);
8452 EVT SubVT = Sub.getValueType();
8453 unsigned NumSubElts = SubVT.getVectorNumElements();
8454 if (!N->isOnlyUserOf(Sub.getNode()))
8455 return false;
8456 uint64_t InsertIdx = N.getConstantOperandVal(2);
8457 // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
8458 if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
8459 Sub.getOperand(0).getValueType() == VT) {
8460 uint64_t ExtractIdx = Sub.getConstantOperandVal(1);
8461 for (int i = 0; i != (int)NumElts; ++i)
8462 Mask.push_back(i);
8463 for (int i = 0; i != (int)NumSubElts; ++i)
8464 Mask[InsertIdx + i] = NumElts + ExtractIdx + i;
8465 Ops.push_back(Src);
8466 Ops.push_back(Sub.getOperand(0));
8467 return true;
8468 }
8469 // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
8470 SmallVector<int, 64> SubMask;
8471 SmallVector<SDValue, 2> SubInputs;
8472 SDValue SubSrc = peekThroughOneUseBitcasts(Sub);
8473 EVT SubSrcVT = SubSrc.getValueType();
8474 if (!SubSrcVT.isVector())
8475 return false;
8476
8477 APInt SubDemand = APInt::getAllOnes(SubSrcVT.getVectorNumElements());
8478 if (!getTargetShuffleInputs(SubSrc, SubDemand, SubInputs, SubMask, DAG,
8479 Depth + 1, ResolveKnownElts))
8480 return false;
8481
8482 // Subvector shuffle inputs must not be larger than the subvector.
8483 if (llvm::any_of(SubInputs, [SubVT](SDValue SubInput) {
8484 return SubVT.getFixedSizeInBits() <
8485 SubInput.getValueSizeInBits().getFixedValue();
8486 }))
8487 return false;
8488
8489 if (SubMask.size() != NumSubElts) {
8490 assert(((SubMask.size() % NumSubElts) == 0 ||
8491         (NumSubElts % SubMask.size()) == 0) && "Illegal submask scale");
8492 if ((NumSubElts % SubMask.size()) == 0) {
8493 int Scale = NumSubElts / SubMask.size();
8494 SmallVector<int,64> ScaledSubMask;
8495 narrowShuffleMaskElts(Scale, SubMask, ScaledSubMask);
8496 SubMask = ScaledSubMask;
8497 } else {
8498 int Scale = SubMask.size() / NumSubElts;
8499 NumSubElts = SubMask.size();
8500 NumElts *= Scale;
8501 InsertIdx *= Scale;
8502 }
8503 }
8504 Ops.push_back(Src);
8505 Ops.append(SubInputs.begin(), SubInputs.end());
8506 if (ISD::isBuildVectorAllZeros(Src.getNode()))
8507 Mask.append(NumElts, SM_SentinelZero);
8508 else
8509 for (int i = 0; i != (int)NumElts; ++i)
8510 Mask.push_back(i);
8511 for (int i = 0; i != (int)NumSubElts; ++i) {
8512 int M = SubMask[i];
8513 if (0 <= M) {
8514 int InputIdx = M / NumSubElts;
8515 M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
8516 }
8517 Mask[i + InsertIdx] = M;
8518 }
8519 return true;
8520 }
8521 case X86ISD::PINSRB:
8522 case X86ISD::PINSRW:
8523 case ISD::SCALAR_TO_VECTOR:
8524 case ISD::INSERT_VECTOR_ELT: {
8525 // Match against an insert_vector_elt/scalar_to_vector of an extract from a
8526 // vector, for matching src/dst vector types.
8527 SDValue Scl = N.getOperand(Opcode == ISD::SCALAR_TO_VECTOR ? 0 : 1);
8528
8529 unsigned DstIdx = 0;
8530 if (Opcode != ISD::SCALAR_TO_VECTOR) {
8531 // Check we have an in-range constant insertion index.
8532 if (!isa<ConstantSDNode>(N.getOperand(2)) ||
8533 N.getConstantOperandAPInt(2).uge(NumElts))
8534 return false;
8535 DstIdx = N.getConstantOperandVal(2);
8536
8537 // Attempt to recognise an INSERT*(VEC, 0, DstIdx) shuffle pattern.
8538 if (X86::isZeroNode(Scl)) {
8539 Ops.push_back(N.getOperand(0));
8540 for (unsigned i = 0; i != NumElts; ++i)
8541 Mask.push_back(i == DstIdx ? SM_SentinelZero : (int)i);
8542 return true;
8543 }
8544 }
8545
8546 // Peek through trunc/aext/zext.
8547 // TODO: aext shouldn't require SM_SentinelZero padding.
8548 // TODO: handle shift of scalars.
8549 unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();
8550 while (Scl.getOpcode() == ISD::TRUNCATE ||
8551 Scl.getOpcode() == ISD::ANY_EXTEND ||
8552 Scl.getOpcode() == ISD::ZERO_EXTEND) {
8553 Scl = Scl.getOperand(0);
8554 MinBitsPerElt =
8555 std::min<unsigned>(MinBitsPerElt, Scl.getScalarValueSizeInBits());
8556 }
8557 if ((MinBitsPerElt % 8) != 0)
8558 return false;
8559
8560 // Attempt to find the source vector the scalar was extracted from.
8561 SDValue SrcExtract;
8562 if ((Scl.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
8563 Scl.getOpcode() == X86ISD::PEXTRW ||
8564 Scl.getOpcode() == X86ISD::PEXTRB) &&
8565 Scl.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
8566 SrcExtract = Scl;
8567 }
8568 if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
8569 return false;
8570
8571 SDValue SrcVec = SrcExtract.getOperand(0);
8572 EVT SrcVT = SrcVec.getValueType();
8573 if (!SrcVT.getScalarType().isByteSized())
8574 return false;
8575 unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
8576 unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8);
8577 unsigned DstByte = DstIdx * NumBytesPerElt;
8578 MinBitsPerElt =
8579 std::min<unsigned>(MinBitsPerElt, SrcVT.getScalarSizeInBits());
8580
8581 // Create 'identity' byte level shuffle mask and then add inserted bytes.
8582 if (Opcode == ISD::SCALAR_TO_VECTOR) {
8583 Ops.push_back(SrcVec);
8584 Mask.append(NumSizeInBytes, SM_SentinelUndef);
8585 } else {
8586 Ops.push_back(SrcVec);
8587 Ops.push_back(N.getOperand(0));
8588 for (int i = 0; i != (int)NumSizeInBytes; ++i)
8589 Mask.push_back(NumSizeInBytes + i);
8590 }
8591
8592 unsigned MinBytesPerElts = MinBitsPerElt / 8;
8593 MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
8594 for (unsigned i = 0; i != MinBytesPerElts; ++i)
8595 Mask[DstByte + i] = SrcByte + i;
8596 for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
8597 Mask[DstByte + i] = SM_SentinelZero;
8598 return true;
8599 }
8600 case X86ISD::PACKSS:
8601 case X86ISD::PACKUS: {
8602 SDValue N0 = N.getOperand(0);
8603 SDValue N1 = N.getOperand(1);
8604 assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
8605        N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
8606        "Unexpected input value type");
8607
8608 APInt EltsLHS, EltsRHS;
8609 getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);
8610
8611 // If we know input saturation won't happen (or we don't care about particular
8612 // lanes), we can treat this as a truncation shuffle.
8613 bool Offset0 = false, Offset1 = false;
8614 if (Opcode == X86ISD::PACKSS) {
8615 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
8616 DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
8617 (!(N1.isUndef() || EltsRHS.isZero()) &&
8618 DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
8619 return false;
8620 // We can't easily fold ASHR into a shuffle, but if it was feeding a
8621 // PACKSS then it was likely being used for sign-extension for a
8622 // truncation, so just peek through and adjust the mask accordingly.
8623 if (N0.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N0.getNode()) &&
8624 N0.getConstantOperandAPInt(1) == NumBitsPerElt) {
8625 Offset0 = true;
8626 N0 = N0.getOperand(0);
8627 }
8628 if (N1.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N1.getNode()) &&
8629 N1.getConstantOperandAPInt(1) == NumBitsPerElt) {
8630 Offset1 = true;
8631 N1 = N1.getOperand(0);
8632 }
8633 } else {
8634 APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
8635 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
8636 !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) ||
8637 (!(N1.isUndef() || EltsRHS.isZero()) &&
8638 !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))
8639 return false;
8640 }
8641
8642 bool IsUnary = (N0 == N1);
8643
8644 Ops.push_back(N0);
8645 if (!IsUnary)
8646 Ops.push_back(N1);
8647
8648 createPackShuffleMask(VT, Mask, IsUnary);
8649
8650 if (Offset0 || Offset1) {
8651 for (int &M : Mask)
8652 if ((Offset0 && isInRange(M, 0, NumElts)) ||
8653 (Offset1 && isInRange(M, NumElts, 2 * NumElts)))
8654 ++M;
8655 }
8656 return true;
8657 }
8658 case ISD::VSELECT:
8659 case X86ISD::BLENDV: {
8660 SDValue Cond = N.getOperand(0);
8661 if (createShuffleMaskFromVSELECT(Mask, Cond, Opcode == X86ISD::BLENDV)) {
8662 Ops.push_back(N.getOperand(1));
8663 Ops.push_back(N.getOperand(2));
8664 return true;
8665 }
8666 return false;
8667 }
8668 case X86ISD::VTRUNC: {
8669 SDValue Src = N.getOperand(0);
8670 EVT SrcVT = Src.getValueType();
8671 // Truncated source must be a simple vector.
8672 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
8673 (SrcVT.getScalarSizeInBits() % 8) != 0)
8674 return false;
8675 unsigned NumSrcElts = SrcVT.getVectorNumElements();
8676 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
8677 unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt;
8678 assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation");
8679 for (unsigned i = 0; i != NumSrcElts; ++i)
8680 Mask.push_back(i * Scale);
8681 Mask.append(NumElts - NumSrcElts, SM_SentinelZero);
8682 Ops.push_back(Src);
8683 return true;
8684 }
8685 case X86ISD::VSHLI:
8686 case X86ISD::VSRLI: {
8687 uint64_t ShiftVal = N.getConstantOperandVal(1);
8688 // Out of range bit shifts are guaranteed to be zero.
8689 if (NumBitsPerElt <= ShiftVal) {
8690 Mask.append(NumElts, SM_SentinelZero);
8691 return true;
8692 }
8693
8694 // We can only decode 'whole byte' bit shifts as shuffles.
8695 if ((ShiftVal % 8) != 0)
8696 break;
8697
8698 uint64_t ByteShift = ShiftVal / 8;
8699 Ops.push_back(N.getOperand(0));
8700
8701 // Clear mask to all zeros and insert the shifted byte indices.
8702 Mask.append(NumSizeInBytes, SM_SentinelZero);
8703
8704 if (X86ISD::VSHLI == Opcode) {
8705 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
8706 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
8707 Mask[i + j] = i + j - ByteShift;
8708 } else {
8709 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
8710 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
8711 Mask[i + j - ByteShift] = i + j;
8712 }
8713 return true;
8714 }
8715 case X86ISD::VROTLI:
8716 case X86ISD::VROTRI: {
8717 // We can only decode 'whole byte' bit rotates as shuffles.
8718 uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt);
8719 if ((RotateVal % 8) != 0)
8720 return false;
8721 Ops.push_back(N.getOperand(0));
8722 int Offset = RotateVal / 8;
8723 Offset = (X86ISD::VROTLI == Opcode ? NumBytesPerElt - Offset : Offset);
8724 for (int i = 0; i != (int)NumElts; ++i) {
8725 int BaseIdx = i * NumBytesPerElt;
8726 for (int j = 0; j != (int)NumBytesPerElt; ++j) {
8727 Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt));
8728 }
8729 }
8730 return true;
8731 }
8732 case X86ISD::VBROADCAST: {
8733 SDValue Src = N.getOperand(0);
8734 if (!Src.getSimpleValueType().isVector()) {
8735 if (Src.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8736 !isNullConstant(Src.getOperand(1)) ||
8737 Src.getOperand(0).getValueType().getScalarType() !=
8738 VT.getScalarType())
8739 return false;
8740 Src = Src.getOperand(0);
8741 }
8742 Ops.push_back(Src);
8743 Mask.append(NumElts, 0);
8744 return true;
8745 }
8746 case ISD::ZERO_EXTEND:
8747 case ISD::ANY_EXTEND:
8748 case ISD::ZERO_EXTEND_VECTOR_INREG:
8749 case ISD::ANY_EXTEND_VECTOR_INREG: {
8750 SDValue Src = N.getOperand(0);
8751 EVT SrcVT = Src.getValueType();
8752
8753 // Extended source must be a simple vector.
8754 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
8755 (SrcVT.getScalarSizeInBits() % 8) != 0)
8756 return false;
8757
8758 bool IsAnyExtend =
8759 (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
8760 DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts,
8761 IsAnyExtend, Mask);
8762 Ops.push_back(Src);
8763 return true;
8764 }
8765 }
8766
8767 return false;
8768}
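
For the VSHLI/VSRLI case above, only whole-byte shifts can be expressed as byte shuffles. A standalone sketch that reproduces the left-shift mask construction (SM_SentinelZero assumed to be -2; names are illustrative):

#include <cstdio>
#include <vector>

constexpr int SM_SentinelZero = -2; // assumed sentinel value

// Byte-level mask for VSHLI by ShiftBits within each NumBytesPerElt-wide lane.
static std::vector<int> shlByteMask(unsigned NumSizeInBytes,
                                    unsigned NumBytesPerElt,
                                    unsigned ShiftBits) {
  unsigned ByteShift = ShiftBits / 8;
  std::vector<int> Mask(NumSizeInBytes, SM_SentinelZero);
  for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
    for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
      Mask[i + j] = i + j - ByteShift;
  return Mask;
}

int main() {
  // v2i64 shifted left by 16 bits: <Z,Z,0,1,2,3,4,5, Z,Z,8,9,10,11,12,13>.
  for (int M : shlByteMask(16, 8, 16))
    printf("%d ", M);
  printf("\n");
}
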
8769
8770/// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
8771static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
8772 SmallVectorImpl<int> &Mask) {
8773 int MaskWidth = Mask.size();
8774 SmallVector<SDValue, 16> UsedInputs;
8775 for (int i = 0, e = Inputs.size(); i < e; ++i) {
8776 int lo = UsedInputs.size() * MaskWidth;
8777 int hi = lo + MaskWidth;
8778
8779 // Strip UNDEF input usage.
8780 if (Inputs[i].isUndef())
8781 for (int &M : Mask)
8782 if ((lo <= M) && (M < hi))
8783 M = SM_SentinelUndef;
8784
8785 // Check for unused inputs.
8786 if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
8787 for (int &M : Mask)
8788 if (lo <= M)
8789 M -= MaskWidth;
8790 continue;
8791 }
8792
8793 // Check for repeated inputs.
8794 bool IsRepeat = false;
8795 for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) {
8796 if (UsedInputs[j] != Inputs[i])
8797 continue;
8798 for (int &M : Mask)
8799 if (lo <= M)
8800 M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
8801 IsRepeat = true;
8802 break;
8803 }
8804 if (IsRepeat)
8805 continue;
8806
8807 UsedInputs.push_back(Inputs[i]);
8808 }
8809 Inputs = UsedInputs;
8810}
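
A simplified standalone model of the repeated-input fold above: once input Dup is found to match the earlier input Orig, mask indices into Dup's slot are redirected into Orig's slot and higher indices shift down by one mask width (names are illustrative; the real code walks UsedInputs incrementally):

#include <cassert>
#include <vector>

// Drop a repeated shuffle input slot Dup in favour of the earlier identical
// slot Orig, keeping the mask consistent with the shortened input list.
static void foldRepeatedInput(std::vector<int> &Mask, int MaskWidth, int Orig,
                              int Dup) {
  int lo = Dup * MaskWidth, hi = lo + MaskWidth;
  for (int &M : Mask) {
    if (lo <= M && M < hi)
      M = (M - lo) + Orig * MaskWidth;
    else if (M >= hi)
      M -= MaskWidth;
  }
}

int main() {
  // Inputs {A, B, A}: references to the second copy of A (slot 2) fold back
  // onto slot 0, and there is no slot above it to shift down.
  std::vector<int> Mask = {0, 5, 9, 11};
  foldRepeatedInput(Mask, /*MaskWidth=*/4, /*Orig=*/0, /*Dup=*/2);
  assert(Mask == std::vector<int>({0, 5, 1, 3}));
}
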
8811
8812/// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's inputs
8813/// and then sets the SM_SentinelUndef and SM_SentinelZero values.
8814/// Returns true if the target shuffle mask was decoded.
8815static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
8816 SmallVectorImpl<SDValue> &Inputs,
8817 SmallVectorImpl<int> &Mask,
8818 APInt &KnownUndef, APInt &KnownZero,
8819 const SelectionDAG &DAG, unsigned Depth,
8820 bool ResolveKnownElts) {
8821 if (Depth >= SelectionDAG::MaxRecursionDepth)
8822 return false; // Limit search depth.
8823
8824 EVT VT = Op.getValueType();
8825 if (!VT.isSimple() || !VT.isVector())
8826 return false;
8827
8828 if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) {
8829 if (ResolveKnownElts)
8830 resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero);
8831 return true;
8832 }
8833 if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth,
8834 ResolveKnownElts)) {
8835 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
8836 return true;
8837 }
8838 return false;
8839}
8840
8841static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
8842 SmallVectorImpl<SDValue> &Inputs,
8843 SmallVectorImpl<int> &Mask,
8844 const SelectionDAG &DAG, unsigned Depth,
8845 bool ResolveKnownElts) {
8846 APInt KnownUndef, KnownZero;
8847 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef,
8848 KnownZero, DAG, Depth, ResolveKnownElts);
8849}
8850
8851static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
8852 SmallVectorImpl<int> &Mask,
8853 const SelectionDAG &DAG, unsigned Depth = 0,
8854 bool ResolveKnownElts = true) {
8855 EVT VT = Op.getValueType();
8856 if (!VT.isSimple() || !VT.isVector())
8857 return false;
8858
8859 unsigned NumElts = Op.getValueType().getVectorNumElements();
8860 APInt DemandedElts = APInt::getAllOnes(NumElts);
8861 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, DAG, Depth,
8862 ResolveKnownElts);
8863}
8864
8865// Attempt to create a scalar/subvector broadcast from the base MemSDNode.
8866static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT,
8867 EVT MemVT, MemSDNode *Mem, unsigned Offset,
8868 SelectionDAG &DAG) {
8869 assert((Opcode == X86ISD::VBROADCAST_LOAD ||
8870         Opcode == X86ISD::SUBV_BROADCAST_LOAD) &&
8871        "Unknown broadcast load type");
8872
8873 // Ensure this is a simple (non-atomic, non-volatile), temporal read memop.
8874 if (!Mem || !Mem->readMem() || !Mem->isSimple() || Mem->isNonTemporal())
8875 return SDValue();
8876
8877 SDValue Ptr =
8878 DAG.getMemBasePlusOffset(Mem->getBasePtr(), TypeSize::Fixed(Offset), DL);
8879 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
8880 SDValue Ops[] = {Mem->getChain(), Ptr};
8881 SDValue BcstLd = DAG.getMemIntrinsicNode(
8882 Opcode, DL, Tys, Ops, MemVT,
8883 DAG.getMachineFunction().getMachineMemOperand(
8884 Mem->getMemOperand(), Offset, MemVT.getStoreSize()));
8885 DAG.makeEquivalentMemoryOrdering(SDValue(Mem, 1), BcstLd.getValue(1));
8886 return BcstLd;
8887}
8888
8889/// Returns the scalar element that will make up the i'th
8890/// element of the result of the vector shuffle.
8891static SDValue getShuffleScalarElt(SDValue Op, unsigned Index,
8892 SelectionDAG &DAG, unsigned Depth) {
8893 if (Depth >= SelectionDAG::MaxRecursionDepth)
8894 return SDValue(); // Limit search depth.
8895
8896 EVT VT = Op.getValueType();
8897 unsigned Opcode = Op.getOpcode();
8898 unsigned NumElems = VT.getVectorNumElements();
8899
8900 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
8901 if (auto *SV = dyn_cast<ShuffleVectorSDNode>(Op)) {
8902 int Elt = SV->getMaskElt(Index);
8903
8904 if (Elt < 0)
8905 return DAG.getUNDEF(VT.getVectorElementType());
8906
8907 SDValue Src = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1);
8908 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
8909 }
8910
8911 // Recurse into target specific vector shuffles to find scalars.
8912 if (isTargetShuffle(Opcode)) {
8913 MVT ShufVT = VT.getSimpleVT();
8914 MVT ShufSVT = ShufVT.getVectorElementType();
8915 int NumElems = (int)ShufVT.getVectorNumElements();
8916 SmallVector<int, 16> ShuffleMask;
8917 SmallVector<SDValue, 16> ShuffleOps;
8918 if (!getTargetShuffleMask(Op.getNode(), ShufVT, true, ShuffleOps,
8919 ShuffleMask))
8920 return SDValue();
8921
8922 int Elt = ShuffleMask[Index];
8923 if (Elt == SM_SentinelZero)
8924 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(Op), ShufSVT)
8925 : DAG.getConstantFP(+0.0, SDLoc(Op), ShufSVT);
8926 if (Elt == SM_SentinelUndef)
8927 return DAG.getUNDEF(ShufSVT);
8928
8929 assert(0 <= Elt && Elt < (2 * NumElems) && "Shuffle index out of range");
8930 SDValue Src = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
8931 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
8932 }
8933
8934 // Recurse into insert_subvector base/sub vector to find scalars.
8935 if (Opcode == ISD::INSERT_SUBVECTOR) {
8936 SDValue Vec = Op.getOperand(0);
8937 SDValue Sub = Op.getOperand(1);
8938 uint64_t SubIdx = Op.getConstantOperandVal(2);
8939 unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
8940
8941 if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
8942 return getShuffleScalarElt(Sub, Index - SubIdx, DAG, Depth + 1);
8943 return getShuffleScalarElt(Vec, Index, DAG, Depth + 1);
8944 }
8945
8946 // Recurse into concat_vectors sub vector to find scalars.
8947 if (Opcode == ISD::CONCAT_VECTORS) {
8948 EVT SubVT = Op.getOperand(0).getValueType();
8949 unsigned NumSubElts = SubVT.getVectorNumElements();
8950 uint64_t SubIdx = Index / NumSubElts;
8951 uint64_t SubElt = Index % NumSubElts;
8952 return getShuffleScalarElt(Op.getOperand(SubIdx), SubElt, DAG, Depth + 1);
8953 }
8954
8955 // Recurse into extract_subvector src vector to find scalars.
8956 if (Opcode == ISD::EXTRACT_SUBVECTOR) {
8957 SDValue Src = Op.getOperand(0);
8958 uint64_t SrcIdx = Op.getConstantOperandVal(1);
8959 return getShuffleScalarElt(Src, Index + SrcIdx, DAG, Depth + 1);
8960 }
8961
8962 // We only peek through bitcasts of the same vector width.
8963 if (Opcode == ISD::BITCAST) {
8964 SDValue Src = Op.getOperand(0);
8965 EVT SrcVT = Src.getValueType();
8966 if (SrcVT.isVector() && SrcVT.getVectorNumElements() == NumElems)
8967 return getShuffleScalarElt(Src, Index, DAG, Depth + 1);
8968 return SDValue();
8969 }
8970
8971 // Actual nodes that may contain scalar elements
8972
8973 // For insert_vector_elt - either return the index matching scalar or recurse
8974 // into the base vector.
8975 if (Opcode == ISD::INSERT_VECTOR_ELT &&
8976 isa<ConstantSDNode>(Op.getOperand(2))) {
8977 if (Op.getConstantOperandAPInt(2) == Index)
8978 return Op.getOperand(1);
8979 return getShuffleScalarElt(Op.getOperand(0), Index, DAG, Depth + 1);
8980 }
8981
8982 if (Opcode == ISD::SCALAR_TO_VECTOR)
8983 return (Index == 0) ? Op.getOperand(0)
8984 : DAG.getUNDEF(VT.getVectorElementType());
8985
8986 if (Opcode == ISD::BUILD_VECTOR)
8987 return Op.getOperand(Index);
8988
8989 return SDValue();
8990}
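
A standalone model of the shuffle step above: pick the source according to whether the mask index falls in the first or second half, then look up (the real code recurses into) that source. The helper name and flat int sources are illustrative only:

#include <cassert>
#include <optional>
#include <vector>

// Which source scalar produces result element Index of a two-input shuffle?
static std::optional<int> shuffleScalarAt(const std::vector<int> &Mask,
                                          const std::vector<int> &Src0,
                                          const std::vector<int> &Src1,
                                          unsigned Index) {
  int NumElems = (int)Mask.size();
  int Elt = Mask[Index];
  if (Elt < 0)                       // undef / zero sentinel
    return std::nullopt;
  const std::vector<int> &Src = Elt < NumElems ? Src0 : Src1;
  return Src[Elt % NumElems];
}

int main() {
  // Mask <2,6,3,7> interleaves the high halves of the two sources.
  assert(*shuffleScalarAt({2, 6, 3, 7}, {10, 11, 12, 13}, {20, 21, 22, 23}, 1) == 22);
}
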
8991
8992// Use PINSRB/PINSRW/PINSRD to create a build vector.
8993static SDValue LowerBuildVectorAsInsert(SDValue Op, const APInt &NonZeroMask,
8994 unsigned NumNonZero, unsigned NumZero,
8995 SelectionDAG &DAG,
8996 const X86Subtarget &Subtarget) {
8997 MVT VT = Op.getSimpleValueType();
8998 unsigned NumElts = VT.getVectorNumElements();
8999 assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
9000         ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
9001        "Illegal vector insertion");
9002
9003 SDLoc dl(Op);
9004 SDValue V;
9005 bool First = true;
9006
9007 for (unsigned i = 0; i < NumElts; ++i) {
9008 bool IsNonZero = NonZeroMask[i];
9009 if (!IsNonZero)
9010 continue;
9011
9012 // If the build vector contains zeros or our first insertion is not the
9013 // first index, then insert into a zero vector to break any register
9014 // dependency; otherwise use SCALAR_TO_VECTOR.
9015 if (First) {
9016 First = false;
9017 if (NumZero || 0 != i)
9018 V = getZeroVector(VT, Subtarget, DAG, dl);
9019 else {
9020 assert(0 == i && "Expected insertion into zero-index");
9021 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
9022 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
9023 V = DAG.getBitcast(VT, V);
9024 continue;
9025 }
9026 }
9027 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V, Op.getOperand(i),
9028 DAG.getIntPtrConstant(i, dl));
9029 }
9030
9031 return V;
9032}
9033
9034/// Custom lower build_vector of v16i8.
9035static SDValue LowerBuildVectorv16i8(SDValue Op, const APInt &NonZeroMask,
9036 unsigned NumNonZero, unsigned NumZero,
9037 SelectionDAG &DAG,
9038 const X86Subtarget &Subtarget) {
9039 if (NumNonZero > 8 && !Subtarget.hasSSE41())
9040 return SDValue();
9041
9042 // SSE4.1 - use PINSRB to insert each byte directly.
9043 if (Subtarget.hasSSE41())
9044 return LowerBuildVectorAsInsert(Op, NonZeroMask, NumNonZero, NumZero, DAG,
9045 Subtarget);
9046
9047 SDLoc dl(Op);
9048 SDValue V;
9049
9050 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
9051 for (unsigned i = 0; i < 16; i += 2) {
9052 bool ThisIsNonZero = NonZeroMask[i];
9053 bool NextIsNonZero = NonZeroMask[i + 1];
9054 if (!ThisIsNonZero && !NextIsNonZero)
9055 continue;
9056
9057 // FIXME: Investigate combining the first 4 bytes as an i32 instead.
9058 SDValue Elt;
9059 if (ThisIsNonZero) {
9060 if (NumZero || NextIsNonZero)
9061 Elt = DAG.getZExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
9062 else
9063 Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
9064 }
9065
9066 if (NextIsNonZero) {
9067 SDValue NextElt = Op.getOperand(i + 1);
9068 if (i == 0 && NumZero)
9069 NextElt = DAG.getZExtOrTrunc(NextElt, dl, MVT::i32);
9070 else
9071 NextElt = DAG.getAnyExtOrTrunc(NextElt, dl, MVT::i32);
9072 NextElt = DAG.getNode(ISD::SHL, dl, MVT::i32, NextElt,
9073 DAG.getConstant(8, dl, MVT::i8));
9074 if (ThisIsNonZero)
9075 Elt = DAG.getNode(ISD::OR, dl, MVT::i32, NextElt, Elt);
9076 else
9077 Elt = NextElt;
9078 }
9079
9080 // If our first insertion is not the first index or zeros are needed, then
9081 // insert into zero vector. Otherwise, use SCALAR_TO_VECTOR (leaves high
9082 // elements undefined).
9083 if (!V) {
9084 if (i != 0 || NumZero)
9085 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
9086 else {
9087 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Elt);
9088 V = DAG.getBitcast(MVT::v8i16, V);
9089 continue;
9090 }
9091 }
9092 Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Elt);
9093 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, Elt,
9094 DAG.getIntPtrConstant(i / 2, dl));
9095 }
9096
9097 return DAG.getBitcast(MVT::v16i8, V);
9098}
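
The pre-SSE4.1 path above builds each 16-bit PINSRW element from a pair of adjacent bytes, with the odd byte shifted into the high half. The scalar arithmetic, as a standalone sketch (helper name is illustrative):

#include <cassert>
#include <cstdint>

// Byte-pair merge: bytes 2*i and 2*i+1 of the build vector are combined into
// one 16-bit value so a single PINSRW can insert both.
static uint16_t mergeBytePair(uint8_t Lo, uint8_t Hi) {
  return (uint16_t)(Lo | ((uint32_t)Hi << 8));
}

int main() {
  assert(mergeBytePair(0x34, 0x12) == 0x1234);
}
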
9099
9100/// Custom lower build_vector of v8i16.
9101static SDValue LowerBuildVectorv8i16(SDValue Op, const APInt &NonZeroMask,
9102 unsigned NumNonZero, unsigned NumZero,
9103 SelectionDAG &DAG,
9104 const X86Subtarget &Subtarget) {
9105 if (NumNonZero > 4 && !Subtarget.hasSSE41())
9106 return SDValue();
9107
9108 // Use PINSRW to insert each byte directly.
9109 return LowerBuildVectorAsInsert(Op, NonZeroMask, NumNonZero, NumZero, DAG,
9110 Subtarget);
9111}
9112
9113/// Custom lower build_vector of v4i32 or v4f32.
9114static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
9115 const X86Subtarget &Subtarget) {
9116 // If this is a splat of a pair of elements, use MOVDDUP (unless the target
9117 // has XOP; in that case defer lowering to potentially use VPERMIL2PS).
9118 // Because we're creating a less complicated build vector here, we may enable
9119 // further folding of the MOVDDUP via shuffle transforms.
9120 if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&
9121 Op.getOperand(0) == Op.getOperand(2) &&
9122 Op.getOperand(1) == Op.getOperand(3) &&
9123 Op.getOperand(0) != Op.getOperand(1)) {
9124 SDLoc DL(Op);
9125 MVT VT = Op.getSimpleValueType();
9126 MVT EltVT = VT.getVectorElementType();
9127 // Create a new build vector with the first 2 elements followed by undef
9128 // padding, bitcast to v2f64, duplicate, and bitcast back.
9129 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
9130 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
9131 SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops));
9132 SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV);
9133 return DAG.getBitcast(VT, Dup);
9134 }
9135
9136 // Find all zeroable elements.
9137 std::bitset<4> Zeroable, Undefs;
9138 for (int i = 0; i < 4; ++i) {
9139 SDValue Elt = Op.getOperand(i);
9140 Undefs[i] = Elt.isUndef();
9141 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
9142 }
9143 assert(Zeroable.size() - Zeroable.count() > 1 &&
9144        "We expect at least two non-zero elements!");
9145
9146 // We only know how to deal with build_vector nodes where elements are either
9147 // zeroable or extract_vector_elt with constant index.
9148 SDValue FirstNonZero;
9149 unsigned FirstNonZeroIdx;
9150 for (unsigned i = 0; i < 4; ++i) {
9151 if (Zeroable[i])
9152 continue;
9153 SDValue Elt = Op.getOperand(i);
9154 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9155 !isa<ConstantSDNode>(Elt.getOperand(1)))
9156 return SDValue();
9157 // Make sure that this node is extracting from a 128-bit vector.
9158 MVT VT = Elt.getOperand(0).getSimpleValueType();
9159 if (!VT.is128BitVector())
9160 return SDValue();
9161 if (!FirstNonZero.getNode()) {
9162 FirstNonZero = Elt;
9163 FirstNonZeroIdx = i;
9164 }
9165 }
9166
9167 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
9168 SDValue V1 = FirstNonZero.getOperand(0);
9169 MVT VT = V1.getSimpleValueType();
9170
9171 // See if this build_vector can be lowered as a blend with zero.
9172 SDValue Elt;
9173 unsigned EltMaskIdx, EltIdx;
9174 int Mask[4];
9175 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
9176 if (Zeroable[EltIdx]) {
9177 // The zero vector will be on the right hand side.
9178 Mask[EltIdx] = EltIdx+4;
9179 continue;
9180 }
9181
9182 Elt = Op->getOperand(EltIdx);
9183 // By construction, Elt is an EXTRACT_VECTOR_ELT with a constant index.
9184 EltMaskIdx = Elt.getConstantOperandVal(1);
9185 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
9186 break;
9187 Mask[EltIdx] = EltIdx;
9188 }
9189
9190 if (EltIdx == 4) {
9191 // Let the shuffle legalizer deal with blend operations.
9192 SDValue VZeroOrUndef = (Zeroable == Undefs)
9193 ? DAG.getUNDEF(VT)
9194 : getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
9195 if (V1.getSimpleValueType() != VT)
9196 V1 = DAG.getBitcast(VT, V1);
9197 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask);
9198 }
9199
9200 // See if we can lower this build_vector to an INSERTPS.
9201 if (!Subtarget.hasSSE41())
9202 return SDValue();
9203
9204 SDValue V2 = Elt.getOperand(0);
9205 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
9206 V1 = SDValue();
9207
9208 bool CanFold = true;
9209 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
9210 if (Zeroable[i])
9211 continue;
9212
9213 SDValue Current = Op->getOperand(i);
9214 SDValue SrcVector = Current->getOperand(0);
9215 if (!V1.getNode())
9216 V1 = SrcVector;
9217 CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i);
9218 }
9219
9220 if (!CanFold)
9221 return SDValue();
9222
9223 assert(V1.getNode() && "Expected at least two non-zero elements!");
9224 if (V1.getSimpleValueType() != MVT::v4f32)
9225 V1 = DAG.getBitcast(MVT::v4f32, V1);
9226 if (V2.getSimpleValueType() != MVT::v4f32)
9227 V2 = DAG.getBitcast(MVT::v4f32, V2);
9228
9229 // Ok, we can emit an INSERTPS instruction.
9230 unsigned ZMask = Zeroable.to_ulong();
9231
9232 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
9233 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
9234 SDLoc DL(Op);
9235 SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
9236 DAG.getIntPtrConstant(InsertPSMask, DL, true));
9237 return DAG.getBitcast(VT, Result);
9238}
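
[Aside, not part of the analyzed source] A minimal standalone sketch of the INSERTPS immediate packing performed at line 9232 above; the helper name packInsertPSImm is illustrative only.

#include <cassert>
#include <cstdint>

// Bits [7:6] select the source lane, bits [5:4] the destination slot, and
// bits [3:0] the zero mask, matching "EltMaskIdx << 6 | EltIdx << 4 | ZMask".
static uint8_t packInsertPSImm(unsigned SrcElt, unsigned DstElt, unsigned ZeroMask) {
  assert(SrcElt < 4 && DstElt < 4 && ZeroMask < 16 && "field out of range");
  return static_cast<uint8_t>((SrcElt << 6) | (DstElt << 4) | ZeroMask);
}
// Example: source lane 2 into destination slot 1, zeroing lane 3 -> 0x98.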
9239
9240/// Return a vector logical shift node.
9241static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
9242 SelectionDAG &DAG, const TargetLowering &TLI,
9243 const SDLoc &dl) {
9244 assert(VT.is128BitVector() && "Unknown type for VShift");
9245 MVT ShVT = MVT::v16i8;
9246 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
9247 SrcOp = DAG.getBitcast(ShVT, SrcOp);
9248 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
9249 SDValue ShiftVal = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8);
9250 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
9251}
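
[Aside, not part of the analyzed source] A trivial standalone sketch of the bit-to-byte conversion getVShift relies on: VSHLDQ/VSRLDQ shift whole bytes of a v16i8 value, so the bit count must be a multiple of 8. The helper name is illustrative only.

#include <cassert>

static unsigned vshiftByteAmount(unsigned NumBits) {
  assert(NumBits % 8 == 0 && "only byte-sized shifts are supported");
  return NumBits / 8; // e.g. a 32-bit logical shift becomes a 4-byte shuffle
}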
9252
9253static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
9254 SelectionDAG &DAG) {
9255
9256 // Check if the scalar load can be widened into a vector load. And if
9257 // the address is "base + cst" see if the cst can be "absorbed" into
9258 // the shuffle mask.
9259 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
9260 SDValue Ptr = LD->getBasePtr();
9261 if (!ISD::isNormalLoad(LD) || !LD->isSimple())
9262 return SDValue();
9263 EVT PVT = LD->getValueType(0);
9264 if (PVT != MVT::i32 && PVT != MVT::f32)
9265 return SDValue();
9266
9267 int FI = -1;
9268 int64_t Offset = 0;
9269 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
9270 FI = FINode->getIndex();
9271 Offset = 0;
9272 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
9273 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
9274 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
9275 Offset = Ptr.getConstantOperandVal(1);
9276 Ptr = Ptr.getOperand(0);
9277 } else {
9278 return SDValue();
9279 }
9280
9281 // FIXME: 256-bit vector instructions don't require a strict alignment,
9282 // improve this code to support it better.
9283 Align RequiredAlign(VT.getSizeInBits() / 8);
9284 SDValue Chain = LD->getChain();
9285 // Make sure the stack object alignment is at least 16 or 32.
9286 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
9287 MaybeAlign InferredAlign = DAG.InferPtrAlign(Ptr);
9288 if (!InferredAlign || *InferredAlign < RequiredAlign) {
9289 if (MFI.isFixedObjectIndex(FI)) {
9290 // Can't change the alignment. FIXME: It's possible to compute
9291 // the exact stack offset and reference FI + adjust offset instead.
9292 // If someone *really* cares about this, that's the way to implement it.
9293 return SDValue();
9294 } else {
9295 MFI.setObjectAlignment(FI, RequiredAlign);
9296 }
9297 }
9298
9299 // (Offset % 16 or 32) must be a multiple of 4. The address is then
9300 // Ptr + (Offset & ~15).
9301 if (Offset < 0)
9302 return SDValue();
9303 if ((Offset % RequiredAlign.value()) & 3)
9304 return SDValue();
9305 int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1);
9306 if (StartOffset) {
9307 SDLoc DL(Ptr);
9308 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
9309 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
9310 }
9311
9312 int EltNo = (Offset - StartOffset) >> 2;
9313 unsigned NumElems = VT.getVectorNumElements();
9314
9315 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
9316 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
9317 LD->getPointerInfo().getWithOffset(StartOffset));
9318
9319 SmallVector<int, 8> Mask(NumElems, EltNo);
9320
9321 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
9322 }
9323
9324 return SDValue();
9325}
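
[Aside, not part of the analyzed source] A standalone sketch of the offset arithmetic at lines 9301-9312, assuming 32-bit elements; names are illustrative only.

#include <cstdint>

struct SplatAddressing { int64_t StartOffset; int EltNo; };

// Round the offset down to the required vector alignment; the remainder (a
// multiple of 4 bytes) selects which 32-bit element of the wide load to splat.
static SplatAddressing splitSplatOffset(int64_t Offset, int64_t RequiredAlign) {
  int64_t StartOffset = Offset & ~(RequiredAlign - 1);
  int EltNo = static_cast<int>((Offset - StartOffset) >> 2);
  return {StartOffset, EltNo};
}
// Example: splitSplatOffset(20, 16) -> {16, 1}: load 16 bytes at base+16 and
// splat element 1.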
9326
9327 // Recurse to find a LoadSDNode source and the accumulated ByteOffset.
9328static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
9329 if (ISD::isNON_EXTLoad(Elt.getNode())) {
9330 auto *BaseLd = cast<LoadSDNode>(Elt);
9331 if (!BaseLd->isSimple())
9332 return false;
9333 Ld = BaseLd;
9334 ByteOffset = 0;
9335 return true;
9336 }
9337
9338 switch (Elt.getOpcode()) {
9339 case ISD::BITCAST:
9340 case ISD::TRUNCATE:
9341 case ISD::SCALAR_TO_VECTOR:
9342 return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);
9343 case ISD::SRL:
9344 if (auto *AmtC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
9345 uint64_t Amt = AmtC->getZExtValue();
9346 if ((Amt % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {
9347 ByteOffset += Amt / 8;
9348 return true;
9349 }
9350 }
9351 break;
9352 case ISD::EXTRACT_VECTOR_ELT:
9353 if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
9354 SDValue Src = Elt.getOperand(0);
9355 unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
9356 unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();
9357 if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
9358 findEltLoadSrc(Src, Ld, ByteOffset)) {
9359 uint64_t Idx = IdxC->getZExtValue();
9360 ByteOffset += Idx * (SrcSizeInBits / 8);
9361 return true;
9362 }
9363 }
9364 break;
9365 }
9366
9367 return false;
9368}
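
[Aside, not part of the analyzed source] A standalone sketch of the byte-offset bookkeeping findEltLoadSrc performs; both helpers are illustrative only and return -1 where the real code rejects the pattern.

#include <cstdint>

// A right shift by a multiple of 8 bits moves the value Amt/8 bytes into the load.
static int64_t offsetAfterSrl(int64_t ByteOffset, uint64_t ShiftAmtBits) {
  return (ShiftAmtBits % 8 == 0) ? ByteOffset + int64_t(ShiftAmtBits / 8) : -1;
}

// Extracting vector element Idx moves the value Idx * element-size bytes.
static int64_t offsetAfterExtract(int64_t ByteOffset, uint64_t Idx,
                                  unsigned EltSizeInBits) {
  return (EltSizeInBits % 8 == 0)
             ? ByteOffset + int64_t(Idx * (EltSizeInBits / 8))
             : -1;
}
// Example: element 3 of a v4i32 built from one 128-bit load corresponds to
// bytes [12,16) of that load.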
9369
9370/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
9371/// elements can be replaced by a single large load which has the same value as
9372/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
9373///
9374/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
9375static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
9376 const SDLoc &DL, SelectionDAG &DAG,
9377 const X86Subtarget &Subtarget,
9378 bool IsAfterLegalize) {
9379 if ((VT.getScalarSizeInBits() % 8) != 0)
9380 return SDValue();
9381
9382 unsigned NumElems = Elts.size();
9383
9384 int LastLoadedElt = -1;
9385 APInt LoadMask = APInt::getZero(NumElems);
9386 APInt ZeroMask = APInt::getZero(NumElems);
9387 APInt UndefMask = APInt::getZero(NumElems);
9388
9389 SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
9390 SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
9391
9392 // For each element in the initializer, see if we've found a load, zero or an
9393 // undef.
9394 for (unsigned i = 0; i < NumElems; ++i) {
9395 SDValue Elt = peekThroughBitcasts(Elts[i]);
9396 if (!Elt.getNode())
9397 return SDValue();
9398 if (Elt.isUndef()) {
9399 UndefMask.setBit(i);
9400 continue;
9401 }
9402 if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode())) {
9403 ZeroMask.setBit(i);
9404 continue;
9405 }
9406
9407 // Each loaded element must be the correct fractional portion of the
9408 // requested vector load.
9409 unsigned EltSizeInBits = Elt.getValueSizeInBits();
9410 if ((NumElems * EltSizeInBits) != VT.getSizeInBits())
9411 return SDValue();
9412
9413 if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
9414 return SDValue();
9415 unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
9416 if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
9417 return SDValue();
9418
9419 LoadMask.setBit(i);
9420 LastLoadedElt = i;
9421 }
9422 assert((ZeroMask.popcount() + UndefMask.popcount() + LoadMask.popcount()) ==
9423 NumElems &&
9424 "Incomplete element masks");
9425
9426 // Handle Special Cases - all undef or undef/zero.
9427 if (UndefMask.popcount() == NumElems)
9428 return DAG.getUNDEF(VT);
9429 if ((ZeroMask.popcount() + UndefMask.popcount()) == NumElems)
9430 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
9431 : DAG.getConstantFP(0.0, DL, VT);
9432
9433 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9434 int FirstLoadedElt = LoadMask.countr_zero();
9435 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
9436 EVT EltBaseVT = EltBase.getValueType();
9437 assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
9438 "Register/Memory size mismatch");
9439 LoadSDNode *LDBase = Loads[FirstLoadedElt];
9440 assert(LDBase && "Did not find base load for merging consecutive loads");
9441 unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
9442 unsigned BaseSizeInBytes = BaseSizeInBits / 8;
9443 int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt);
9444 int LoadSizeInBits = NumLoadedElts * BaseSizeInBits;
9445 assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
9446
9447 // TODO: Support offsetting the base load.
9448 if (ByteOffsets[FirstLoadedElt] != 0)
9449 return SDValue();
9450
9451 // Check to see if the element's load is consecutive to the base load
9452 // or offset from a previous (already checked) load.
9453 auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
9454 LoadSDNode *Ld = Loads[EltIdx];
9455 int64_t ByteOffset = ByteOffsets[EltIdx];
9456 if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
9457 int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
9458 return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
9459 Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
9460 }
9461 return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes,
9462 EltIdx - FirstLoadedElt);
9463 };
9464
9465 // Consecutive loads can contain UNDEFS but not ZERO elements.
9466 // Consecutive loads with UNDEF and ZERO elements require an
9467 // additional shuffle stage to clear the ZERO elements.
9468 bool IsConsecutiveLoad = true;
9469 bool IsConsecutiveLoadWithZeros = true;
9470 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
9471 if (LoadMask[i]) {
9472 if (!CheckConsecutiveLoad(LDBase, i)) {
9473 IsConsecutiveLoad = false;
9474 IsConsecutiveLoadWithZeros = false;
9475 break;
9476 }
9477 } else if (ZeroMask[i]) {
9478 IsConsecutiveLoad = false;
9479 }
9480 }
9481
9482 auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
9483 auto MMOFlags = LDBase->getMemOperand()->getFlags();
9484 assert(LDBase->isSimple() &&
9485 "Cannot merge volatile or atomic loads.");
9486 SDValue NewLd =
9487 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
9488 LDBase->getPointerInfo(), LDBase->getOriginalAlign(),
9489 MMOFlags);
9490 for (auto *LD : Loads)
9491 if (LD)
9492 DAG.makeEquivalentMemoryOrdering(LD, NewLd);
9493 return NewLd;
9494 };
9495
9496 // Check if the base load is entirely dereferenceable.
9497 bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
9498 VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());
9499
9500 // LOAD - all consecutive load/undefs (must start/end with a load or be
9501 // entirely dereferenceable). If we have found an entire vector of loads and
9502 // undefs, then return a large load of the entire vector width starting at the
9503 // base pointer. If the vector contains zeros, then attempt to shuffle those
9504 // elements.
9505 if (FirstLoadedElt == 0 &&
9506 (NumLoadedElts == (int)NumElems || IsDereferenceable) &&
9507 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
9508 if (IsAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
9509 return SDValue();
9510
9511 // Don't create 256-bit non-temporal aligned loads without AVX2 as these
9512 // will lower to regular temporal loads and use the cache.
9513 if (LDBase->isNonTemporal() && LDBase->getAlign() >= Align(32) &&
9514 VT.is256BitVector() && !Subtarget.hasInt256())
9515 return SDValue();
9516
9517 if (NumElems == 1)
9518 return DAG.getBitcast(VT, Elts[FirstLoadedElt]);
9519
9520 if (!ZeroMask)
9521 return CreateLoad(VT, LDBase);
9522
9523 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
9524 // vector and a zero vector to clear out the zero elements.
9525 if (!IsAfterLegalize && VT.isVector()) {
9526 unsigned NumMaskElts = VT.getVectorNumElements();
9527 if ((NumMaskElts % NumElems) == 0) {
9528 unsigned Scale = NumMaskElts / NumElems;
9529 SmallVector<int, 4> ClearMask(NumMaskElts, -1);
9530 for (unsigned i = 0; i < NumElems; ++i) {
9531 if (UndefMask[i])
9532 continue;
9533 int Offset = ZeroMask[i] ? NumMaskElts : 0;
9534 for (unsigned j = 0; j != Scale; ++j)
9535 ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
9536 }
9537 SDValue V = CreateLoad(VT, LDBase);
9538 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
9539 : DAG.getConstantFP(0.0, DL, VT);
9540 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
9541 }
9542 }
9543 }
9544
9545 // If the upper half of a ymm/zmm load is undef then just load the lower half.
9546 if (VT.is256BitVector() || VT.is512BitVector()) {
9547 unsigned HalfNumElems = NumElems / 2;
9548 if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnes()) {
9549 EVT HalfVT =
9550 EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
9551 SDValue HalfLD =
9552 EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
9553 DAG, Subtarget, IsAfterLegalize);
9554 if (HalfLD)
9555 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
9556 HalfLD, DAG.getIntPtrConstant(0, DL));
9557 }
9558 }
9559
9560 // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
9561 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
9562 ((LoadSizeInBits == 16 && Subtarget.hasFP16()) || LoadSizeInBits == 32 ||
9563 LoadSizeInBits == 64) &&
9564 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
9565 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
9566 : MVT::getIntegerVT(LoadSizeInBits);
9567 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
9568 // Allow v4f32 on SSE1 only targets.
9569 // FIXME: Add more isel patterns so we can just use VT directly.
9570 if (!Subtarget.hasSSE2() && VT == MVT::v4f32)
9571 VecVT = MVT::v4f32;
9572 if (TLI.isTypeLegal(VecVT)) {
9573 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
9574 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
9575 SDValue ResNode = DAG.getMemIntrinsicNode(
9576 X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(),
9577 LDBase->getOriginalAlign(), MachineMemOperand::MOLoad);
9578 for (auto *LD : Loads)
9579 if (LD)
9580 DAG.makeEquivalentMemoryOrdering(LD, ResNode);
9581 return DAG.getBitcast(VT, ResNode);
9582 }
9583 }
9584
9585 // BROADCAST - match the smallest possible repetition pattern, load that
9586 // scalar/subvector element and then broadcast to the entire vector.
9587 if (ZeroMask.isZero() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
9588 (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) {
9589 for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
9590 unsigned RepeatSize = SubElems * BaseSizeInBits;
9591 unsigned ScalarSize = std::min(RepeatSize, 64u);
9592 if (!Subtarget.hasAVX2() && ScalarSize < 32)
9593 continue;
9594
9595 // Don't attempt a 1:N subvector broadcast - it should be caught by
9596 // combineConcatVectorOps, else will cause infinite loops.
9597 if (RepeatSize > ScalarSize && SubElems == 1)
9598 continue;
9599
9600 bool Match = true;
9601 SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
9602 for (unsigned i = 0; i != NumElems && Match; ++i) {
9603 if (!LoadMask[i])
9604 continue;
9605 SDValue Elt = peekThroughBitcasts(Elts[i]);
9606 if (RepeatedLoads[i % SubElems].isUndef())
9607 RepeatedLoads[i % SubElems] = Elt;
9608 else
9609 Match &= (RepeatedLoads[i % SubElems] == Elt);
9610 }
9611
9612 // We must have loads at both ends of the repetition.
9613 Match &= !RepeatedLoads.front().isUndef();
9614 Match &= !RepeatedLoads.back().isUndef();
9615 if (!Match)
9616 continue;
9617
9618 EVT RepeatVT =
9619 VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64))
9620 ? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)
9621 : EVT::getFloatingPointVT(ScalarSize);
9622 if (RepeatSize > ScalarSize)
9623 RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,
9624 RepeatSize / ScalarSize);
9625 EVT BroadcastVT =
9626 EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),
9627 VT.getSizeInBits() / ScalarSize);
9628 if (TLI.isTypeLegal(BroadcastVT)) {
9629 if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
9630 RepeatVT, RepeatedLoads, DL, DAG, Subtarget, IsAfterLegalize)) {
9631 SDValue Broadcast = RepeatLoad;
9632 if (RepeatSize > ScalarSize) {
9633 while (Broadcast.getValueSizeInBits() < VT.getSizeInBits())
9634 Broadcast = concatSubVectors(Broadcast, Broadcast, DAG, DL);
9635 } else {
9636 if (!Subtarget.hasAVX2() &&
9637 !X86::mayFoldLoadIntoBroadcastFromMem(
9638 RepeatLoad, RepeatVT.getScalarType().getSimpleVT(),
9639 Subtarget,
9640 /*AssumeSingleUse=*/true))
9641 return SDValue();
9642 Broadcast =
9643 DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, RepeatLoad);
9644 }
9645 return DAG.getBitcast(VT, Broadcast);
9646 }
9647 }
9648 }
9649 }
9650
9651 return SDValue();
9652}
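
[Aside, not part of the analyzed source] A standalone sketch of the ClearMask construction at lines 9529-9535: each source element expands to Scale mask entries, and entries for known-zero elements index into the second (all-zero) shuffle operand. Names are illustrative only.

#include <vector>

static std::vector<int> buildClearMask(const std::vector<bool> &ZeroMask,
                                       const std::vector<bool> &UndefMask,
                                       unsigned Scale) {
  unsigned NumElems = ZeroMask.size();
  unsigned NumMaskElts = NumElems * Scale;
  std::vector<int> Mask(NumMaskElts, -1); // -1 == undef lane
  for (unsigned i = 0; i < NumElems; ++i) {
    if (UndefMask[i])
      continue;
    int Offset = ZeroMask[i] ? int(NumMaskElts) : 0; // second operand is zero
    for (unsigned j = 0; j != Scale; ++j)
      Mask[i * Scale + j] = int(i * Scale + j) + Offset;
  }
  return Mask;
}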
9653
9654 // Combine a vector op (shuffles etc.) that is equal to build_vector load1,
9655// load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses
9656// are consecutive, non-overlapping, and in the right order.
9657static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL,
9658 SelectionDAG &DAG,
9659 const X86Subtarget &Subtarget,
9660 bool IsAfterLegalize) {
9661 SmallVector<SDValue, 64> Elts;
9662 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
9663 if (SDValue Elt = getShuffleScalarElt(Op, i, DAG, 0)) {
9664 Elts.push_back(Elt);
9665 continue;
9666 }
9667 return SDValue();
9668 }
9669 assert(Elts.size() == VT.getVectorNumElements());
9670 return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget,
9671 IsAfterLegalize);
9672}
9673
9674static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
9675 unsigned SplatBitSize, LLVMContext &C) {
9676 unsigned ScalarSize = VT.getScalarSizeInBits();
9677 unsigned NumElm = SplatBitSize / ScalarSize;
9678
9679 SmallVector<Constant *, 32> ConstantVec;
9680 for (unsigned i = 0; i < NumElm; i++) {
9681 APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i);
9682 Constant *Const;
9683 if (VT.isFloatingPoint()) {
9684 if (ScalarSize == 16) {
9685 Const = ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
9686 } else if (ScalarSize == 32) {
9687 Const = ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
9688 } else {
9689 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
9690 Const = ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
9691 }
9692 } else
9693 Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
9694 ConstantVec.push_back(Const);
9695 }
9696 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
9697}
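
[Aside, not part of the analyzed source] A simplified standalone sketch of how getConstantVector slices a wide splat value into per-element constants, restricted to splats of at most 64 bits; names are illustrative only.

#include <cstdint>
#include <vector>

// Element i takes bits [i*ScalarBits, (i+1)*ScalarBits) of the splat value,
// mirroring APInt::extractBits(ScalarSize, ScalarSize * i) above.
static std::vector<uint64_t> sliceSplat(uint64_t SplatValue, unsigned SplatBits,
                                        unsigned ScalarBits) {
  std::vector<uint64_t> Elts;
  uint64_t Mask = (ScalarBits == 64) ? ~0ULL : ((1ULL << ScalarBits) - 1);
  for (unsigned i = 0; i < SplatBits / ScalarBits; ++i)
    Elts.push_back((SplatValue >> (i * ScalarBits)) & Mask);
  return Elts;
}
// Example: sliceSplat(0x0000000100000002ULL, 64, 32) yields {2, 1}.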
9698
9699static bool isFoldableUseOfShuffle(SDNode *N) {
9700 for (auto *U : N->uses()) {
9701 unsigned Opc = U->getOpcode();
9702 // VPERMV/VPERMV3 shuffles can never fold their index operands.
9703 if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
9704 return false;
9705 if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
9706 return false;
9707 if (isTargetShuffle(Opc))
9708 return true;
9709 if (Opc == ISD::BITCAST) // Ignore bitcasts
9710 return isFoldableUseOfShuffle(U);
9711 if (N->hasOneUse()) {
9712 // TODO: there may be some general way to know if an SDNode can
9713 // be folded. We now only know whether an MI is foldable.
9714 if (Opc == X86ISD::VPDPBUSD && U->getOperand(2).getNode() != N)
9715 return false;
9716 return true;
9717 }
9718 }
9719 return false;
9720}
9721
9722/// Attempt to use the vbroadcast instruction to generate a splat value
9723/// from a splat BUILD_VECTOR which uses:
9724/// a. A single scalar load, or a constant.
9725/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
9726///
9727/// The VBROADCAST node is returned when a pattern is found,
9728/// or SDValue() otherwise.
9729static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
9730 const X86Subtarget &Subtarget,
9731 SelectionDAG &DAG) {
9732 // VBROADCAST requires AVX.
9733 // TODO: Splats could be generated for non-AVX CPUs using SSE
9734 // instructions, but there's less potential gain for only 128-bit vectors.
9735 if (!Subtarget.hasAVX())
9736 return SDValue();
9737
9738 MVT VT = BVOp->getSimpleValueType(0);
9739 unsigned NumElts = VT.getVectorNumElements();
9740 SDLoc dl(BVOp);
9741
9742 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
9743 "Unsupported vector type for broadcast.");
9744
9745 // See if the build vector is a repeating sequence of scalars (inc. splat).
9746 SDValue Ld;
9747 BitVector UndefElements;
9748 SmallVector<SDValue, 16> Sequence;
9749 if (BVOp->getRepeatedSequence(Sequence, &UndefElements)) {
9750 assert((NumElts % Sequence.size()) == 0 && "Sequence doesn't fit.");
9751 if (Sequence.size() == 1)
9752 Ld = Sequence[0];
9753 }
9754
9755 // Attempt to use VBROADCASTM
9756 // From this pattern:
9757 // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
9758 // b. t1 = (build_vector t0 t0)
9759 //
9760 // Create (VBROADCASTM v2i1 X)
9761 if (!Sequence.empty() && Subtarget.hasCDI()) {
9762 // If not a splat, are the upper sequence values zeroable?
9763 unsigned SeqLen = Sequence.size();
9764 bool UpperZeroOrUndef =
9765 SeqLen == 1 ||
9766 llvm::all_of(ArrayRef(Sequence).drop_front(), [](SDValue V) {
9767 return !V || V.isUndef() || isNullConstant(V);
9768 });
9769 SDValue Op0 = Sequence[0];
9770 if (UpperZeroOrUndef && ((Op0.getOpcode() == ISD::BITCAST) ||
9771 (Op0.getOpcode() == ISD::ZERO_EXTEND &&
9772 Op0.getOperand(0).getOpcode() == ISD::BITCAST))) {
9773 SDValue BOperand = Op0.getOpcode() == ISD::BITCAST
9774 ? Op0.getOperand(0)
9775 : Op0.getOperand(0).getOperand(0);
9776 MVT MaskVT = BOperand.getSimpleValueType();
9777 MVT EltType = MVT::getIntegerVT(VT.getScalarSizeInBits() * SeqLen);
9778 if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
9779 (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
9780 MVT BcstVT = MVT::getVectorVT(EltType, NumElts / SeqLen);
9781 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
9782 unsigned Scale = 512 / VT.getSizeInBits();
9783 BcstVT = MVT::getVectorVT(EltType, Scale * (NumElts / SeqLen));
9784 }
9785 SDValue Bcst = DAG.getNode(X86ISD::VBROADCASTM, dl, BcstVT, BOperand);
9786 if (BcstVT.getSizeInBits() != VT.getSizeInBits())
9787 Bcst = extractSubVector(Bcst, 0, DAG, dl, VT.getSizeInBits());
9788 return DAG.getBitcast(VT, Bcst);
9789 }
9790 }
9791 }
9792
9793 unsigned NumUndefElts = UndefElements.count();
9794 if (!Ld || (NumElts - NumUndefElts) <= 1) {
9795 APInt SplatValue, Undef;
9796 unsigned SplatBitSize;
9797 bool HasUndef;
9798 // Check if this is a repeated constant pattern suitable for broadcasting.
9799 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
9800 SplatBitSize > VT.getScalarSizeInBits() &&
9801 SplatBitSize < VT.getSizeInBits()) {
9802 // Avoid replacing with broadcast when it's a use of a shuffle
9803 // instruction to preserve the present custom lowering of shuffles.
9804 if (isFoldableUseOfShuffle(BVOp))
9805 return SDValue();
9806 // replace BUILD_VECTOR with broadcast of the repeated constants.
9807 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9808 LLVMContext *Ctx = DAG.getContext();
9809 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
9810 if (Subtarget.hasAVX()) {
9811 if (SplatBitSize == 32 || SplatBitSize == 64 ||
9812 (SplatBitSize < 32 && Subtarget.hasAVX2())) {
9813 // Splatted value can fit in one INTEGER constant in constant pool.
9814 // Load the constant and broadcast it.
9815 MVT CVT = MVT::getIntegerVT(SplatBitSize);
9816 Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
9817 Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
9818 SDValue CP = DAG.getConstantPool(C, PVT);
9819 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
9820
9821 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
9822 SDVTList Tys =
9823 DAG.getVTList(MVT::getVectorVT(CVT, Repeat), MVT::Other);
9824 SDValue Ops[] = {DAG.getEntryNode(), CP};
9825 MachinePointerInfo MPI =
9826 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
9827 SDValue Brdcst = DAG.getMemIntrinsicNode(
9828 X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT, MPI, Alignment,
9829 MachineMemOperand::MOLoad);
9830 return DAG.getBitcast(VT, Brdcst);
9831 }
9832 if (SplatBitSize > 64) {
9833 // Load the vector of constants and broadcast it.
9834 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
9835 *Ctx);
9836 SDValue VCP = DAG.getConstantPool(VecC, PVT);
9837 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
9838 MVT VVT = MVT::getVectorVT(VT.getScalarType(), NumElm);
9839 Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign();
9840 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9841 SDValue Ops[] = {DAG.getEntryNode(), VCP};
9842 MachinePointerInfo MPI =
9843 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
9844 return DAG.getMemIntrinsicNode(
9845 X86ISD::SUBV_BROADCAST_LOAD, dl, Tys, Ops, VVT, MPI, Alignment,
9846 MachineMemOperand::MOLoad);
9847 }
9848 }
9849 }
9850
9851 // If we are moving a scalar into a vector (Ld must be set and all elements
9852 // but 1 are undef) and that operation is not obviously supported by
9853 // vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.
9854 // That's better than general shuffling and may eliminate a load to GPR and
9855 // move from scalar to vector register.
9856 if (!Ld || NumElts - NumUndefElts != 1)
9857 return SDValue();
9858 unsigned ScalarSize = Ld.getValueSizeInBits();
9859 if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
9860 return SDValue();
9861 }
9862
9863 bool ConstSplatVal =
9864 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
9865 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
9866
9867 // TODO: Handle broadcasts of non-constant sequences.
9868
9869 // Make sure that all of the users of a non-constant load are from the
9870 // BUILD_VECTOR node.
9871 // FIXME: Is the use count needed for non-constant, non-load case?
9872 if (!ConstSplatVal && !IsLoad && !BVOp->isOnlyUserOf(Ld.getNode()))
9873 return SDValue();
9874
9875 unsigned ScalarSize = Ld.getValueSizeInBits();
9876 bool IsGE256 = (VT.getSizeInBits() >= 256);
9877
9878 // When optimizing for size, generate up to 5 extra bytes for a broadcast
9879 // instruction to save 8 or more bytes of constant pool data.
9880 // TODO: If multiple splats are generated to load the same constant,
9881 // it may be detrimental to overall size. There needs to be a way to detect
9882 // that condition to know if this is truly a size win.
9883 bool OptForSize = DAG.shouldOptForSize();
9884
9885 // Handle broadcasting a single constant scalar from the constant pool
9886 // into a vector.
9887 // On Sandybridge (no AVX2), it is still better to load a constant vector
9888 // from the constant pool and not to broadcast it from a scalar.
9889 // But override that restriction when optimizing for size.
9890 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
9891 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
9892 EVT CVT = Ld.getValueType();
9893 assert(!CVT.isVector() && "Must not broadcast a vector type");
9894
9895 // Splat f16, f32, i32, v4f64, v4i64 in all cases with AVX2.
9896 // For size optimization, also splat v2f64 and v2i64, and for size opt
9897 // with AVX2, also splat i8 and i16.
9898 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
9899 if (ScalarSize == 32 ||
9900 (ScalarSize == 64 && (IsGE256 || Subtarget.hasVLX())) ||
9901 CVT == MVT::f16 ||
9902 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
9903 const Constant *C = nullptr;
9904 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
9905 C = CI->getConstantIntValue();
9906 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
9907 C = CF->getConstantFPValue();
9908
9909 assert(C && "Invalid constant type")(static_cast <bool> (C && "Invalid constant type"
) ? void (0) : __assert_fail ("C && \"Invalid constant type\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 9909, __extension__
__PRETTY_FUNCTION__))
;
9910
9911 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9912 SDValue CP =
9913 DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
9914 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
9915
9916 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9917 SDValue Ops[] = {DAG.getEntryNode(), CP};
9918 MachinePointerInfo MPI =
9919 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
9920 return DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
9921 MPI, Alignment, MachineMemOperand::MOLoad);
9922 }
9923 }
9924
9925 // Handle AVX2 in-register broadcasts.
9926 if (!IsLoad && Subtarget.hasInt256() &&
9927 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
9928 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
9929
9930 // The scalar source must be a normal load.
9931 if (!IsLoad)
9932 return SDValue();
9933
9934 // Make sure the non-chain result is only used by this build vector.
9935 if (!Ld->hasNUsesOfValue(NumElts - NumUndefElts, 0))
9936 return SDValue();
9937
9938 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
9939 (Subtarget.hasVLX() && ScalarSize == 64)) {
9940 auto *LN = cast<LoadSDNode>(Ld);
9941 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9942 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
9943 SDValue BCast =
9944 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
9945 LN->getMemoryVT(), LN->getMemOperand());
9946 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
9947 return BCast;
9948 }
9949
9950 // The integer check is needed for the 64-bit element into 128-bit vector case,
9951 // so it doesn't match double, since there is no vbroadcastsd xmm instruction.
9952 if (Subtarget.hasInt256() && Ld.getValueType().isInteger() &&
9953 (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) {
9954 auto *LN = cast<LoadSDNode>(Ld);
9955 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9956 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
9957 SDValue BCast =
9958 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
9959 LN->getMemoryVT(), LN->getMemOperand());
9960 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
9961 return BCast;
9962 }
9963
9964 if (ScalarSize == 16 && Subtarget.hasFP16() && IsGE256)
9965 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
9966
9967 // Unsupported broadcast.
9968 return SDValue();
9969}
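
[Aside, not part of the analyzed source] A standalone sketch of the repetition search at lines 9589-9614: try power-of-two sub-sequence lengths and accept the first one whose defined elements repeat across the whole vector, with defined values required at both ends. The element type is modelled as an int and names are illustrative only.

#include <optional>
#include <vector>

static std::optional<unsigned>
smallestRepeat(const std::vector<std::optional<int>> &Elts) { // nullopt = undef
  unsigned NumElems = Elts.size();
  for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
    bool Match = true;
    std::vector<std::optional<int>> Rep(SubElems);
    for (unsigned i = 0; i != NumElems && Match; ++i) {
      if (!Elts[i])
        continue;
      if (!Rep[i % SubElems])
        Rep[i % SubElems] = Elts[i];
      else
        Match = Match && (*Rep[i % SubElems] == *Elts[i]);
    }
    if (Match && Rep.front() && Rep.back())
      return SubElems;
  }
  return std::nullopt;
}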
9970
9971/// For an EXTRACT_VECTOR_ELT with a constant index return the real
9972/// underlying vector and index.
9973///
9974/// Modifies \p ExtractedFromVec to the real vector and returns the real
9975/// index.
9976static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
9977 SDValue ExtIdx) {
9978 int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
9979 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
9980 return Idx;
9981
9982 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
9983 // lowered this:
9984 // (extract_vector_elt (v8f32 %1), Constant<6>)
9985 // to:
9986 // (extract_vector_elt (vector_shuffle<2,u,u,u>
9987 // (extract_subvector (v8f32 %0), Constant<4>),
9988 // undef)
9989 // Constant<0>)
9990 // In this case the vector is the extract_subvector expression and the index
9991 // is 2, as specified by the shuffle.
9992 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
9993 SDValue ShuffleVec = SVOp->getOperand(0);
9994 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
9995 assert(ShuffleVecVT.getVectorElementType() ==
9996 ExtractedFromVec.getSimpleValueType().getVectorElementType());
9997
9998 int ShuffleIdx = SVOp->getMaskElt(Idx);
9999 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
10000 ExtractedFromVec = ShuffleVec;
10001 return ShuffleIdx;
10002 }
10003 return Idx;
10004}
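
[Aside, not part of the analyzed source] A standalone sketch of the index remapping described in the comment block above: extracting lane Idx from a shuffle is equivalent to extracting the lane its mask entry names from the shuffle's first input, provided that entry is undef or within the first input. Names are illustrative only.

#include <vector>

static int remapExtractIndex(const std::vector<int> &ShuffleMask, int Idx,
                             unsigned FirstInputNumElts) {
  int ShuffleIdx = ShuffleMask[Idx]; // -1 models an undef mask element
  if (ShuffleIdx < int(FirstInputNumElts))
    return ShuffleIdx; // extract from the shuffle's first operand instead
  return Idx;          // mask points at the second operand; keep the original
}
// Example: mask <2,u,u,u> and Idx == 0 returns 2, matching the comment above.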
10005
10006static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
10007 MVT VT = Op.getSimpleValueType();
10008
10009 // Skip if insert_vec_elt is not supported.
10010 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
10011 if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
10012 return SDValue();
10013
10014 SDLoc DL(Op);
10015 unsigned NumElems = Op.getNumOperands();
10016
10017 SDValue VecIn1;
10018 SDValue VecIn2;
10019 SmallVector<unsigned, 4> InsertIndices;
10020 SmallVector<int, 8> Mask(NumElems, -1);
10021
10022 for (unsigned i = 0; i != NumElems; ++i) {
10023 unsigned Opc = Op.getOperand(i).getOpcode();
10024
10025 if (Opc == ISD::UNDEF)
10026 continue;
10027
10028 if (Opc != ISD::EXTRACT_VECTOR_ELT) {
10029 // Quit if more than 1 elements need inserting.
10030 if (InsertIndices.size() > 1)
10031 return SDValue();
10032
10033 InsertIndices.push_back(i);
10034 continue;
10035 }
10036
10037 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
10038 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
10039
10040 // Quit if non-constant index.
10041 if (!isa<ConstantSDNode>(ExtIdx))
10042 return SDValue();
10043 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
10044
10045 // Quit if extracted from vector of different type.
10046 if (ExtractedFromVec.getValueType() != VT)
10047 return SDValue();
10048
10049 if (!VecIn1.getNode())
10050 VecIn1 = ExtractedFromVec;
10051 else if (VecIn1 != ExtractedFromVec) {
10052 if (!VecIn2.getNode())
10053 VecIn2 = ExtractedFromVec;
10054 else if (VecIn2 != ExtractedFromVec)
10055 // Quit if more than 2 vectors to shuffle
10056 return SDValue();
10057 }
10058
10059 if (ExtractedFromVec == VecIn1)
10060 Mask[i] = Idx;
10061 else if (ExtractedFromVec == VecIn2)
10062 Mask[i] = Idx + NumElems;
10063 }
10064
10065 if (!VecIn1.getNode())
10066 return SDValue();
10067
10068 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
10069 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
10070
10071 for (unsigned Idx : InsertIndices)
10072 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
10073 DAG.getIntPtrConstant(Idx, DL));
10074
10075 return NV;
10076}
10077
10078// Lower BUILD_VECTOR operation for v8bf16, v16bf16 and v32bf16 types.
10079static SDValue LowerBUILD_VECTORvXbf16(SDValue Op, SelectionDAG &DAG,
10080 const X86Subtarget &Subtarget) {
10081 MVT VT = Op.getSimpleValueType();
10082 MVT IVT = VT.changeVectorElementTypeToInteger();
10083 SmallVector<SDValue, 16> NewOps;
10084 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I)
10085 NewOps.push_back(DAG.getBitcast(MVT::i16, Op.getOperand(I)));
10086 SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(), IVT, NewOps);
10087 return DAG.getBitcast(VT, Res);
10088}
10089
10090// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
10091static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
10092 const X86Subtarget &Subtarget) {
10093
10094 MVT VT = Op.getSimpleValueType();
10095 assert((VT.getVectorElementType() == MVT::i1) &&
10096 "Unexpected type in LowerBUILD_VECTORvXi1!");
10097
10098 SDLoc dl(Op);
10099 if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
10100 ISD::isBuildVectorAllOnes(Op.getNode()))
10101 return Op;
10102
10103 uint64_t Immediate = 0;
10104 SmallVector<unsigned, 16> NonConstIdx;
10105 bool IsSplat = true;
10106 bool HasConstElts = false;
10107 int SplatIdx = -1;
10108 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
10109 SDValue In = Op.getOperand(idx);
10110 if (In.isUndef())
10111 continue;
10112 if (auto *InC = dyn_cast<ConstantSDNode>(In)) {
10113 Immediate |= (InC->getZExtValue() & 0x1) << idx;
10114 HasConstElts = true;
10115 } else {
10116 NonConstIdx.push_back(idx);
10117 }
10118 if (SplatIdx < 0)
10119 SplatIdx = idx;
10120 else if (In != Op.getOperand(SplatIdx))
10121 IsSplat = false;
10122 }
10123
10124 // For a splat, use "(select i1 splat_elt, all-ones, all-zeroes)".
10125 if (IsSplat) {
10126 // The build_vector allows the scalar element to be larger than the vector
10127 // element type. We need to mask it to use as a condition unless we know
10128 // the upper bits are zero.
10129 // FIXME: Use computeKnownBits instead of checking specific opcode?
10130 SDValue Cond = Op.getOperand(SplatIdx);
10131 assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!");
10132 if (Cond.getOpcode() != ISD::SETCC)
10133 Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond,
10134 DAG.getConstant(1, dl, MVT::i8));
10135
10136 // Perform the select in the scalar domain so we can use cmov.
10137 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
10138 SDValue Select = DAG.getSelect(dl, MVT::i32, Cond,
10139 DAG.getAllOnesConstant(dl, MVT::i32),
10140 DAG.getConstant(0, dl, MVT::i32));
10141 Select = DAG.getBitcast(MVT::v32i1, Select);
10142 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Select, Select);
10143 } else {
10144 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
10145 SDValue Select = DAG.getSelect(dl, ImmVT, Cond,
10146 DAG.getAllOnesConstant(dl, ImmVT),
10147 DAG.getConstant(0, dl, ImmVT));
10148 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
10149 Select = DAG.getBitcast(VecVT, Select);
10150 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Select,
10151 DAG.getIntPtrConstant(0, dl));
10152 }
10153 }
10154
10155 // insert elements one by one
10156 SDValue DstVec;
10157 if (HasConstElts) {
10158 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
10159 SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32);
10160 SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32);
10161 ImmL = DAG.getBitcast(MVT::v32i1, ImmL);
10162 ImmH = DAG.getBitcast(MVT::v32i1, ImmH);
10163 DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH);
10164 } else {
10165 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
10166 SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT);
10167 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
10168 DstVec = DAG.getBitcast(VecVT, Imm);
10169 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec,
10170 DAG.getIntPtrConstant(0, dl));
10171 }
10172 } else
10173 DstVec = DAG.getUNDEF(VT);
10174
10175 for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
10176 unsigned InsertIdx = NonConstIdx[i];
10177 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
10178 Op.getOperand(InsertIdx),
10179 DAG.getIntPtrConstant(InsertIdx, dl));
10180 }
10181 return DstVec;
10182}
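
[Aside, not part of the analyzed source] A standalone sketch of the constant-bit packing at lines 10112-10116: each constant boolean operand contributes one bit of an immediate mask, while non-constant operands are recorded for later INSERT_VECTOR_ELTs. Names are illustrative only.

#include <cstdint>
#include <optional>
#include <vector>

static uint64_t packConstantBits(const std::vector<std::optional<bool>> &Elts,
                                 std::vector<unsigned> &NonConstIdx) {
  uint64_t Immediate = 0;
  for (unsigned idx = 0; idx < Elts.size(); ++idx) {
    if (Elts[idx])
      Immediate |= uint64_t(*Elts[idx] ? 1 : 0) << idx;
    else
      NonConstIdx.push_back(idx);
  }
  return Immediate;
}
// Example: elements {1, nullopt, 1, 0} give Immediate == 0b0101 and
// NonConstIdx == {1}.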
10183
10184 LLVM_ATTRIBUTE_UNUSED static bool isHorizOp(unsigned Opcode) {
10185 switch (Opcode) {
10186 case X86ISD::PACKSS:
10187 case X86ISD::PACKUS:
10188 case X86ISD::FHADD:
10189 case X86ISD::FHSUB:
10190 case X86ISD::HADD:
10191 case X86ISD::HSUB:
10192 return true;
10193 }
10194 return false;
10195}
10196
10197/// This is a helper function of LowerToHorizontalOp().
10198/// This function checks that the build_vector \p N in input implements a
10199/// 128-bit partial horizontal operation on a 256-bit vector, but that operation
10200/// may not match the layout of an x86 256-bit horizontal instruction.
10201/// In other words, if this returns true, then some extraction/insertion will
10202/// be required to produce a valid horizontal instruction.
10203///
10204/// Parameter \p Opcode defines the kind of horizontal operation to match.
10205/// For example, if \p Opcode is equal to ISD::ADD, then this function
10206/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
10207/// is equal to ISD::SUB, then this function checks if this is a horizontal
10208/// arithmetic sub.
10209///
10210/// This function only analyzes elements of \p N whose indices are
10211/// in range [BaseIdx, LastIdx).
10212///
10213/// TODO: This function was originally used to match both real and fake partial
10214/// horizontal operations, but the index-matching logic is incorrect for that.
10215/// See the corrected implementation in isHopBuildVector(). Can we reduce this
10216/// code because it is only used for partial h-op matching now?
10217static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
10218 SelectionDAG &DAG,
10219 unsigned BaseIdx, unsigned LastIdx,
10220 SDValue &V0, SDValue &V1) {
10221 EVT VT = N->getValueType(0);
10222 assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops")(static_cast <bool> (VT.is256BitVector() && "Only use for matching partial 256-bit h-ops"
) ? void (0) : __assert_fail ("VT.is256BitVector() && \"Only use for matching partial 256-bit h-ops\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 10222, __extension__
__PRETTY_FUNCTION__))
;
10223 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
10224 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
10225 "Invalid Vector in input!");
10226
10227 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
10228 bool CanFold = true;
10229 unsigned ExpectedVExtractIdx = BaseIdx;
10230 unsigned NumElts = LastIdx - BaseIdx;
10231 V0 = DAG.getUNDEF(VT);
10232 V1 = DAG.getUNDEF(VT);
10233
10234 // Check if N implements a horizontal binop.
10235 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
10236 SDValue Op = N->getOperand(i + BaseIdx);
10237
10238 // Skip UNDEFs.
10239 if (Op->isUndef()) {
10240 // Update the expected vector extract index.
10241 if (i * 2 == NumElts)
10242 ExpectedVExtractIdx = BaseIdx;
10243 ExpectedVExtractIdx += 2;
10244 continue;
10245 }
10246
10247 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
10248
10249 if (!CanFold)
10250 break;
10251
10252 SDValue Op0 = Op.getOperand(0);
10253 SDValue Op1 = Op.getOperand(1);
10254
10255 // Try to match the following pattern:
10256 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
10257 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
10258 Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
10259 Op0.getOperand(0) == Op1.getOperand(0) &&
10260 isa<ConstantSDNode>(Op0.getOperand(1)) &&
10261 isa<ConstantSDNode>(Op1.getOperand(1)));
10262 if (!CanFold)
10263 break;
10264
10265 unsigned I0 = Op0.getConstantOperandVal(1);
10266 unsigned I1 = Op1.getConstantOperandVal(1);
10267
10268 if (i * 2 < NumElts) {
10269 if (V0.isUndef()) {
10270 V0 = Op0.getOperand(0);
10271 if (V0.getValueType() != VT)
10272 return false;
10273 }
10274 } else {
10275 if (V1.isUndef()) {
10276 V1 = Op0.getOperand(0);
10277 if (V1.getValueType() != VT)
10278 return false;
10279 }
10280 if (i * 2 == NumElts)
10281 ExpectedVExtractIdx = BaseIdx;
10282 }
10283
10284 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
10285 if (I0 == ExpectedVExtractIdx)
10286 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
10287 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
10288 // Try to match the following dag sequence:
10289 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
10290 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
10291 } else
10292 CanFold = false;
10293
10294 ExpectedVExtractIdx += 2;
10295 }
10296
10297 return CanFold;
10298}
10299
10300/// Emit a sequence of two 128-bit horizontal add/sub followed by
10301/// a concat_vector.
10302///
10303/// This is a helper function of LowerToHorizontalOp().
10304/// This function expects two 256-bit vectors called V0 and V1.
10305/// At first, each vector is split into two separate 128-bit vectors.
10306/// Then, the resulting 128-bit vectors are used to implement two
10307/// horizontal binary operations.
10308///
10309/// The kind of horizontal binary operation is defined by \p X86Opcode.
10310///
10311 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed as input to
10312 /// the two new horizontal binops.
10313/// When Mode is set, the first horizontal binop dag node would take as input
10314/// the lower 128-bit of V0 and the upper 128-bit of V0. The second
10315/// horizontal binop dag node would take as input the lower 128-bit of V1
10316/// and the upper 128-bit of V1.
10317/// Example:
10318/// HADD V0_LO, V0_HI
10319/// HADD V1_LO, V1_HI
10320///
10321/// Otherwise, the first horizontal binop dag node takes as input the lower
10322/// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
10323/// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
10324/// Example:
10325/// HADD V0_LO, V1_LO
10326/// HADD V0_HI, V1_HI
10327///
10328/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
10329/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
10330/// the upper 128-bits of the result.
10331static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
10332 const SDLoc &DL, SelectionDAG &DAG,
10333 unsigned X86Opcode, bool Mode,
10334 bool isUndefLO, bool isUndefHI) {
10335 MVT VT = V0.getSimpleValueType();
10336 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
10337 "Invalid nodes in input!");
10338
10339 unsigned NumElts = VT.getVectorNumElements();
10340 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
10341 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
10342 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
10343 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
10344 MVT NewVT = V0_LO.getSimpleValueType();
10345
10346 SDValue LO = DAG.getUNDEF(NewVT);
10347 SDValue HI = DAG.getUNDEF(NewVT);
10348
10349 if (Mode) {
10350 // Don't emit a horizontal binop if the result is expected to be UNDEF.
10351 if (!isUndefLO && !V0->isUndef())
10352 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
10353 if (!isUndefHI && !V1->isUndef())
10354 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
10355 } else {
10356 // Don't emit a horizontal binop if the result is expected to be UNDEF.
10357 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
10358 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
10359
10360 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
10361 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
10362 }
10363
10364 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
10365}
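
[Aside, not part of the analyzed source] A scalar model of the 128-bit horizontal add these helpers emit: each result lane sums a pair of adjacent source elements, the lower two lanes from the first operand and the upper two from the second. With Mode set, the operands are the two halves of one 256-bit vector; otherwise they are the matching halves of V0 and V1.

#include <array>

static std::array<float, 4> haddModel(const std::array<float, 4> &A,
                                      const std::array<float, 4> &B) {
  return {A[0] + A[1], A[2] + A[3], B[0] + B[1], B[2] + B[3]};
}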
10366
10367/// Returns true iff \p BV builds a vector with the result equivalent to
10368 /// the result of an ADDSUB/SUBADD operation.
10369/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
10370/// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
10371/// \p Opnd0 and \p Opnd1.
10372static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
10373 const X86Subtarget &Subtarget, SelectionDAG &DAG,
10374 SDValue &Opnd0, SDValue &Opnd1,
10375 unsigned &NumExtracts,
10376 bool &IsSubAdd) {
10377
10378 MVT VT = BV->getSimpleValueType(0);
10379 if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
10380 return false;
10381
10382 unsigned NumElts = VT.getVectorNumElements();
10383 SDValue InVec0 = DAG.getUNDEF(VT);
10384 SDValue InVec1 = DAG.getUNDEF(VT);
10385
10386 NumExtracts = 0;
10387
10388 // Odd-numbered elements in the input build vector are obtained from
10389 // adding/subtracting two integer/float elements.
10390 // Even-numbered elements in the input build vector are obtained from
10391 // subtracting/adding two integer/float elements.
10392 unsigned Opc[2] = {0, 0};
10393 for (unsigned i = 0, e = NumElts; i != e; ++i) {
10394 SDValue Op = BV->getOperand(i);
10395
10396 // Skip 'undef' values.
10397 unsigned Opcode = Op.getOpcode();
10398 if (Opcode == ISD::UNDEF)
10399 continue;
10400
10401 // Early exit if we found an unexpected opcode.
10402 if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
10403 return false;
10404
10405 SDValue Op0 = Op.getOperand(0);
10406 SDValue Op1 = Op.getOperand(1);
10407
10408 // Try to match the following pattern:
10409 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
10410 // Early exit if we cannot match that sequence.
10411 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
10412 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
10413 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
10414 Op0.getOperand(1) != Op1.getOperand(1))
10415 return false;
10416
10417 unsigned I0 = Op0.getConstantOperandVal(1);
10418 if (I0 != i)
10419 return false;
10420
10421 // We found a valid add/sub node; make sure it's the same opcode as previous
10422 // elements for this parity.
10423 if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
10424 return false;
10425 Opc[i % 2] = Opcode;
10426
10427 // Update InVec0 and InVec1.
10428 if (InVec0.isUndef()) {
10429 InVec0 = Op0.getOperand(0);
10430 if (InVec0.getSimpleValueType() != VT)
10431 return false;
10432 }
10433 if (InVec1.isUndef()) {
10434 InVec1 = Op1.getOperand(0);
10435 if (InVec1.getSimpleValueType() != VT)
10436 return false;
10437 }
10438
10439 // Make sure that the operands of each add/sub node always
10440 // come from the same pair of vectors.
10441 if (InVec0 != Op0.getOperand(0)) {
10442 if (Opcode == ISD::FSUB)
10443 return false;
10444
10445 // FADD is commutable. Try to commute the operands
10446 // and then test again.
10447 std::swap(Op0, Op1);
10448 if (InVec0 != Op0.getOperand(0))
10449 return false;
10450 }
10451
10452 if (InVec1 != Op1.getOperand(0))
10453 return false;
10454
10455 // Increment the number of extractions done.
10456 ++NumExtracts;
10457 }
10458
10459 // Ensure we have found an opcode for both parities and that they are
10460 // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
10461 // inputs are undef.
10462 if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
10463 InVec0.isUndef() || InVec1.isUndef())
10464 return false;
10465
10466 IsSubAdd = Opc[0] == ISD::FADD;
10467
10468 Opnd0 = InVec0;
10469 Opnd1 = InVec1;
10470 return true;
10471}
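
A scalar reference (illustrative only, not LLVM code) for the pattern that isAddSubOrSubAdd matches above: ADDSUB subtracts in even lanes and adds in odd lanes, SUBADD does the opposite, which is why IsSubAdd is keyed off the opcode seen in even positions. The array names and values are arbitrary.

#include <cstdio>

int main() {
  double Opnd0[4] = {1, 2, 3, 4}, Opnd1[4] = {10, 20, 30, 40};
  double AddSub[4], SubAdd[4];
  for (int i = 0; i != 4; ++i) {
    AddSub[i] = (i % 2) ? Opnd0[i] + Opnd1[i] : Opnd0[i] - Opnd1[i];
    SubAdd[i] = (i % 2) ? Opnd0[i] - Opnd1[i] : Opnd0[i] + Opnd1[i];
  }
  for (int i = 0; i != 4; ++i)
    printf("%g %g\n", AddSub[i], SubAdd[i]); // -9 11 / 22 -18 / -27 33 / 44 -36
  return 0;
}
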
10472
10473/// Returns true if it is possible to fold a MUL and an idiom that has already been
10474/// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
10475/// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
10476/// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2.
10477///
10478/// Prior to calling this function it should be known that there is some
10479/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
10480/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
10481/// before replacement of such SDNode with ADDSUB operation. Thus the number
10482/// of \p Opnd0 uses is expected to be equal to 2.
10483/// For example, this function may be called for the following IR:
10484/// %AB = fmul fast <2 x double> %A, %B
10485/// %Sub = fsub fast <2 x double> %AB, %C
10486/// %Add = fadd fast <2 x double> %AB, %C
10487/// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
10488/// <2 x i32> <i32 0, i32 3>
10489/// There is a def for %Addsub here, which potentially can be replaced by
10490/// X86ISD::ADDSUB operation:
10491/// %Addsub = X86ISD::ADDSUB %AB, %C
10492/// and such ADDSUB can further be replaced with FMADDSUB:
10493/// %Addsub = FMADDSUB %A, %B, %C.
10494///
10495/// The main reason why this method is called before the replacement of the
10496/// recognized ADDSUB idiom with ADDSUB operation is that such replacement
10497/// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
10498/// FMADDSUB is.
10499static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
10500 SelectionDAG &DAG,
10501 SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2,
10502 unsigned ExpectedUses) {
10503 if (Opnd0.getOpcode() != ISD::FMUL ||
10504 !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
10505 return false;
10506
10507 // FIXME: These checks must match the similar ones in
10508 // DAGCombiner::visitFADDForFMACombine. It would be good to have one
10509 // function that would answer if it is Ok to fuse MUL + ADD to FMADD
10510 // or MUL + ADDSUB to FMADDSUB.
10511 const TargetOptions &Options = DAG.getTarget().Options;
10512 bool AllowFusion =
10513 (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
10514 if (!AllowFusion)
10515 return false;
10516
10517 Opnd2 = Opnd1;
10518 Opnd1 = Opnd0.getOperand(1);
10519 Opnd0 = Opnd0.getOperand(0);
10520
10521 return true;
10522}
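
For reference, a minimal scalar model (an illustrative sketch, not LLVM code) of what the fused node computes per lane once isFMAddSubOrFMSubAdd folds the FMUL in: FMADDSUB subtracts the third operand in even lanes and adds it in odd lanes, while FMSUBADD does the reverse.

#include <cstdio>

int main() {
  double A[4] = {1, 2, 3, 4}, B[4] = {10, 10, 10, 10}, C[4] = {1, 1, 1, 1};
  double FMAddSub[4], FMSubAdd[4];
  for (int i = 0; i != 4; ++i) {
    double Mul = A[i] * B[i];
    FMAddSub[i] = (i % 2) ? Mul + C[i] : Mul - C[i];
    FMSubAdd[i] = (i % 2) ? Mul - C[i] : Mul + C[i];
  }
  for (int i = 0; i != 4; ++i)
    printf("%g %g\n", FMAddSub[i], FMSubAdd[i]); // 9 11 / 21 19 / 29 31 / 41 39
  return 0;
}
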
10523
10524/// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' or
10525/// 'fsubadd' operation accordingly to X86ISD::ADDSUB or X86ISD::FMADDSUB or
10526/// X86ISD::FMSUBADD node.
10527static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
10528 const X86Subtarget &Subtarget,
10529 SelectionDAG &DAG) {
10530 SDValue Opnd0, Opnd1;
10531 unsigned NumExtracts;
10532 bool IsSubAdd;
10533 if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts,
10534 IsSubAdd))
10535 return SDValue();
10536
10537 MVT VT = BV->getSimpleValueType(0);
10538 SDLoc DL(BV);
10539
10540 // Try to generate X86ISD::FMADDSUB node here.
10541 SDValue Opnd2;
10542 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts)) {
10543 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
10544 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
10545 }
10546
10547 // We only support ADDSUB.
10548 if (IsSubAdd)
10549 return SDValue();
10550
10551 // There are no known X86 targets with 512-bit ADDSUB instructions!
10552 // Convert to blend(fsub,fadd).
10553 if (VT.is512BitVector()) {
10554 SmallVector<int> Mask;
10555 for (int I = 0, E = VT.getVectorNumElements(); I != E; I += 2) {
10556 Mask.push_back(I);
10557 Mask.push_back(I + E + 1);
10558 }
10559 SDValue Sub = DAG.getNode(ISD::FSUB, DL, VT, Opnd0, Opnd1);
10560 SDValue Add = DAG.getNode(ISD::FADD, DL, VT, Opnd0, Opnd1);
10561 return DAG.getVectorShuffle(VT, DL, Sub, Add, Mask);
10562 }
10563
10564 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
10565}
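
The 512-bit fallback above relies on a blend mask that picks FSUB results for even lanes and FADD results for odd lanes. Here is a stand-alone sketch of that mask construction (illustrative; E = 16 is just an example element count such as v16f32):

#include <cstdio>
#include <vector>

int main() {
  const int E = 16; // number of vector elements, e.g. v16f32
  std::vector<int> Mask;
  for (int I = 0; I != E; I += 2) {
    Mask.push_back(I);         // lane I of the FSUB result (first input)
    Mask.push_back(I + E + 1); // lane I+1 of the FADD result (second input)
  }
  for (int M : Mask)
    printf("%d ", M); // 0 17 2 19 4 21 6 23 8 25 10 27 12 29 14 31
  printf("\n");
  return 0;
}
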
10566
10567static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,
10568 unsigned &HOpcode, SDValue &V0, SDValue &V1) {
10569 // Initialize outputs to known values.
10570 MVT VT = BV->getSimpleValueType(0);
10571 HOpcode = ISD::DELETED_NODE;
10572 V0 = DAG.getUNDEF(VT);
10573 V1 = DAG.getUNDEF(VT);
10574
10575 // x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
10576 // half of the result is calculated independently from the 128-bit halves of
10577 // the inputs, so that makes the index-checking logic below more complicated.
10578 unsigned NumElts = VT.getVectorNumElements();
10579 unsigned GenericOpcode = ISD::DELETED_NODE;
10580 unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1;
10581 unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
10582 unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
10583 for (unsigned i = 0; i != Num128BitChunks; ++i) {
10584 for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
10585 // Ignore undef elements.
10586 SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);
10587 if (Op.isUndef())
10588 continue;
10589
10590 // If there's an opcode mismatch, we're done.
10591 if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode)
10592 return false;
10593
10594 // Initialize horizontal opcode.
10595 if (HOpcode == ISD::DELETED_NODE) {
10596 GenericOpcode = Op.getOpcode();
10597 switch (GenericOpcode) {
10598 case ISD::ADD: HOpcode = X86ISD::HADD; break;
10599 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
10600 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
10601 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
10602 default: return false;
10603 }
10604 }
10605
10606 SDValue Op0 = Op.getOperand(0);
10607 SDValue Op1 = Op.getOperand(1);
10608 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
10609 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
10610 Op0.getOperand(0) != Op1.getOperand(0) ||
10611 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
10612 !isa<ConstantSDNode>(Op1.getOperand(1)) || !Op.hasOneUse())
10613 return false;
10614
10615 // The source vector is chosen based on which 64-bit half of the
10616 // destination vector is being calculated.
10617 if (j < NumEltsIn64Bits) {
10618 if (V0.isUndef())
10619 V0 = Op0.getOperand(0);
10620 } else {
10621 if (V1.isUndef())
10622 V1 = Op0.getOperand(0);
10623 }
10624
10625 SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
10626 if (SourceVec != Op0.getOperand(0))
10627 return false;
10628
10629 // op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
10630 unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
10631 unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
10632 unsigned ExpectedIndex = i * NumEltsIn128Bits +
10633 (j % NumEltsIn64Bits) * 2;
10634 if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
10635 continue;
10636
10637 // If this is not a commutative op, this does not match.
10638 if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD)
10639 return false;
10640
10641 // Addition is commutative, so try swapping the extract indexes.
10642 // op (extract_vector_elt A, I+1), (extract_vector_elt A, I)
10643 if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
10644 continue;
10645
10646 // Extract indexes do not match horizontal requirement.
10647 return false;
10648 }
10649 }
10650 // We matched. Opcode and operands are returned by reference as arguments.
10651 return true;
10652}
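
To make the index arithmetic in isHopBuildVector concrete, here is a small stand-alone C++ table generator (illustrative only) for a 256-bit v8i32: it prints, for each result element, the pair of source elements the horizontal op is expected to consume, using exactly the ExpectedIndex formula above.

#include <cstdio>

int main() {
  const unsigned NumElts = 8;                                  // v8i32
  const unsigned Num128BitChunks = 2;                          // 256-bit type
  const unsigned NumEltsIn128Bits = NumElts / Num128BitChunks; // 4
  const unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;       // 2
  for (unsigned i = 0; i != Num128BitChunks; ++i)
    for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
      unsigned ExpectedIndex = i * NumEltsIn128Bits + (j % NumEltsIn64Bits) * 2;
      const char *Src = (j < NumEltsIn64Bits) ? "V0" : "V1";
      printf("chunk %u, elt %u: %s[%u] op %s[%u]\n", i, j, Src, ExpectedIndex,
             Src, ExpectedIndex + 1);
    }
  return 0;
}
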
10653
10654static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
10655 SelectionDAG &DAG, unsigned HOpcode,
10656 SDValue V0, SDValue V1) {
10657 // If either input vector is not the same size as the build vector,
10658 // extract/insert the low bits to the correct size.
10659 // This is free (examples: zmm --> xmm, xmm --> ymm).
10660 MVT VT = BV->getSimpleValueType(0);
10661 unsigned Width = VT.getSizeInBits();
10662 if (V0.getValueSizeInBits() > Width)
10663 V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), Width);
10664 else if (V0.getValueSizeInBits() < Width)
10665 V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, SDLoc(BV), Width);
10666
10667 if (V1.getValueSizeInBits() > Width)
10668 V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), Width);
10669 else if (V1.getValueSizeInBits() < Width)
10670 V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, SDLoc(BV), Width);
10671
10672 unsigned NumElts = VT.getVectorNumElements();
10673 APInt DemandedElts = APInt::getAllOnes(NumElts);
10674 for (unsigned i = 0; i != NumElts; ++i)
10675 if (BV->getOperand(i).isUndef())
10676 DemandedElts.clearBit(i);
10677
10678 // If we don't need the upper xmm, then perform as a xmm hop.
10679 unsigned HalfNumElts = NumElts / 2;
10680 if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) {
10681 MVT HalfVT = VT.getHalfNumVectorElementsVT();
10682 V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), 128);
10683 V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), 128);
10684 SDValue Half = DAG.getNode(HOpcode, SDLoc(BV), HalfVT, V0, V1);
10685 return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, SDLoc(BV), 256);
10686 }
10687
10688 return DAG.getNode(HOpcode, SDLoc(BV), VT, V0, V1);
10689}
10690
10691/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
10692static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
10693 const X86Subtarget &Subtarget,
10694 SelectionDAG &DAG) {
10695 // We need at least 2 non-undef elements to make this worthwhile by default.
10696 unsigned NumNonUndefs =
10697 count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
10698 if (NumNonUndefs < 2)
10699 return SDValue();
10700
10701 // There are 4 sets of horizontal math operations distinguished by type:
10702 // int/FP at 128-bit/256-bit. Each type was introduced with a different
10703 // subtarget feature. Try to match those "native" patterns first.
10704 MVT VT = BV->getSimpleValueType(0);
10705 if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
10706 ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
10707 ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
10708 ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
10709 unsigned HOpcode;
10710 SDValue V0, V1;
10711 if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
10712 return getHopForBuildVector(BV, DAG, HOpcode, V0, V1);
10713 }
10714
10715 // Try harder to match 256-bit ops by using extract/concat.
10716 if (!Subtarget.hasAVX() || !VT.is256BitVector())
10717 return SDValue();
10718
10719 // Count the number of UNDEF operands in the input build_vector.
10720 unsigned NumElts = VT.getVectorNumElements();
10721 unsigned Half = NumElts / 2;
10722 unsigned NumUndefsLO = 0;
10723 unsigned NumUndefsHI = 0;
10724 for (unsigned i = 0, e = Half; i != e; ++i)
10725 if (BV->getOperand(i)->isUndef())
10726 NumUndefsLO++;
10727
10728 for (unsigned i = Half, e = NumElts; i != e; ++i)
10729 if (BV->getOperand(i)->isUndef())
10730 NumUndefsHI++;
10731
10732 SDLoc DL(BV);
10733 SDValue InVec0, InVec1;
10734 if (VT == MVT::v8i32 || VT == MVT::v16i16) {
10735 SDValue InVec2, InVec3;
10736 unsigned X86Opcode;
10737 bool CanFold = true;
10738
10739 if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
10740 isHorizontalBinOpPart(BV, ISD::ADD, DAG, Half, NumElts, InVec2,
10741 InVec3) &&
10742 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
10743 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
10744 X86Opcode = X86ISD::HADD;
10745 else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, Half, InVec0,
10746 InVec1) &&
10747 isHorizontalBinOpPart(BV, ISD::SUB, DAG, Half, NumElts, InVec2,
10748 InVec3) &&
10749 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
10750 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
10751 X86Opcode = X86ISD::HSUB;
10752 else
10753 CanFold = false;
10754
10755 if (CanFold) {
10756 // Do not try to expand this build_vector into a pair of horizontal
10757 // add/sub if we can emit a pair of scalar add/sub.
10758 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
10759 return SDValue();
10760
10761 // Convert this build_vector into a pair of horizontal binops followed by
10762 // a concat vector. We must adjust the outputs from the partial horizontal
10763 // matching calls above to account for undefined vector halves.
10764 SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
10765 SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
10766 assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
10767 bool isUndefLO = NumUndefsLO == Half;
10768 bool isUndefHI = NumUndefsHI == Half;
10769 return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO,
10770 isUndefHI);
10771 }
10772 }
10773
10774 if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
10775 VT == MVT::v16i16) {
10776 unsigned X86Opcode;
10777 if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
10778 X86Opcode = X86ISD::HADD;
10779 else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, NumElts, InVec0,
10780 InVec1))
10781 X86Opcode = X86ISD::HSUB;
10782 else if (isHorizontalBinOpPart(BV, ISD::FADD, DAG, 0, NumElts, InVec0,
10783 InVec1))
10784 X86Opcode = X86ISD::FHADD;
10785 else if (isHorizontalBinOpPart(BV, ISD::FSUB, DAG, 0, NumElts, InVec0,
10786 InVec1))
10787 X86Opcode = X86ISD::FHSUB;
10788 else
10789 return SDValue();
10790
10791 // Don't try to expand this build_vector into a pair of horizontal add/sub
10792 // if we can simply emit a pair of scalar add/sub.
10793 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
10794 return SDValue();
10795
10796 // Convert this build_vector into two horizontal add/sub followed by
10797 // a concat vector.
10798 bool isUndefLO = NumUndefsLO == Half;
10799 bool isUndefHI = NumUndefsHI == Half;
10800 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
10801 isUndefLO, isUndefHI);
10802 }
10803
10804 return SDValue();
10805}
10806
10807static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
10808 SelectionDAG &DAG);
10809
10810/// If a BUILD_VECTOR's source elements all apply the same bit operation and
10811/// one of their operands is constant, lower to a pair of BUILD_VECTOR and
10812/// just apply the bit operation to the vectors.
10813/// NOTE: It's not in our interest to start making a general-purpose vectorizer
10814/// from this, but enough scalar bit operations are created by the later
10815/// legalization + scalarization stages to need basic support.
10816static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
10817 const X86Subtarget &Subtarget,
10818 SelectionDAG &DAG) {
10819 SDLoc DL(Op);
10820 MVT VT = Op->getSimpleValueType(0);
10821 unsigned NumElems = VT.getVectorNumElements();
10822 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
10823
10824 // Check that all elements have the same opcode.
10825 // TODO: Should we allow UNDEFS and if so how many?
10826 unsigned Opcode = Op->getOperand(0).getOpcode();
10827 for (unsigned i = 1; i < NumElems; ++i)
10828 if (Opcode != Op->getOperand(i).getOpcode())
10829 return SDValue();
10830
10831 // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
10832 bool IsShift = false;
10833 switch (Opcode) {
10834 default:
10835 return SDValue();
10836 case ISD::SHL:
10837 case ISD::SRL:
10838 case ISD::SRA:
10839 IsShift = true;
10840 break;
10841 case ISD::AND:
10842 case ISD::XOR:
10843 case ISD::OR:
10844 // Don't do this if the buildvector is a splat - we'd replace one
10845 // constant with an entire vector.
10846 if (Op->getSplatValue())
10847 return SDValue();
10848 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
10849 return SDValue();
10850 break;
10851 }
10852
10853 SmallVector<SDValue, 4> LHSElts, RHSElts;
10854 for (SDValue Elt : Op->ops()) {
10855 SDValue LHS = Elt.getOperand(0);
10856 SDValue RHS = Elt.getOperand(1);
10857
10858 // We expect the canonicalized RHS operand to be the constant.
10859 if (!isa<ConstantSDNode>(RHS))
10860 return SDValue();
10861
10862 // Extend shift amounts.
10863 if (RHS.getValueSizeInBits() != VT.getScalarSizeInBits()) {
10864 if (!IsShift)
10865 return SDValue();
10866 RHS = DAG.getZExtOrTrunc(RHS, DL, VT.getScalarType());
10867 }
10868
10869 LHSElts.push_back(LHS);
10870 RHSElts.push_back(RHS);
10871 }
10872
10873 // Limit to shifts by uniform immediates.
10874 // TODO: Only accept vXi8/vXi64 special cases?
10875 // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
10876 if (IsShift && any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
10877 return SDValue();
10878
10879 SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
10880 SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
10881 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
10882
10883 if (!IsShift)
10884 return Res;
10885
10886 // Immediately lower the shift to ensure the constant build vector doesn't
10887 // get converted to a constant pool before the shift is lowered.
10888 return LowerShift(Res, Subtarget, DAG);
10889}
10890
10891/// Create a vector constant without a load. SSE/AVX provide the bare minimum
10892/// functionality to do this, so it's all zeros, all ones, or some derivation
10893/// that is cheap to calculate.
10894static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
10895 const X86Subtarget &Subtarget) {
10896 SDLoc DL(Op);
10897 MVT VT = Op.getSimpleValueType();
10898
10899 // Vectors containing all zeros can be matched by pxor and xorps.
10900 if (ISD::isBuildVectorAllZeros(Op.getNode()))
10901 return Op;
10902
10903 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
10904 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
10905 // vpcmpeqd on 256-bit vectors.
10906 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
10907 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
10908 return Op;
10909
10910 return getOnesVector(VT, DAG, DL);
10911 }
10912
10913 return SDValue();
10914}
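
A quick stand-alone check of the two idioms that materializeVectorConstant relies on (a sketch using SSE2 intrinsics, not LLVM code; compile with -msse2 on x86): the all-zeros vector is a self-XOR and the all-ones vector is a self-compare, so neither needs a constant-pool load.

#include <immintrin.h>
#include <cstdio>

int main() {
  __m128i Zeros = _mm_setzero_si128();           // matched as pxor/xorps
  __m128i Ones  = _mm_cmpeq_epi32(Zeros, Zeros); // pcmpeqd x,x -> all ones
  printf("zeros byte mask: 0x%04x\n", _mm_movemask_epi8(Zeros)); // 0x0000
  printf("ones  byte mask: 0x%04x\n", _mm_movemask_epi8(Ones));  // 0xffff
  return 0;
}
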
10915
10916/// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
10917/// from a vector of source values and a vector of extraction indices.
10918/// The vectors might be manipulated to match the type of the permute op.
10919static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
10920 SDLoc &DL, SelectionDAG &DAG,
10921 const X86Subtarget &Subtarget) {
10922 MVT ShuffleVT = VT;
10923 EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
10924 unsigned NumElts = VT.getVectorNumElements();
10925 unsigned SizeInBits = VT.getSizeInBits();
10926
10927 // Adjust IndicesVec to match VT size.
10928 assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
10929        "Illegal variable permute mask size");
10930 if (IndicesVec.getValueType().getVectorNumElements() > NumElts) {
10931 // Narrow/widen the indices vector to the correct size.
10932 if (IndicesVec.getValueSizeInBits() > SizeInBits)
10933 IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
10934 NumElts * VT.getScalarSizeInBits());
10935 else if (IndicesVec.getValueSizeInBits() < SizeInBits)
10936 IndicesVec = widenSubVector(IndicesVec, false, Subtarget, DAG,
10937 SDLoc(IndicesVec), SizeInBits);
10938 // Zero-extend the index elements within the vector.
10939 if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
10940 IndicesVec = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(IndicesVec),
10941 IndicesVT, IndicesVec);
10942 }
10943 IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
10944
10945 // Handle a SrcVec whose size doesn't match the VT size.
10946 if (SrcVec.getValueSizeInBits() != SizeInBits) {
10947 if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
10948 // Handle larger SrcVec by treating it as a larger permute.
10949 unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
10950 VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
10951 IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
10952 IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
10953 Subtarget, DAG, SDLoc(IndicesVec));
10954 SDValue NewSrcVec =
10955 createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
10956 if (NewSrcVec)
10957 return extractSubVector(NewSrcVec, 0, DAG, DL, SizeInBits);
10958 return SDValue();
10959 } else if (SrcVec.getValueSizeInBits() < SizeInBits) {
10960 // Widen smaller SrcVec to match VT.
10961 SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
10962 } else
10963 return SDValue();
10964 }
10965
10966 auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
10967 assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
10968 EVT SrcVT = Idx.getValueType();
10969 unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
10970 uint64_t IndexScale = 0;
10971 uint64_t IndexOffset = 0;
10972
10973 // If we're scaling a smaller permute op, then we need to repeat the
10974 // indices, scaling and offsetting them as well.
10975 // e.g. v4i32 -> v16i8 (Scale = 4)
10976 // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
10977 // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
10978 for (uint64_t i = 0; i != Scale; ++i) {
10979 IndexScale |= Scale << (i * NumDstBits);
10980 IndexOffset |= i << (i * NumDstBits);
10981 }
10982
10983 Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
10984 DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
10985 Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
10986 DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
10987 return Idx;
10988 };
10989
10990 unsigned Opcode = 0;
10991 switch (VT.SimpleTy) {
10992 default:
10993 break;
10994 case MVT::v16i8:
10995 if (Subtarget.hasSSSE3())
10996 Opcode = X86ISD::PSHUFB;
10997 break;
10998 case MVT::v8i16:
10999 if (Subtarget.hasVLX() && Subtarget.hasBWI())
11000 Opcode = X86ISD::VPERMV;
11001 else if (Subtarget.hasSSSE3()) {
11002 Opcode = X86ISD::PSHUFB;
11003 ShuffleVT = MVT::v16i8;
11004 }
11005 break;
11006 case MVT::v4f32:
11007 case MVT::v4i32:
11008 if (Subtarget.hasAVX()) {
11009 Opcode = X86ISD::VPERMILPV;
11010 ShuffleVT = MVT::v4f32;
11011 } else if (Subtarget.hasSSSE3()) {
11012 Opcode = X86ISD::PSHUFB;
11013 ShuffleVT = MVT::v16i8;
11014 }
11015 break;
11016 case MVT::v2f64:
11017 case MVT::v2i64:
11018 if (Subtarget.hasAVX()) {
11019 // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
11020 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
11021 Opcode = X86ISD::VPERMILPV;
11022 ShuffleVT = MVT::v2f64;
11023 } else if (Subtarget.hasSSE41()) {
11024 // SSE41 can compare v2i64 - select between indices 0 and 1.
11025 return DAG.getSelectCC(
11026 DL, IndicesVec,
11027 getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
11028 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
11029 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
11030 ISD::CondCode::SETEQ);
11031 }
11032 break;
11033 case MVT::v32i8:
11034 if (Subtarget.hasVLX() && Subtarget.hasVBMI())
11035 Opcode = X86ISD::VPERMV;
11036 else if (Subtarget.hasXOP()) {
11037 SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
11038 SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
11039 SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
11040 SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
11041 return DAG.getNode(
11042 ISD::CONCAT_VECTORS, DL, VT,
11043 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
11044 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
11045 } else if (Subtarget.hasAVX()) {
11046 SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
11047 SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
11048 SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
11049 SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
11050 auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
11051 ArrayRef<SDValue> Ops) {
11052 // Permute Lo and Hi and then select based on index range.
11053 // This works as PSHUFB uses bits[3:0] to permute elements and we don't
11054 // care about bit[7] as it's just an index vector.
11055 SDValue Idx = Ops[2];
11056 EVT VT = Idx.getValueType();
11057 return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
11058 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
11059 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
11060 ISD::CondCode::SETGT);
11061 };
11062 SDValue Ops[] = {LoLo, HiHi, IndicesVec};
11063 return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
11064 PSHUFBBuilder);
11065 }
11066 break;
11067 case MVT::v16i16:
11068 if (Subtarget.hasVLX() && Subtarget.hasBWI())
11069 Opcode = X86ISD::VPERMV;
11070 else if (Subtarget.hasAVX()) {
11071 // Scale to v32i8 and perform as v32i8.
11072 IndicesVec = ScaleIndices(IndicesVec, 2);
11073 return DAG.getBitcast(
11074 VT, createVariablePermute(
11075 MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
11076 DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
11077 }
11078 break;
11079 case MVT::v8f32:
11080 case MVT::v8i32:
11081 if (Subtarget.hasAVX2())
11082 Opcode = X86ISD::VPERMV;
11083 else if (Subtarget.hasAVX()) {
11084 SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
11085 SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
11086 {0, 1, 2, 3, 0, 1, 2, 3});
11087 SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
11088 {4, 5, 6, 7, 4, 5, 6, 7});
11089 if (Subtarget.hasXOP())
11090 return DAG.getBitcast(
11091 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi,
11092 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
11093 // Permute Lo and Hi and then select based on index range.
11094 // This works as VPERMILPS only uses index bits[0:1] to permute elements.
11095 SDValue Res = DAG.getSelectCC(
11096 DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
11097 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
11098 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
11099 ISD::CondCode::SETGT);
11100 return DAG.getBitcast(VT, Res);
11101 }
11102 break;
11103 case MVT::v4i64:
11104 case MVT::v4f64:
11105 if (Subtarget.hasAVX512()) {
11106 if (!Subtarget.hasVLX()) {
11107 MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
11108 SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
11109 SDLoc(SrcVec));
11110 IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
11111 DAG, SDLoc(IndicesVec));
11112 SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
11113 DAG, Subtarget);
11114 return extract256BitVector(Res, 0, DAG, DL);
11115 }
11116 Opcode = X86ISD::VPERMV;
11117 } else if (Subtarget.hasAVX()) {
11118 SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
11119 SDValue LoLo =
11120 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
11121 SDValue HiHi =
11122 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
11123 // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
11124 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
11125 if (Subtarget.hasXOP())
11126 return DAG.getBitcast(
11127 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi,
11128 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
11129 // Permute Lo and Hi and then select based on index range.
11130 // This works as VPERMILPD only uses index bit[1] to permute elements.
11131 SDValue Res = DAG.getSelectCC(
11132 DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
11133 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
11134 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
11135 ISD::CondCode::SETGT);
11136 return DAG.getBitcast(VT, Res);
11137 }
11138 break;
11139 case MVT::v64i8:
11140 if (Subtarget.hasVBMI())
11141 Opcode = X86ISD::VPERMV;
11142 break;
11143 case MVT::v32i16:
11144 if (Subtarget.hasBWI())
11145 Opcode = X86ISD::VPERMV;
11146 break;
11147 case MVT::v16f32:
11148 case MVT::v16i32:
11149 case MVT::v8f64:
11150 case MVT::v8i64:
11151 if (Subtarget.hasAVX512())
11152 Opcode = X86ISD::VPERMV;
11153 break;
11154 }
11155 if (!Opcode)
11156 return SDValue();
11157
11158 assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
11159        (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
11160        "Illegal variable permute shuffle type");
11161
11162 uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
11163 if (Scale > 1)
11164 IndicesVec = ScaleIndices(IndicesVec, Scale);
11165
11166 EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
11167 IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
11168
11169 SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
11170 SDValue Res = Opcode == X86ISD::VPERMV
11171 ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
11172 : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
11173 return DAG.getBitcast(VT, Res);
11174}
11175
11176// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
11177// reasoned to be a permutation of a vector by indices in a non-constant vector.
11178// (build_vector (extract_elt V, (extract_elt I, 0)),
11179// (extract_elt V, (extract_elt I, 1)),
11180// ...
11181// ->
11182// (vpermv I, V)
11183//
11184// TODO: Handle undefs
11185// TODO: Utilize pshufb and zero mask blending to support more efficient
11186// construction of vectors with constant-0 elements.
11187static SDValue
11188LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
11189 const X86Subtarget &Subtarget) {
11190 SDValue SrcVec, IndicesVec;
11191 // Check for a match of the permute source vector and permute index elements.
11192 // This is done by checking that the i-th build_vector operand is of the form:
11193 // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
11194 for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
11195 SDValue Op = V.getOperand(Idx);
11196 if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
11197 return SDValue();
11198
11199 // If this is the first extract encountered in V, set the source vector,
11200 // otherwise verify the extract is from the previously defined source
11201 // vector.
11202 if (!SrcVec)
11203 SrcVec = Op.getOperand(0);
11204 else if (SrcVec != Op.getOperand(0))
11205 return SDValue();
11206 SDValue ExtractedIndex = Op->getOperand(1);
11207 // Peek through extends.
11208 if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
11209 ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
11210 ExtractedIndex = ExtractedIndex.getOperand(0);
11211 if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
11212 return SDValue();
11213
11214 // If this is the first extract from the index vector candidate, set the
11215 // indices vector, otherwise verify the extract is from the previously
11216 // defined indices vector.
11217 if (!IndicesVec)
11218 IndicesVec = ExtractedIndex.getOperand(0);
11219 else if (IndicesVec != ExtractedIndex.getOperand(0))
11220 return SDValue();
11221
11222 auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
11223 if (!PermIdx || PermIdx->getAPIntValue() != Idx)
11224 return SDValue();
11225 }
11226
11227 SDLoc DL(V);
11228 MVT VT = V.getSimpleValueType();
11229 return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
11230}
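
A reference model (illustrative only) of the pattern recognized by LowerBUILD_VECTORAsVariablePermute: a build_vector whose i-th element is V[I[i]] is simply a variable permute, which VPERMV/VPERMILPV/PSHUFB then compute in a single instruction. The concrete values below are arbitrary.

#include <array>
#include <cstdio>

int main() {
  std::array<int, 8> V = {10, 11, 12, 13, 14, 15, 16, 17}; // source vector
  std::array<int, 8> I = {7, 3, 0, 1, 6, 2, 5, 4};         // index vector
  std::array<int, 8> Out;
  for (size_t i = 0; i != Out.size(); ++i)
    Out[i] = V[I[i]]; // (extract_elt V, (extract_elt I, i))
  for (int X : Out)
    printf("%d ", X); // 17 13 10 11 16 12 15 14
  printf("\n");
  return 0;
}
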
11231
11232SDValue
11233X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
11234 SDLoc dl(Op);
11235
11236 MVT VT = Op.getSimpleValueType();
11237 MVT EltVT = VT.getVectorElementType();
11238 MVT OpEltVT = Op.getOperand(0).getSimpleValueType();
11239 unsigned NumElems = Op.getNumOperands();
11240
11241 // Generate vectors for predicate vectors.
11242 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
11243 return LowerBUILD_VECTORvXi1(Op, DAG, Subtarget);
11244
11245 if (VT.getVectorElementType() == MVT::bf16 && Subtarget.hasBF16())
11246 return LowerBUILD_VECTORvXbf16(Op, DAG, Subtarget);
11247
11248 if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
11249 return VectorConstant;
11250
11251 unsigned EVTBits = EltVT.getSizeInBits();
11252 APInt UndefMask = APInt::getZero(NumElems);
11253 APInt FrozenUndefMask = APInt::getZero(NumElems);
11254 APInt ZeroMask = APInt::getZero(NumElems);
11255 APInt NonZeroMask = APInt::getZero(NumElems);
11256 bool IsAllConstants = true;
11257 bool OneUseFrozenUndefs = true;
11258 SmallSet<SDValue, 8> Values;
11259 unsigned NumConstants = NumElems;
11260 for (unsigned i = 0; i < NumElems; ++i) {
11261 SDValue Elt = Op.getOperand(i);
11262 if (Elt.isUndef()) {
11263 UndefMask.setBit(i);
11264 continue;
11265 }
11266 if (ISD::isFreezeUndef(Elt.getNode())) {
11267 OneUseFrozenUndefs = OneUseFrozenUndefs && Elt->hasOneUse();
11268 FrozenUndefMask.setBit(i);
11269 continue;
11270 }
11271 Values.insert(Elt);
11272 if (!isIntOrFPConstant(Elt)) {
11273 IsAllConstants = false;
11274 NumConstants--;
11275 }
11276 if (X86::isZeroNode(Elt)) {
11277 ZeroMask.setBit(i);
11278 } else {
11279 NonZeroMask.setBit(i);
11280 }
11281 }
11282
11283 // All undef vector. Return an UNDEF.
11284 if (UndefMask.isAllOnes())
11285 return DAG.getUNDEF(VT);
11286
11287 // All undef/freeze(undef) vector. Return a FREEZE UNDEF.
11288 if (OneUseFrozenUndefs && (UndefMask | FrozenUndefMask).isAllOnes())
11289 return DAG.getFreeze(DAG.getUNDEF(VT));
11290
11291 // All undef/freeze(undef)/zero vector. Return a zero vector.
11292 if ((UndefMask | FrozenUndefMask | ZeroMask).isAllOnes())
11293 return getZeroVector(VT, Subtarget, DAG, dl);
11294
11295 // If we have multiple FREEZE-UNDEF operands, we are likely going to end up
11296 // lowering into a suboptimal insertion sequence. Instead, thaw the UNDEF in
11297 // our source BUILD_VECTOR, create another FREEZE-UNDEF splat BUILD_VECTOR,
11298 // and blend the FREEZE-UNDEF operands back in.
11299 // FIXME: is this worthwhile even for a single FREEZE-UNDEF operand?
11300 if (unsigned NumFrozenUndefElts = FrozenUndefMask.popcount();
11301 NumFrozenUndefElts >= 2 && NumFrozenUndefElts < NumElems) {
11302 SmallVector<int, 16> BlendMask(NumElems, -1);
11303 SmallVector<SDValue, 16> Elts(NumElems, DAG.getUNDEF(OpEltVT));
11304 for (unsigned i = 0; i < NumElems; ++i) {
11305 if (UndefMask[i]) {
11306 BlendMask[i] = -1;
11307 continue;
11308 }
11309 BlendMask[i] = i;
11310 if (!FrozenUndefMask[i])
11311 Elts[i] = Op.getOperand(i);
11312 else
11313 BlendMask[i] += NumElems;
11314 }
11315 SDValue EltsBV = DAG.getBuildVector(VT, dl, Elts);
11316 SDValue FrozenUndefElt = DAG.getFreeze(DAG.getUNDEF(OpEltVT));
11317 SDValue FrozenUndefBV = DAG.getSplatBuildVector(VT, dl, FrozenUndefElt);
11318 return DAG.getVectorShuffle(VT, dl, EltsBV, FrozenUndefBV, BlendMask);
11319 }
11320
11321 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
11322
11323 // If the upper elts of a ymm/zmm are undef/freeze(undef)/zero then we might
11324 // be better off lowering to a smaller build vector and padding with
11325 // undef/zero.
11326 if ((VT.is256BitVector() || VT.is512BitVector()) &&
11327 !isFoldableUseOfShuffle(BV)) {
11328 unsigned UpperElems = NumElems / 2;
11329 APInt UndefOrZeroMask = FrozenUndefMask | UndefMask | ZeroMask;
11330 unsigned NumUpperUndefsOrZeros = UndefOrZeroMask.countl_one();
11331 if (NumUpperUndefsOrZeros >= UpperElems) {
11332 if (VT.is512BitVector() &&
11333 NumUpperUndefsOrZeros >= (NumElems - (NumElems / 4)))
11334 UpperElems = NumElems - (NumElems / 4);
11335 // If freeze(undef) is in any upper elements, force to zero.
11336 bool UndefUpper = UndefMask.countl_one() >= UpperElems;
11337 MVT LowerVT = MVT::getVectorVT(EltVT, NumElems - UpperElems);
11338 SDValue NewBV =
11339 DAG.getBuildVector(LowerVT, dl, Op->ops().drop_back(UpperElems));
11340 return widenSubVector(VT, NewBV, !UndefUpper, Subtarget, DAG, dl);
11341 }
11342 }
11343
11344 if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
11345 return AddSub;
11346 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
11347 return HorizontalOp;
11348 if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
11349 return Broadcast;
11350 if (SDValue BitOp = lowerBuildVectorToBitOp(BV, Subtarget, DAG))
11351 return BitOp;
11352
11353 unsigned NumZero = ZeroMask.popcount();
11354 unsigned NumNonZero = NonZeroMask.popcount();
11355
11356 // If we are inserting one variable into a vector of non-zero constants, try
11357 // to avoid loading each constant element as a scalar. Load the constants as a
11358 // vector and then insert the variable scalar element. If insertion is not
11359 // supported, fall back to a shuffle to get the scalar blended with the
11360 // constants. Insertion into a zero vector is handled as a special-case
11361 // somewhere below here.
11362 if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
11363 (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
11364 isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
11365 // Create an all-constant vector. The variable element in the old
11366 // build vector is replaced by undef in the constant vector. Save the
11367 // variable scalar element and its index for use in the insertelement.
11368 LLVMContext &Context = *DAG.getContext();
11369 Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
11370 SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
11371 SDValue VarElt;
11372 SDValue InsIndex;
11373 for (unsigned i = 0; i != NumElems; ++i) {
11374 SDValue Elt = Op.getOperand(i);
11375 if (auto *C = dyn_cast<ConstantSDNode>(Elt))
11376 ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
11377 else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
11378 ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
11379 else if (!Elt.isUndef()) {
11380 assert(!VarElt.getNode() && !InsIndex.getNode() &&
11381        "Expected one variable element in this vector");
11382 VarElt = Elt;
11383 InsIndex = DAG.getVectorIdxConstant(i, dl);
11384 }
11385 }
11386 Constant *CV = ConstantVector::get(ConstVecOps);
11387 SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
11388
11389 // The constants we just created may not be legal (e.g., floating point). We
11390 // must lower the vector right here because we cannot guarantee that we'll
11391 // legalize it before loading it. This is also why we could not just create
11392 // a new build vector here. If the build vector contains illegal constants,
11393 // it could get split back up into a series of insert elements.
11394 // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
11395 SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
11396 MachineFunction &MF = DAG.getMachineFunction();
11397 MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
11398 SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
11399 unsigned InsertC = cast<ConstantSDNode>(InsIndex)->getZExtValue();
11400 unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();
11401 if (InsertC < NumEltsInLow128Bits)
11402 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
11403
11404 // There's no good way to insert into the high elements of a >128-bit
11405 // vector, so use shuffles to avoid an extract/insert sequence.
11406 assert(VT.getSizeInBits() > 128 && "Invalid insertion index?");
11407 assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
11408 SmallVector<int, 8> ShuffleMask;
11409 unsigned NumElts = VT.getVectorNumElements();
11410 for (unsigned i = 0; i != NumElts; ++i)
11411 ShuffleMask.push_back(i == InsertC ? NumElts : i);
11412 SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt);
11413 return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask);
11414 }
11415
11416 // Special case for single non-zero, non-undef, element.
11417 if (NumNonZero == 1) {
11418 unsigned Idx = NonZeroMask.countr_zero();
11419 SDValue Item = Op.getOperand(Idx);
11420
11421 // If we have a constant or non-constant insertion into the low element of
11422 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
11423 // the rest of the elements. This will be matched as movd/movq/movss/movsd
11424 // depending on what the source datatype is.
11425 if (Idx == 0) {
11426 if (NumZero == 0)
11427 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
11428
11429 if (EltVT == MVT::i32 || EltVT == MVT::f16 || EltVT == MVT::f32 ||
11430 EltVT == MVT::f64 || (EltVT == MVT::i64 && Subtarget.is64Bit()) ||
11431 (EltVT == MVT::i16 && Subtarget.hasFP16())) {
11432 assert((VT.is128BitVector() || VT.is256BitVector() ||
11433         VT.is512BitVector()) &&
11434        "Expected an SSE value type!");
11435 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
11436 // Turn it into a MOVL (i.e. movsh, movss, movsd, movw or movd) to a
11437 // zero vector.
11438 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
11439 }
11440
11441 // We can't directly insert an i8 or i16 into a vector, so zero extend
11442 // it to i32 first.
11443 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
11444 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
11445 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
11446 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
11447 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
11448 return DAG.getBitcast(VT, Item);
11449 }
11450 }
11451
11452 // Is it a vector logical left shift?
11453 if (NumElems == 2 && Idx == 1 &&
11454 X86::isZeroNode(Op.getOperand(0)) &&
11455 !X86::isZeroNode(Op.getOperand(1))) {
11456 unsigned NumBits = VT.getSizeInBits();
11457 return getVShift(true, VT,
11458 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
11459 VT, Op.getOperand(1)),
11460 NumBits/2, DAG, *this, dl);
11461 }
11462
11463 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
11464 return SDValue();
11465
11466 // Otherwise, if this is a vector with i32 or f32 elements, and the element
11467 // is a non-constant being inserted into an element other than the low one,
11468 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
11469 // movd/movss) to move this into the low element, then shuffle it into
11470 // place.
11471 if (EVTBits == 32) {
11472 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
11473 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
11474 }
11475 }
11476
11477 // Splat is obviously ok. Let legalizer expand it to a shuffle.
11478 if (Values.size() == 1) {
11479 if (EVTBits == 32) {
11480 // Instead of a shuffle like this:
11481 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
11482 // Check if it's possible to issue this instead.
11483 // shuffle (vload ptr), undef, <1, 1, 1, 1>
11484 unsigned Idx = NonZeroMask.countr_zero();
11485 SDValue Item = Op.getOperand(Idx);
11486 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
11487 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
11488 }
11489 return SDValue();
11490 }
11491
11492 // A vector full of immediates; various special cases are already
11493 // handled, so this is best done with a single constant-pool load.
11494 if (IsAllConstants)
11495 return SDValue();
11496
11497 if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, DAG, Subtarget))
11498 return V;
11499
11500 // See if we can use a vector load to get all of the elements.
11501 {
11502 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
11503 if (SDValue LD =
11504 EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
11505 return LD;
11506 }
11507
11508 // If this is a splat of pairs of 32-bit elements, we can use a narrower
11509 // build_vector and broadcast it.
11510 // TODO: We could probably generalize this more.
11511 if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
11512 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
11513 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
11514 auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
11515 // Make sure all the even/odd operands match.
11516 for (unsigned i = 2; i != NumElems; ++i)
11517 if (Ops[i % 2] != Op.getOperand(i))
11518 return false;
11519 return true;
11520 };
11521 if (CanSplat(Op, NumElems, Ops)) {
11522 MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
11523 MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
11524 // Create a new build vector and cast to v2i64/v2f64.
11525 SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
11526 DAG.getBuildVector(NarrowVT, dl, Ops));
11527 // Broadcast from v2i64/v2f64 and cast to final VT.
11528 MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems / 2);
11529 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
11530 NewBV));
11531 }
11532 }
11533
11534 // For AVX-length vectors, build the individual 128-bit pieces and use
11535 // shuffles to put them in place.
11536 if (VT.getSizeInBits() > 128) {
11537 MVT HVT = MVT::getVectorVT(EltVT, NumElems / 2);
11538
11539 // Build both the lower and upper subvector.
11540 SDValue Lower =
11541 DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
11542 SDValue Upper = DAG.getBuildVector(
11543 HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));
11544
11545 // Recreate the wider vector with the lower and upper part.
11546 return concatSubVectors(Lower, Upper, DAG, dl);
11547 }
11548
11549 // Let legalizer expand 2-wide build_vectors.
11550 if (EVTBits == 64) {
11551 if (NumNonZero == 1) {
11552 // One half is zero or undef.
11553 unsigned Idx = NonZeroMask.countr_zero();
11554 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
11555 Op.getOperand(Idx));
11556 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
11557 }
11558 return SDValue();
11559 }
11560
11561 // If element VT is < 32 bits, convert it to inserts into a zero vector.
11562 if (EVTBits == 8 && NumElems == 16)
11563 if (SDValue V = LowerBuildVectorv16i8(Op, NonZeroMask, NumNonZero, NumZero,
11564 DAG, Subtarget))
11565 return V;
11566
11567 if (EltVT == MVT::i16 && NumElems == 8)
11568 if (SDValue V = LowerBuildVectorv8i16(Op, NonZeroMask, NumNonZero, NumZero,
11569 DAG, Subtarget))
11570 return V;
11571
11572 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
11573 if (EVTBits == 32 && NumElems == 4)
11574 if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
11575 return V;
11576
11577 // If element VT is == 32 bits, turn it into a number of shuffles.
11578 if (NumElems == 4 && NumZero > 0) {
11579 SmallVector<SDValue, 8> Ops(NumElems);
11580 for (unsigned i = 0; i < 4; ++i) {
11581 bool isZero = !NonZeroMask[i];
11582 if (isZero)
11583 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
11584 else
11585 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
11586 }
11587
11588 for (unsigned i = 0; i < 2; ++i) {
11589 switch (NonZeroMask.extractBitsAsZExtValue(2, i * 2)) {
11590 default: llvm_unreachable("Unexpected NonZero count");
11591 case 0:
11592 Ops[i] = Ops[i*2]; // Must be a zero vector.
11593 break;
11594 case 1:
11595 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
11596 break;
11597 case 2:
11598 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
11599 break;
11600 case 3:
11601 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
11602 break;
11603 }
11604 }
11605
11606 bool Reverse1 = NonZeroMask.extractBitsAsZExtValue(2, 0) == 2;
11607 bool Reverse2 = NonZeroMask.extractBitsAsZExtValue(2, 2) == 2;
11608 int MaskVec[] = {
11609 Reverse1 ? 1 : 0,
11610 Reverse1 ? 0 : 1,
11611 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
11612 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
11613 };
11614 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
11615 }
11616
11617 assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
11618
11619 // Check for a build vector from mostly shuffle plus few inserting.
11620 if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
11621 return Sh;
11622
11623 // For SSE 4.1, use insertps to put the high elements into the low element.
11624 if (Subtarget.hasSSE41() && EltVT != MVT::f16) {
11625 SDValue Result;
11626 if (!Op.getOperand(0).isUndef())
11627 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
11628 else
11629 Result = DAG.getUNDEF(VT);
11630
11631 for (unsigned i = 1; i < NumElems; ++i) {
11632 if (Op.getOperand(i).isUndef()) continue;
11633 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
11634 Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
11635 }
11636 return Result;
11637 }
11638
11639 // Otherwise, expand into a number of unpckl*; start by extending each of
11640 // our (non-undef) elements to the full vector width with the element in the
11641 // bottom slot of the vector (which generates no code for SSE).
11642 SmallVector<SDValue, 8> Ops(NumElems);
11643 for (unsigned i = 0; i < NumElems; ++i) {
11644 if (!Op.getOperand(i).isUndef())
11645 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
11646 else
11647 Ops[i] = DAG.getUNDEF(VT);
11648 }
11649
11650 // Next, we iteratively mix elements, e.g. for v4f32:
11651 // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
11652 // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
11653 // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
11654 for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
11655 // Generate scaled UNPCKL shuffle mask.
11656 SmallVector<int, 16> Mask;
11657 for(unsigned i = 0; i != Scale; ++i)
11658 Mask.push_back(i);
11659 for (unsigned i = 0; i != Scale; ++i)
11660 Mask.push_back(NumElems+i);
11661 Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
11662
11663 for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
11664 Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
11665 }
11666 return Ops[0];
11667}
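
The final unpack expansion above can be modeled with a short stand-alone program (an illustrative sketch; the unpckl helper mirrors the scaled UNPCKL mask built in the loop, and the element values are arbitrary). Each scalar starts in lane 0 of its own vector, then adjacent vectors are merged with progressively wider unpacks.

#include <array>
#include <cstdio>

using V4 = std::array<float, 4>;

// Apply the scaled UNPCKL mask: take the low Scale lanes of A, then of B.
static V4 unpckl(const V4 &A, const V4 &B, unsigned Scale) {
  V4 R{}; // lanes beyond 2*Scale would be undef; zero here for printing
  for (unsigned i = 0; i != Scale; ++i) {
    R[i] = A[i];
    R[Scale + i] = B[i];
  }
  return R;
}

int main() {
  float Elts[4] = {0.f, 1.f, 2.f, 3.f};
  V4 Ops[4];
  for (unsigned i = 0; i != 4; ++i)
    Ops[i] = V4{Elts[i], 0.f, 0.f, 0.f}; // scalar_to_vector

  // Step 1: unpcklps 0,1 and 2,3;  Step 2: unpcklpd of the two results.
  for (unsigned Scale = 1; Scale < 4; Scale *= 2)
    for (unsigned i = 0; i != 4 / (2 * Scale); ++i)
      Ops[i] = unpckl(Ops[2 * i], Ops[2 * i + 1], Scale);

  for (float F : Ops[0])
    printf("%g ", F); // 0 1 2 3
  printf("\n");
  return 0;
}
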
11668
11669// 256-bit AVX can use the vinsertf128 instruction
11670// to create 256-bit vectors from two other 128-bit ones.
11671// TODO: Detect subvector broadcast here instead of DAG combine?
11672static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
11673 const X86Subtarget &Subtarget) {
11674 SDLoc dl(Op);
11675 MVT ResVT = Op.getSimpleValueType();
11676
11677 assert((ResVT.is256BitVector() ||
11678         ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
11679
11680 unsigned NumOperands = Op.getNumOperands();
11681 unsigned NumFreezeUndef = 0;
11682 unsigned NumZero = 0;
11683 unsigned NumNonZero = 0;
11684 unsigned NonZeros = 0;
11685 for (unsigned i = 0; i != NumOperands; ++i) {
11686 SDValue SubVec = Op.getOperand(i);
11687 if (SubVec.isUndef())
11688 continue;
11689 if (ISD::isFreezeUndef(SubVec.getNode())) {
11690 // If the freeze(undef) has multiple uses then we must fold to zero.
11691 if (SubVec.hasOneUse())
11692 ++NumFreezeUndef;
11693 else
11694 ++NumZero;
11695 }
11696 else if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
11697 ++NumZero;
11698 else {
11699       assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
11700 NonZeros |= 1 << i;
11701 ++NumNonZero;
11702 }
11703 }
11704
11705 // If we have more than 2 non-zeros, build each half separately.
11706 if (NumNonZero > 2) {
11707 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
11708 ArrayRef<SDUse> Ops = Op->ops();
11709 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
11710 Ops.slice(0, NumOperands/2));
11711 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
11712 Ops.slice(NumOperands/2));
11713 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
11714 }
11715
11716 // Otherwise, build it up through insert_subvectors.
11717 SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
11718 : (NumFreezeUndef ? DAG.getFreeze(DAG.getUNDEF(ResVT))
11719 : DAG.getUNDEF(ResVT));
11720
11721 MVT SubVT = Op.getOperand(0).getSimpleValueType();
11722 unsigned NumSubElems = SubVT.getVectorNumElements();
11723 for (unsigned i = 0; i != NumOperands; ++i) {
11724 if ((NonZeros & (1 << i)) == 0)
11725 continue;
11726
11727 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec,
11728 Op.getOperand(i),
11729 DAG.getIntPtrConstant(i * NumSubElems, dl));
11730 }
11731
11732 return Vec;
11733}
11734
11735// Returns true if the given node is a type promotion (by concatenating i1
11736// zeros) of the result of a node that already zeros all upper bits of
11737// k-register.
11738// TODO: Merge this with LowerAVXCONCAT_VECTORS?
11739static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
11740 const X86Subtarget &Subtarget,
11741 SelectionDAG & DAG) {
11742 SDLoc dl(Op);
11743 MVT ResVT = Op.getSimpleValueType();
11744 unsigned NumOperands = Op.getNumOperands();
11745
11746   assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
11747          "Unexpected number of operands in CONCAT_VECTORS");
11748
11749 uint64_t Zeros = 0;
11750 uint64_t NonZeros = 0;
11751 for (unsigned i = 0; i != NumOperands; ++i) {
11752 SDValue SubVec = Op.getOperand(i);
11753 if (SubVec.isUndef())
11754 continue;
11755     assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
11756 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
11757 Zeros |= (uint64_t)1 << i;
11758 else
11759 NonZeros |= (uint64_t)1 << i;
11760 }
11761
11762 unsigned NumElems = ResVT.getVectorNumElements();
11763
11764 // If we are inserting non-zero vector and there are zeros in LSBs and undef
11765 // in the MSBs we need to emit a KSHIFTL. The generic lowering to
11766 // insert_subvector will give us two kshifts.
11767 if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&
11768 Log2_64(NonZeros) != NumOperands - 1) {
11769 MVT ShiftVT = ResVT;
11770 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
11771 ShiftVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
11772 unsigned Idx = Log2_64(NonZeros);
11773 SDValue SubVec = Op.getOperand(Idx);
11774 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
11775 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ShiftVT,
11776 DAG.getUNDEF(ShiftVT), SubVec,
11777 DAG.getIntPtrConstant(0, dl));
11778 Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, SubVec,
11779 DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8));
11780 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op,
11781 DAG.getIntPtrConstant(0, dl));
11782 }
11783
11784 // If there are zero or one non-zeros we can handle this very simply.
11785 if (NonZeros == 0 || isPowerOf2_64(NonZeros)) {
11786 SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT);
11787 if (!NonZeros)
11788 return Vec;
11789 unsigned Idx = Log2_64(NonZeros);
11790 SDValue SubVec = Op.getOperand(Idx);
11791 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
11792 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
11793 DAG.getIntPtrConstant(Idx * SubVecNumElts, dl));
11794 }
11795
11796 if (NumOperands > 2) {
11797 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
11798 ArrayRef<SDUse> Ops = Op->ops();
11799 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
11800 Ops.slice(0, NumOperands/2));
11801 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
11802 Ops.slice(NumOperands/2));
11803 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
11804 }
11805
11806   assert(llvm::popcount(NonZeros) == 2 && "Simple cases not handled?");
11807
11808 if (ResVT.getVectorNumElements() >= 16)
11809 return Op; // The operation is legal with KUNPCK
11810
11811 SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT,
11812 DAG.getUNDEF(ResVT), Op.getOperand(0),
11813 DAG.getIntPtrConstant(0, dl));
11814 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
11815 DAG.getIntPtrConstant(NumElems/2, dl));
11816}
11817
11818static SDValue LowerCONCAT_VECTORS(SDValue Op,
11819 const X86Subtarget &Subtarget,
11820 SelectionDAG &DAG) {
11821 MVT VT = Op.getSimpleValueType();
11822 if (VT.getVectorElementType() == MVT::i1)
11823 return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
11824
11825   assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
11826          (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
11827                                   Op.getNumOperands() == 4)));
11828
11829 // AVX can use the vinsertf128 instruction to create 256-bit vectors
11830 // from two other 128-bit ones.
11831
11832 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
11833 return LowerAVXCONCAT_VECTORS(Op, DAG, Subtarget);
11834}
11835
11836//===----------------------------------------------------------------------===//
11837// Vector shuffle lowering
11838//
11839// This is an experimental code path for lowering vector shuffles on x86. It is
11840// designed to handle arbitrary vector shuffles and blends, gracefully
11841// degrading performance as necessary. It works hard to recognize idiomatic
11842// shuffles and lower them to optimal instruction patterns without leaving
11843// a framework that allows reasonably efficient handling of all vector shuffle
11844// patterns.
11845//===----------------------------------------------------------------------===//
11846
11847/// Tiny helper function to identify a no-op mask.
11848///
11849/// This is a somewhat boring predicate function. It checks whether the mask
11850/// array input, which is assumed to be a single-input shuffle mask of the kind
11851/// used by the X86 shuffle instructions (not a fully general
11852/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
11853/// in-place shuffle are 'no-op's.
11854static bool isNoopShuffleMask(ArrayRef<int> Mask) {
11855 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11856     assert(Mask[i] >= -1 && "Out of bound mask element!");
11857 if (Mask[i] >= 0 && Mask[i] != i)
11858 return false;
11859 }
11860 return true;
11861}
11862
11863/// Test whether there are elements crossing LaneSizeInBits lanes in this
11864/// shuffle mask.
11865///
11866/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
11867/// and we routinely test for these.
11868static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits,
11869 unsigned ScalarSizeInBits,
11870 ArrayRef<int> Mask) {
11871   assert(LaneSizeInBits && ScalarSizeInBits &&
11872          (LaneSizeInBits % ScalarSizeInBits) == 0 &&
11873          "Illegal shuffle lane size");
11874 int LaneSize = LaneSizeInBits / ScalarSizeInBits;
11875 int Size = Mask.size();
11876 for (int i = 0; i < Size; ++i)
11877 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
11878 return true;
11879 return false;
11880}
11881
11882/// Test whether there are elements crossing 128-bit lanes in this
11883/// shuffle mask.
11884static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
11885 return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask);
11886}
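
A concrete illustration (standalone, not from the file above) of the lane-crossing
test for MVT::v8f32, where a 128-bit lane holds 4 elements:

#include <cstdio>
#include <vector>

static bool crossesLanes(const std::vector<int> &Mask, int LaneSize) {
  int Size = (int)Mask.size();
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
      return true;
  return false;
}

int main() {
  // Swapping the two 128-bit halves crosses lanes.
  printf("%d\n", crossesLanes({4, 5, 6, 7, 0, 1, 2, 3}, 4)); // 1
  // Swapping adjacent elements stays within each lane.
  printf("%d\n", crossesLanes({1, 0, 3, 2, 5, 4, 7, 6}, 4)); // 0
  return 0;
}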
11887
11888/// Test whether elements in each LaneSizeInBits lane in this shuffle mask come
11889/// from multiple lanes - this is different to isLaneCrossingShuffleMask to
11890/// better support 'repeated mask + lane permute' style shuffles.
11891static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits,
11892 unsigned ScalarSizeInBits,
11893 ArrayRef<int> Mask) {
11894   assert(LaneSizeInBits && ScalarSizeInBits &&
11895          (LaneSizeInBits % ScalarSizeInBits) == 0 &&
11896          "Illegal shuffle lane size");
11897 int NumElts = Mask.size();
11898 int NumEltsPerLane = LaneSizeInBits / ScalarSizeInBits;
11899 int NumLanes = NumElts / NumEltsPerLane;
11900 if (NumLanes > 1) {
11901 for (int i = 0; i != NumLanes; ++i) {
11902 int SrcLane = -1;
11903 for (int j = 0; j != NumEltsPerLane; ++j) {
11904 int M = Mask[(i * NumEltsPerLane) + j];
11905 if (M < 0)
11906 continue;
11907 int Lane = (M % NumElts) / NumEltsPerLane;
11908 if (SrcLane >= 0 && SrcLane != Lane)
11909 return true;
11910 SrcLane = Lane;
11911 }
11912 }
11913 }
11914 return false;
11915}
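
To see the difference from isLaneCrossingShuffleMask with a v8f32-style mask (4
elements per 128-bit lane): <4,5,6,7,0,1,2,3> crosses lanes, but every destination
lane reads from a single source lane, so it is not "multi-lane" and can be handled
as a repeated mask plus a lane permute; <0,4,1,5,0,4,1,5>, by contrast, mixes
source lanes 0 and 1 inside destination lane 0, so isMultiLaneShuffleMask returns
true.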
11916
11917/// Test whether a shuffle mask is equivalent within each sub-lane.
11918///
11919/// This checks a shuffle mask to see if it is performing the same
11920/// lane-relative shuffle in each sub-lane. This trivially implies
11921/// that it is also not lane-crossing. It may however involve a blend from the
11922/// same lane of a second vector.
11923///
11924/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
11925/// non-trivial to compute in the face of undef lanes. The representation is
11926/// suitable for use with existing 128-bit shuffles as entries from the second
11927/// vector have been remapped to [LaneSize, 2*LaneSize).
11928static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
11929 ArrayRef<int> Mask,
11930 SmallVectorImpl<int> &RepeatedMask) {
11931 auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
11932 RepeatedMask.assign(LaneSize, -1);
11933 int Size = Mask.size();
11934 for (int i = 0; i < Size; ++i) {
11935     assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
11936 if (Mask[i] < 0)
11937 continue;
11938 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
11939 // This entry crosses lanes, so there is no way to model this shuffle.
11940 return false;
11941
11942 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
11943 // Adjust second vector indices to start at LaneSize instead of Size.
11944 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
11945 : Mask[i] % LaneSize + LaneSize;
11946 if (RepeatedMask[i % LaneSize] < 0)
11947 // This is the first non-undef entry in this slot of a 128-bit lane.
11948 RepeatedMask[i % LaneSize] = LocalM;
11949 else if (RepeatedMask[i % LaneSize] != LocalM)
11950 // Found a mismatch with the repeated mask.
11951 return false;
11952 }
11953 return true;
11954}
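
For reference, a standalone sketch (separate from the listing) of the per-lane
repeat check for a v8f32-style mask with LaneSize == 4, where second-vector
indices are remapped into [LaneSize, 2*LaneSize):

#include <cstdio>
#include <vector>

int main() {
  // unpcklps of V1/V2 applied in both 128-bit lanes.
  std::vector<int> Mask = {0, 8, 1, 9, 4, 12, 5, 13};
  const int Size = (int)Mask.size(), LaneSize = 4;
  std::vector<int> Repeated(LaneSize, -1);
  bool Ok = true;
  for (int i = 0; i < Size && Ok; ++i) {
    if (Mask[i] < 0)
      continue;
    if ((Mask[i] % Size) / LaneSize != i / LaneSize) { // lane-crossing entry
      Ok = false;
      break;
    }
    int LocalM = Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize;
    if (Repeated[i % LaneSize] < 0)
      Repeated[i % LaneSize] = LocalM;
    else if (Repeated[i % LaneSize] != LocalM)
      Ok = false;
  }
  printf("Ok=%d Repeated = %d %d %d %d\n", Ok, Repeated[0], Repeated[1],
         Repeated[2], Repeated[3]); // Ok=1 Repeated = 0 4 1 5
  return 0;
}

The whole shuffle therefore reduces to the single 128-bit pattern <0,4,1,5>, which
later matching can recognize as UNPCKL.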
11955
11956/// Test whether a shuffle mask is equivalent within each 128-bit lane.
11957static bool
11958is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
11959 SmallVectorImpl<int> &RepeatedMask) {
11960 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
11961}
11962
11963static bool
11964is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
11965 SmallVector<int, 32> RepeatedMask;
11966 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
11967}
11968
11969/// Test whether a shuffle mask is equivalent within each 256-bit lane.
11970static bool
11971is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
11972 SmallVectorImpl<int> &RepeatedMask) {
11973 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
11974}
11975
11976/// Test whether a target shuffle mask is equivalent within each sub-lane.
11977/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
11978static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits,
11979 unsigned EltSizeInBits,
11980 ArrayRef<int> Mask,
11981 SmallVectorImpl<int> &RepeatedMask) {
11982 int LaneSize = LaneSizeInBits / EltSizeInBits;
11983 RepeatedMask.assign(LaneSize, SM_SentinelUndef);
11984 int Size = Mask.size();
11985 for (int i = 0; i < Size; ++i) {
11986     assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
11987 if (Mask[i] == SM_SentinelUndef)
11988 continue;
11989 if (Mask[i] == SM_SentinelZero) {
11990 if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
11991 return false;
11992 RepeatedMask[i % LaneSize] = SM_SentinelZero;
11993 continue;
11994 }
11995 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
11996 // This entry crosses lanes, so there is no way to model this shuffle.
11997 return false;
11998
11999 // Handle the in-lane shuffles by detecting if and when they repeat. Adjust
12000 // later vector indices to start at multiples of LaneSize instead of Size.
12001 int LaneM = Mask[i] / Size;
12002 int LocalM = (Mask[i] % LaneSize) + (LaneM * LaneSize);
12003 if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
12004 // This is the first non-undef entry in this slot of a 128-bit lane.
12005 RepeatedMask[i % LaneSize] = LocalM;
12006 else if (RepeatedMask[i % LaneSize] != LocalM)
12007 // Found a mismatch with the repeated mask.
12008 return false;
12009 }
12010 return true;
12011}
12012
12013/// Test whether a target shuffle mask is equivalent within each sub-lane.
12014/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
12015static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
12016 ArrayRef<int> Mask,
12017 SmallVectorImpl<int> &RepeatedMask) {
12018 return isRepeatedTargetShuffleMask(LaneSizeInBits, VT.getScalarSizeInBits(),
12019 Mask, RepeatedMask);
12020}
12021
12022/// Checks whether the vector elements referenced by two shuffle masks are
12023/// equivalent.
12024static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,
12025 int Idx, int ExpectedIdx) {
12026   assert(0 <= Idx && Idx < MaskSize && 0 <= ExpectedIdx &&
12027          ExpectedIdx < MaskSize && "Out of range element index");
12028 if (!Op || !ExpectedOp || Op.getOpcode() != ExpectedOp.getOpcode())
12029 return false;
12030
12031 switch (Op.getOpcode()) {
12032 case ISD::BUILD_VECTOR:
12033 // If the values are build vectors, we can look through them to find
12034 // equivalent inputs that make the shuffles equivalent.
12035 // TODO: Handle MaskSize != Op.getNumOperands()?
12036 if (MaskSize == (int)Op.getNumOperands() &&
12037 MaskSize == (int)ExpectedOp.getNumOperands())
12038 return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx);
12039 break;
12040 case X86ISD::VBROADCAST:
12041 case X86ISD::VBROADCAST_LOAD:
12042 // TODO: Handle MaskSize != Op.getValueType().getVectorNumElements()?
12043 return (Op == ExpectedOp &&
12044 (int)Op.getValueType().getVectorNumElements() == MaskSize);
12045 case X86ISD::HADD:
12046 case X86ISD::HSUB:
12047 case X86ISD::FHADD:
12048 case X86ISD::FHSUB:
12049 case X86ISD::PACKSS:
12050 case X86ISD::PACKUS:
12051 // HOP(X,X) can refer to the elt from the lower/upper half of a lane.
12052 // TODO: Handle MaskSize != NumElts?
12053 // TODO: Handle HOP(X,Y) vs HOP(Y,X) equivalence cases.
12054 if (Op == ExpectedOp && Op.getOperand(0) == Op.getOperand(1)) {
12055 MVT VT = Op.getSimpleValueType();
12056 int NumElts = VT.getVectorNumElements();
12057 if (MaskSize == NumElts) {
12058 int NumLanes = VT.getSizeInBits() / 128;
12059 int NumEltsPerLane = NumElts / NumLanes;
12060 int NumHalfEltsPerLane = NumEltsPerLane / 2;
12061 bool SameLane =
12062 (Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane);
12063 bool SameElt =
12064 (Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane);
12065 return SameLane && SameElt;
12066 }
12067 }
12068 break;
12069 }
12070
12071 return false;
12072}
12073
12074/// Checks whether a shuffle mask is equivalent to an explicit list of
12075/// arguments.
12076///
12077/// This is a fast way to test a shuffle mask against a fixed pattern:
12078///
12079 /// if (isShuffleEquivalent(Mask, {3, 2, 1, 0})) { ... }
12080///
12081/// It returns true if the mask is exactly as wide as the argument list, and
12082/// each element of the mask is either -1 (signifying undef) or the value given
12083/// in the argument.
12084static bool isShuffleEquivalent(ArrayRef<int> Mask, ArrayRef<int> ExpectedMask,
12085 SDValue V1 = SDValue(),
12086 SDValue V2 = SDValue()) {
12087 int Size = Mask.size();
12088 if (Size != (int)ExpectedMask.size())
12089 return false;
12090
12091 for (int i = 0; i < Size; ++i) {
12092     assert(Mask[i] >= -1 && "Out of bound mask element!");
12093 int MaskIdx = Mask[i];
12094 int ExpectedIdx = ExpectedMask[i];
12095 if (0 <= MaskIdx && MaskIdx != ExpectedIdx) {
12096 SDValue MaskV = MaskIdx < Size ? V1 : V2;
12097 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
12098 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
12099 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
12100 if (!IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
12101 return false;
12102 }
12103 }
12104 return true;
12105}
12106
12107/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
12108///
12109/// The masks must be exactly the same width.
12110///
12111/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
12112/// value in ExpectedMask is always accepted. Otherwise the indices must match.
12113///
12114/// SM_SentinelZero is accepted as a valid negative index but must match in
12115/// both, or via a known bits test.
12116static bool isTargetShuffleEquivalent(MVT VT, ArrayRef<int> Mask,
12117 ArrayRef<int> ExpectedMask,
12118 const SelectionDAG &DAG,
12119 SDValue V1 = SDValue(),
12120 SDValue V2 = SDValue()) {
12121 int Size = Mask.size();
12122 if (Size != (int)ExpectedMask.size())
12123 return false;
12124   assert(llvm::all_of(ExpectedMask,
12125                       [Size](int M) { return isInRange(M, 0, 2 * Size); }) &&
12126          "Illegal target shuffle mask");
12127
12128 // Check for out-of-range target shuffle mask indices.
12129 if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size))
12130 return false;
12131
12132 // Don't use V1/V2 if they're not the same size as the shuffle mask type.
12133 if (V1 && V1.getValueSizeInBits() != VT.getSizeInBits())
12134 V1 = SDValue();
12135 if (V2 && V2.getValueSizeInBits() != VT.getSizeInBits())
12136 V2 = SDValue();
12137
12138 APInt ZeroV1 = APInt::getZero(Size);
12139 APInt ZeroV2 = APInt::getZero(Size);
12140
12141 for (int i = 0; i < Size; ++i) {
12142 int MaskIdx = Mask[i];
12143 int ExpectedIdx = ExpectedMask[i];
12144 if (MaskIdx == SM_SentinelUndef || MaskIdx == ExpectedIdx)
12145 continue;
12146 if (MaskIdx == SM_SentinelZero) {
12147 // If we need this expected index to be a zero element, then update the
12148 // relevant zero mask and perform the known bits at the end to minimize
12149 // repeated computes.
12150 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
12151 if (ExpectedV &&
12152 Size == (int)ExpectedV.getValueType().getVectorNumElements()) {
12153 int BitIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
12154 APInt &ZeroMask = ExpectedIdx < Size ? ZeroV1 : ZeroV2;
12155 ZeroMask.setBit(BitIdx);
12156 continue;
12157 }
12158 }
12159 if (MaskIdx >= 0) {
12160 SDValue MaskV = MaskIdx < Size ? V1 : V2;
12161 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
12162 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
12163 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
12164 if (IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
12165 continue;
12166 }
12167 return false;
12168 }
12169 return (ZeroV1.isZero() || DAG.MaskedVectorIsZero(V1, ZeroV1)) &&
12170 (ZeroV2.isZero() || DAG.MaskedVectorIsZero(V2, ZeroV2));
12171}
12172
12173// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
12174// instructions.
12175static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT,
12176 const SelectionDAG &DAG) {
12177 if (VT != MVT::v8i32 && VT != MVT::v8f32)
12178 return false;
12179
12180 SmallVector<int, 8> Unpcklwd;
12181 createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
12182 /* Unary = */ false);
12183 SmallVector<int, 8> Unpckhwd;
12184 createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
12185 /* Unary = */ false);
12186 bool IsUnpackwdMask = (isTargetShuffleEquivalent(VT, Mask, Unpcklwd, DAG) ||
12187 isTargetShuffleEquivalent(VT, Mask, Unpckhwd, DAG));
12188 return IsUnpackwdMask;
12189}
12190
12191static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask,
12192 const SelectionDAG &DAG) {
12193 // Create 128-bit vector type based on mask size.
12194 MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
12195 MVT VT = MVT::getVectorVT(EltVT, Mask.size());
12196
12197 // We can't assume a canonical shuffle mask, so try the commuted version too.
12198 SmallVector<int, 4> CommutedMask(Mask);
12199 ShuffleVectorSDNode::commuteMask(CommutedMask);
12200
12201 // Match any of unary/binary or low/high.
12202 for (unsigned i = 0; i != 4; ++i) {
12203 SmallVector<int, 16> UnpackMask;
12204 createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
12205 if (isTargetShuffleEquivalent(VT, Mask, UnpackMask, DAG) ||
12206 isTargetShuffleEquivalent(VT, CommutedMask, UnpackMask, DAG))
12207 return true;
12208 }
12209 return false;
12210}
12211
12212/// Return true if a shuffle mask chooses elements identically in its top and
12213/// bottom halves. For example, any splat mask has the same top and bottom
12214/// halves. If an element is undefined in only one half of the mask, the halves
12215/// are not considered identical.
12216static bool hasIdenticalHalvesShuffleMask(ArrayRef<int> Mask) {
12217   assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask");
12218 unsigned HalfSize = Mask.size() / 2;
12219 for (unsigned i = 0; i != HalfSize; ++i) {
12220 if (Mask[i] != Mask[i + HalfSize])
12221 return false;
12222 }
12223 return true;
12224}
12225
12226/// Get a 4-lane 8-bit shuffle immediate for a mask.
12227///
12228/// This helper function produces an 8-bit shuffle immediate corresponding to
12229/// the ubiquitous shuffle encoding scheme used in x86 instructions for
12230/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
12231/// example.
12232///
12233/// NB: We rely heavily on "undef" masks preserving the input lane.
12234static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
12235   assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
12236   assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
12237   assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
12238   assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
12239   assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
12240
12241 // If the mask only uses one non-undef element, then fully 'splat' it to
12242 // improve later broadcast matching.
12243 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
12244   assert(0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask");
12245
12246 int FirstElt = Mask[FirstIndex];
12247 if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }))
12248 return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt;
12249
12250 unsigned Imm = 0;
12251 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
12252 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
12253 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
12254 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
12255 return Imm;
12256}
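
A standalone sketch (separate from the listing) of the plain 2-bits-per-lane
encoding for a fully defined, non-splat mask such as {1, 0, 3, 2}; the undef and
splat special cases handled above are intentionally omitted:

#include <cstdio>

int main() {
  int Mask[4] = {1, 0, 3, 2};
  unsigned Imm = 0;
  for (int i = 0; i != 4; ++i)
    Imm |= (unsigned)Mask[i] << (2 * i); // result lane i selects source lane Mask[i]
  printf("0x%02X\n", Imm); // 0xB1, the classic "swap adjacent elements" immediate
  return 0;
}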
12257
12258static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
12259 SelectionDAG &DAG) {
12260 return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
12261}
12262
12263 // The shuffle result has the form:
12264 // 0*a[0] 0*a[1] ... 0*a[n], n >= 0, where the a[] elements are in ascending order.
12265 // Each element of Zeroable corresponds to a particular Mask element,
12266 // as described in the computeZeroableShuffleElements function.
12267 //
12268 // The function looks for a sub-mask whose non-zero elements are in
12269 // increasing order; if such a sub-mask exists, the function returns true.
12270static bool isNonZeroElementsInOrder(const APInt &Zeroable,
12271 ArrayRef<int> Mask, const EVT &VectorType,
12272 bool &IsZeroSideLeft) {
12273 int NextElement = -1;
12274 // Check if the Mask's nonzero elements are in increasing order.
12275 for (int i = 0, e = Mask.size(); i < e; i++) {
12276 // Checks if the mask's zeros elements are built from only zeros.
12277     assert(Mask[i] >= -1 && "Out of bound mask element!");
12278 if (Mask[i] < 0)
12279 return false;
12280 if (Zeroable[i])
12281 continue;
12282 // Find the lowest non zero element
12283 if (NextElement < 0) {
12284 NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
12285 IsZeroSideLeft = NextElement != 0;
12286 }
12287 // Exit if the mask's non zero elements are not in increasing order.
12288 if (NextElement != Mask[i])
12289 return false;
12290 NextElement++;
12291 }
12292 return true;
12293}
12294
12295/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
12296static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
12297 ArrayRef<int> Mask, SDValue V1,
12298 SDValue V2, const APInt &Zeroable,
12299 const X86Subtarget &Subtarget,
12300 SelectionDAG &DAG) {
12301 int Size = Mask.size();
12302 int LaneSize = 128 / VT.getScalarSizeInBits();
12303 const int NumBytes = VT.getSizeInBits() / 8;
12304 const int NumEltBytes = VT.getScalarSizeInBits() / 8;
12305
12306   assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
12307          (Subtarget.hasAVX2() && VT.is256BitVector()) ||
12308          (Subtarget.hasBWI() && VT.is512BitVector()));
12309
12310 SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
12311 // Sign bit set in i8 mask means zero element.
12312 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
12313
12314 SDValue V;
12315 for (int i = 0; i < NumBytes; ++i) {
12316 int M = Mask[i / NumEltBytes];
12317 if (M < 0) {
12318 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
12319 continue;
12320 }
12321 if (Zeroable[i / NumEltBytes]) {
12322 PSHUFBMask[i] = ZeroMask;
12323 continue;
12324 }
12325
12326 // We can only use a single input of V1 or V2.
12327 SDValue SrcV = (M >= Size ? V2 : V1);
12328 if (V && V != SrcV)
12329 return SDValue();
12330 V = SrcV;
12331 M %= Size;
12332
12333 // PSHUFB can't cross lanes, ensure this doesn't happen.
12334 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
12335 return SDValue();
12336
12337 M = M % LaneSize;
12338 M = M * NumEltBytes + (i % NumEltBytes);
12339 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
12340 }
12341   assert(V && "Failed to find a source input");
12342
12343 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
12344 return DAG.getBitcast(
12345 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
12346 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
12347}
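
For intuition, a standalone sketch (not the lowering itself) of how a single-input
v8i16 mask expands into the per-byte PSHUFB control when nothing is zeroable and
everything stays in one 128-bit lane:

#include <cstdio>

int main() {
  int Mask[8] = {1, 0, 3, 2, 5, 4, 7, 6}; // swap adjacent 16-bit elements
  const int NumEltBytes = 2;              // bytes per v8i16 element
  for (int i = 0; i < 16; ++i) {
    int M = Mask[i / NumEltBytes];
    printf("%d ", M * NumEltBytes + (i % NumEltBytes));
  }
  printf("\n"); // 2 3 0 1 6 7 4 5 10 11 8 9 14 15 12 13
  return 0;
}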
12348
12349static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
12350 const X86Subtarget &Subtarget, SelectionDAG &DAG,
12351 const SDLoc &dl);
12352
12353// X86 has dedicated shuffle that can be lowered to VEXPAND
12354static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT,
12355 const APInt &Zeroable,
12356 ArrayRef<int> Mask, SDValue &V1,
12357 SDValue &V2, SelectionDAG &DAG,
12358 const X86Subtarget &Subtarget) {
12359 bool IsLeftZeroSide = true;
12360 if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
12361 IsLeftZeroSide))
12362 return SDValue();
12363 unsigned VEXPANDMask = (~Zeroable).getZExtValue();
12364 MVT IntegerType =
12365 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
12366 SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
12367 unsigned NumElts = VT.getVectorNumElements();
12368   assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
12369          "Unexpected number of vector elements");
12370 SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
12371 Subtarget, DAG, DL);
12372 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
12373 SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
12374 return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
12375}
12376
12377static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
12378 unsigned &UnpackOpcode, bool IsUnary,
12379 ArrayRef<int> TargetMask, const SDLoc &DL,
12380 SelectionDAG &DAG,
12381 const X86Subtarget &Subtarget) {
12382 int NumElts = VT.getVectorNumElements();
12383
12384 bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
12385 for (int i = 0; i != NumElts; i += 2) {
12386 int M1 = TargetMask[i + 0];
12387 int M2 = TargetMask[i + 1];
12388 Undef1 &= (SM_SentinelUndef == M1);
12389 Undef2 &= (SM_SentinelUndef == M2);
12390 Zero1 &= isUndefOrZero(M1);
12391 Zero2 &= isUndefOrZero(M2);
12392 }
12393   assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
12394          "Zeroable shuffle detected");
12395
12396 // Attempt to match the target mask against the unpack lo/hi mask patterns.
12397 SmallVector<int, 64> Unpckl, Unpckh;
12398 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
12399 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG, V1,
12400 (IsUnary ? V1 : V2))) {
12401 UnpackOpcode = X86ISD::UNPCKL;
12402 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
12403 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
12404 return true;
12405 }
12406
12407 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
12408 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG, V1,
12409 (IsUnary ? V1 : V2))) {
12410 UnpackOpcode = X86ISD::UNPCKH;
12411 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
12412 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
12413 return true;
12414 }
12415
12416 // If an unary shuffle, attempt to match as an unpack lo/hi with zero.
12417 if (IsUnary && (Zero1 || Zero2)) {
12418 // Don't bother if we can blend instead.
12419 if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
12420 isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
12421 return false;
12422
12423 bool MatchLo = true, MatchHi = true;
12424 for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
12425 int M = TargetMask[i];
12426
12427 // Ignore if the input is known to be zero or the index is undef.
12428 if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
12429 (M == SM_SentinelUndef))
12430 continue;
12431
12432 MatchLo &= (M == Unpckl[i]);
12433 MatchHi &= (M == Unpckh[i]);
12434 }
12435
12436 if (MatchLo || MatchHi) {
12437 UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
12438 V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
12439 V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
12440 return true;
12441 }
12442 }
12443
12444 // If a binary shuffle, commute and try again.
12445 if (!IsUnary) {
12446 ShuffleVectorSDNode::commuteMask(Unpckl);
12447 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG)) {
12448 UnpackOpcode = X86ISD::UNPCKL;
12449 std::swap(V1, V2);
12450 return true;
12451 }
12452
12453 ShuffleVectorSDNode::commuteMask(Unpckh);
12454 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG)) {
12455 UnpackOpcode = X86ISD::UNPCKH;
12456 std::swap(V1, V2);
12457 return true;
12458 }
12459 }
12460
12461 return false;
12462}
12463
12464// X86 has dedicated unpack instructions that can handle specific blend
12465// operations: UNPCKH and UNPCKL.
12466static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT,
12467 ArrayRef<int> Mask, SDValue V1, SDValue V2,
12468 SelectionDAG &DAG) {
12469 SmallVector<int, 8> Unpckl;
12470 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
12471 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
12472 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
12473
12474 SmallVector<int, 8> Unpckh;
12475 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
12476 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
12477 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
12478
12479 // Commute and try again.
12480 ShuffleVectorSDNode::commuteMask(Unpckl);
12481 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
12482 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
12483
12484 ShuffleVectorSDNode::commuteMask(Unpckh);
12485 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
12486 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
12487
12488 return SDValue();
12489}
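
For reference, a standalone sketch of the binary unpack masks being compared
against here; this mirrors what createUnpackShuffleMask produces for a
single-128-bit-lane type such as v4i32 (wider types repeat the pattern per lane):

#include <cstdio>
#include <vector>

static std::vector<int> unpackMask(int NumElts, bool Lo) {
  std::vector<int> Mask;
  int Base = Lo ? 0 : NumElts / 2;
  for (int i = 0; i != NumElts / 2; ++i) {
    Mask.push_back(Base + i);           // element taken from V1
    Mask.push_back(Base + i + NumElts); // interleaved element taken from V2
  }
  return Mask;
}

int main() {
  for (int M : unpackMask(4, /*Lo=*/true))
    printf("%d ", M);
  printf("\n"); // 0 4 1 5  -> UNPCKL
  for (int M : unpackMask(4, /*Lo=*/false))
    printf("%d ", M);
  printf("\n"); // 2 6 3 7  -> UNPCKH
  return 0;
}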
12490
12491/// Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit)
12492/// followed by unpack 256-bit.
12493static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT,
12494 ArrayRef<int> Mask, SDValue V1,
12495 SDValue V2, SelectionDAG &DAG) {
12496 SmallVector<int, 32> Unpckl, Unpckh;
12497 createSplat2ShuffleMask(VT, Unpckl, /* Lo */ true);
12498 createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false);
12499
12500 unsigned UnpackOpcode;
12501 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
12502 UnpackOpcode = X86ISD::UNPCKL;
12503 else if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
12504 UnpackOpcode = X86ISD::UNPCKH;
12505 else
12506 return SDValue();
12507
12508 // This is a "natural" unpack operation (rather than the 128-bit sectored
12509 // operation implemented by AVX). We need to rearrange 64-bit chunks of the
12510 // input in order to use the x86 instruction.
12511 V1 = DAG.getVectorShuffle(MVT::v4f64, DL, DAG.getBitcast(MVT::v4f64, V1),
12512 DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3});
12513 V1 = DAG.getBitcast(VT, V1);
12514 return DAG.getNode(UnpackOpcode, DL, VT, V1, V1);
12515}
12516
12517// Check if the mask can be mapped to a TRUNCATE or VTRUNC, truncating the
12518// source into the lower elements and zeroing the upper elements.
12519static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT,
12520 ArrayRef<int> Mask, const APInt &Zeroable,
12521 const X86Subtarget &Subtarget) {
12522 if (!VT.is512BitVector() && !Subtarget.hasVLX())
12523 return false;
12524
12525 unsigned NumElts = Mask.size();
12526 unsigned EltSizeInBits = VT.getScalarSizeInBits();
12527 unsigned MaxScale = 64 / EltSizeInBits;
12528
12529 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
12530 unsigned SrcEltBits = EltSizeInBits * Scale;
12531 if (SrcEltBits < 32 && !Subtarget.hasBWI())
12532 continue;
12533 unsigned NumSrcElts = NumElts / Scale;
12534 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale))
12535 continue;
12536 unsigned UpperElts = NumElts - NumSrcElts;
12537 if (!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
12538 continue;
12539 SrcVT = MVT::getIntegerVT(EltSizeInBits * Scale);
12540 SrcVT = MVT::getVectorVT(SrcVT, NumSrcElts);
12541 DstVT = MVT::getIntegerVT(EltSizeInBits);
12542 if ((NumSrcElts * EltSizeInBits) >= 128) {
12543 // ISD::TRUNCATE
12544 DstVT = MVT::getVectorVT(DstVT, NumSrcElts);
12545 } else {
12546 // X86ISD::VTRUNC
12547 DstVT = MVT::getVectorVT(DstVT, 128 / EltSizeInBits);
12548 }
12549 return true;
12550 }
12551
12552 return false;
12553}
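
As a concrete instance of the match above: for VT = MVT::v16i8 and Scale = 2
(which requires AVX512BW, since the 16-bit source elements are narrower than 32
bits), the low 8 mask elements must form the sequence <0,2,4,6,8,10,12,14>
(allowing undefs) and the upper 8 elements must be zeroable; the shuffle is then
treated as a truncate of a v8i16 source, and because 8 x 8 bits is narrower than
128 bits the X86ISD::VTRUNC form with a widened v16i8 destination is chosen
rather than a plain ISD::TRUNCATE.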
12554
12555// Helper to create TRUNCATE/VTRUNC nodes, optionally with zero/undef upper
12556// element padding to the final DstVT.
12557static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src,
12558 const X86Subtarget &Subtarget,
12559 SelectionDAG &DAG, bool ZeroUppers) {
12560 MVT SrcVT = Src.getSimpleValueType();
12561 MVT DstSVT = DstVT.getScalarType();
12562 unsigned NumDstElts = DstVT.getVectorNumElements();
12563 unsigned NumSrcElts = SrcVT.getVectorNumElements();
12564 unsigned DstEltSizeInBits = DstVT.getScalarSizeInBits();
12565
12566 if (!DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
12567 return SDValue();
12568
12569 // Perform a direct ISD::TRUNCATE if possible.
12570 if (NumSrcElts == NumDstElts)
12571 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Src);
12572
12573 if (NumSrcElts > NumDstElts) {
12574 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
12575 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
12576 return extractSubVector(Trunc, 0, DAG, DL, DstVT.getSizeInBits());
12577 }
12578
12579 if ((NumSrcElts * DstEltSizeInBits) >= 128) {
12580 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
12581 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
12582 return widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
12583 DstVT.getSizeInBits());
12584 }
12585
12586 // Non-VLX targets must truncate from a 512-bit type, so we need to
12587 // widen, truncate and then possibly extract the original subvector.
12588 if (!Subtarget.hasVLX() && !SrcVT.is512BitVector()) {
12589 SDValue NewSrc = widenSubVector(Src, ZeroUppers, Subtarget, DAG, DL, 512);
12590 return getAVX512TruncNode(DL, DstVT, NewSrc, Subtarget, DAG, ZeroUppers);
12591 }
12592
12593 // Fallback to a X86ISD::VTRUNC, padding if necessary.
12594 MVT TruncVT = MVT::getVectorVT(DstSVT, 128 / DstEltSizeInBits);
12595 SDValue Trunc = DAG.getNode(X86ISD::VTRUNC, DL, TruncVT, Src);
12596 if (DstVT != TruncVT)
12597 Trunc = widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
12598 DstVT.getSizeInBits());
12599 return Trunc;
12600}
12601
12602// Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
12603//
12604// An example is the following:
12605//
12606// t0: ch = EntryToken
12607// t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
12608// t25: v4i32 = truncate t2
12609// t41: v8i16 = bitcast t25
12610// t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
12611// Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
12612// t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
12613// t18: v2i64 = bitcast t51
12614//
12615 // One can just use a single vpmovdw instruction; without avx512vl we need to
12616 // use the zmm variant and extract the lower subvector, padding with zeroes.
12617// TODO: Merge with lowerShuffleAsVTRUNC.
12618static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1,
12619 SDValue V2, ArrayRef<int> Mask,
12620 const APInt &Zeroable,
12621 const X86Subtarget &Subtarget,
12622 SelectionDAG &DAG) {
12623   assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type");
12624 if (!Subtarget.hasAVX512())
12625 return SDValue();
12626
12627 unsigned NumElts = VT.getVectorNumElements();
12628 unsigned EltSizeInBits = VT.getScalarSizeInBits();
12629 unsigned MaxScale = 64 / EltSizeInBits;
12630 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
12631 unsigned SrcEltBits = EltSizeInBits * Scale;
12632 unsigned NumSrcElts = NumElts / Scale;
12633 unsigned UpperElts = NumElts - NumSrcElts;
12634 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
12635 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
12636 continue;
12637
12638 // Attempt to find a matching source truncation, but as a fall back VLX
12639 // cases can use the VPMOV directly.
12640 SDValue Src = peekThroughBitcasts(V1);
12641 if (Src.getOpcode() == ISD::TRUNCATE &&
12642 Src.getScalarValueSizeInBits() == SrcEltBits) {
12643 Src = Src.getOperand(0);
12644 } else if (Subtarget.hasVLX()) {
12645 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
12646 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
12647 Src = DAG.getBitcast(SrcVT, Src);
12648 // Don't do this if PACKSS/PACKUS could perform it cheaper.
12649 if (Scale == 2 &&
12650 ((DAG.ComputeNumSignBits(Src) > EltSizeInBits) ||
12651 (DAG.computeKnownBits(Src).countMinLeadingZeros() >= EltSizeInBits)))
12652 return SDValue();
12653 } else
12654 return SDValue();
12655
12656 // VPMOVWB is only available with avx512bw.
12657 if (!Subtarget.hasBWI() && Src.getScalarValueSizeInBits() < 32)
12658 return SDValue();
12659
12660 bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts);
12661 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
12662 }
12663
12664 return SDValue();
12665}
12666
12667// Attempt to match binary shuffle patterns as a truncate.
12668static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,
12669 SDValue V2, ArrayRef<int> Mask,
12670 const APInt &Zeroable,
12671 const X86Subtarget &Subtarget,
12672 SelectionDAG &DAG) {
12673   assert((VT.is128BitVector() || VT.is256BitVector()) &&
12674          "Unexpected VTRUNC type");
12675 if (!Subtarget.hasAVX512())
12676 return SDValue();
12677
12678 unsigned NumElts = VT.getVectorNumElements();
12679 unsigned EltSizeInBits = VT.getScalarSizeInBits();
12680 unsigned MaxScale = 64 / EltSizeInBits;
12681 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
12682 // TODO: Support non-BWI VPMOVWB truncations?
12683 unsigned SrcEltBits = EltSizeInBits * Scale;
12684 if (SrcEltBits < 32 && !Subtarget.hasBWI())
12685 continue;
12686
12687 // Match shuffle <Ofs,Ofs+Scale,Ofs+2*Scale,..,undef_or_zero,undef_or_zero>
12688 // Bail if the V2 elements are undef.
12689 unsigned NumHalfSrcElts = NumElts / Scale;
12690 unsigned NumSrcElts = 2 * NumHalfSrcElts;
12691 for (unsigned Offset = 0; Offset != Scale; ++Offset) {
12692 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, Offset, Scale) ||
12693 isUndefInRange(Mask, NumHalfSrcElts, NumHalfSrcElts))
12694 continue;
12695
12696 // The elements beyond the truncation must be undef/zero.
12697 unsigned UpperElts = NumElts - NumSrcElts;
12698 if (UpperElts > 0 &&
12699 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
12700 continue;
12701 bool UndefUppers =
12702 UpperElts > 0 && isUndefInRange(Mask, NumSrcElts, UpperElts);
12703
12704 // For offset truncations, ensure that the concat is cheap.
12705 if (Offset) {
12706 auto IsCheapConcat = [&](SDValue Lo, SDValue Hi) {
12707 if (Lo.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
12708 Hi.getOpcode() == ISD::EXTRACT_SUBVECTOR)
12709 return Lo.getOperand(0) == Hi.getOperand(0);
12710 if (ISD::isNormalLoad(Lo.getNode()) &&
12711 ISD::isNormalLoad(Hi.getNode())) {
12712 auto *LDLo = cast<LoadSDNode>(Lo);
12713 auto *LDHi = cast<LoadSDNode>(Hi);
12714 return DAG.areNonVolatileConsecutiveLoads(
12715 LDHi, LDLo, Lo.getValueType().getStoreSize(), 1);
12716 }
12717 return false;
12718 };
12719 if (!IsCheapConcat(V1, V2))
12720 continue;
12721 }
12722
12723 // As we're using both sources then we need to concat them together
12724 // and truncate from the double-sized src.
12725 MVT ConcatVT = MVT::getVectorVT(VT.getScalarType(), NumElts * 2);
12726 SDValue Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2);
12727
12728 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
12729 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
12730 Src = DAG.getBitcast(SrcVT, Src);
12731
12732 // Shift the offset'd elements into place for the truncation.
12733 // TODO: Use getTargetVShiftByConstNode.
12734 if (Offset)
12735 Src = DAG.getNode(
12736 X86ISD::VSRLI, DL, SrcVT, Src,
12737 DAG.getTargetConstant(Offset * EltSizeInBits, DL, MVT::i8));
12738
12739 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
12740 }
12741 }
12742
12743 return SDValue();
12744}
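// Editorial illustration (not part of X86ISelLowering.cpp): a minimal,
// standalone sketch of the mask test used above, with hypothetical names and
// the undef/zeroable handling of the real code omitted. For two v8i16 inputs,
// Mask = <0,2,4,6,8,10,12,14> matches Scale = 2, Offset = 0: concat(V1,V2) is
// bitcast to v8i32 and truncated back to v8i16.
#include <vector>

static bool selectsEveryScaleTh(const std::vector<int> &Mask, int NumSrcElts,
                                int Offset, int Scale) {
  // Result element i must read concat element Offset + i * Scale.
  for (int i = 0; i < NumSrcElts; ++i)
    if (Mask[i] >= 0 && Mask[i] != Offset + i * Scale)
      return false;
  return true;
}
// Usage: selectsEveryScaleTh({0,2,4,6,8,10,12,14}, 8, /*Offset=*/0,
//                            /*Scale=*/2) returns true.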
12745
12746/// Check whether a compaction lowering can be done by dropping even/odd
12747/// elements and compute how many times even/odd elements must be dropped.
12748///
12749/// This handles shuffles which take every Nth element where N is a power of
12750/// two. Example shuffle masks:
12751///
12752/// (even)
12753/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
12754/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
12755/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
12756/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
12757/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
12758/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
12759///
12760/// (odd)
12761/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14
12762/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
12763///
12764/// Any of these lanes can of course be undef.
12765///
12766/// This routine only supports N <= 3.
12767/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
12768/// for larger N.
12769///
12770/// \returns N above, or the number of times even/odd elements must be dropped
12771/// if there is such a number. Otherwise returns zero.
12772static int canLowerByDroppingElements(ArrayRef<int> Mask, bool MatchEven,
12773 bool IsSingleInput) {
12774 // The modulus for the shuffle vector entries is based on whether this is
12775 // a single input or not.
12776 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
12777 assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
12778 "We should only be called with masks with a power-of-2 size!");
12779
12780 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
12781 int Offset = MatchEven ? 0 : 1;
12782
12783 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
12784 // and 2^3 simultaneously. This is because we may have ambiguity with
12785 // partially undef inputs.
12786 bool ViableForN[3] = {true, true, true};
12787
12788 for (int i = 0, e = Mask.size(); i < e; ++i) {
12789 // Ignore undef lanes, we'll optimistically collapse them to the pattern we
12790 // want.
12791 if (Mask[i] < 0)
12792 continue;
12793
12794 bool IsAnyViable = false;
12795 for (unsigned j = 0; j != std::size(ViableForN); ++j)
12796 if (ViableForN[j]) {
12797 uint64_t N = j + 1;
12798
12799 // The shuffle mask must be equal to (i * 2^N) % M.
12800 if ((uint64_t)(Mask[i] - Offset) == (((uint64_t)i << N) & ModMask))
12801 IsAnyViable = true;
12802 else
12803 ViableForN[j] = false;
12804 }
12805 // Early exit if we exhaust the possible powers of two.
12806 if (!IsAnyViable)
12807 break;
12808 }
12809
12810 for (unsigned j = 0; j != std::size(ViableForN); ++j)
12811 if (ViableForN[j])
12812 return j + 1;
12813
12814 // Return 0 as there is no viable power of two.
12815 return 0;
12816}
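// Editorial illustration (not part of X86ISelLowering.cpp): a standalone
// sketch of the even/odd-drop test above, using hypothetical names. For a
// single-input v16i8 mask <0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14> it returns
// N = 1 (drop every other element once).
#include <cstddef>
#include <cstdint>
#include <vector>

static int droppingFactor(const std::vector<int> &Mask, bool MatchEven,
                          bool IsSingleInput) {
  uint64_t ModMask = (uint64_t)(Mask.size() * (IsSingleInput ? 1 : 2)) - 1;
  int Offset = MatchEven ? 0 : 1;
  for (int N = 1; N <= 3; ++N) {
    bool Viable = true;
    for (size_t i = 0; i < Mask.size() && Viable; ++i)
      if (Mask[i] >= 0 &&
          (uint64_t)(Mask[i] - Offset) != (((uint64_t)i << N) & ModMask))
        Viable = false;
    if (Viable)
      return N; // Every 2^N'th element is kept.
  }
  return 0; // No viable power of two.
}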
12817
12818// X86 has dedicated pack instructions that can handle specific truncation
12819// operations: PACKSS and PACKUS.
12820// Checks for compaction shuffle masks if MaxStages > 1.
12821// TODO: Add support for matching multiple PACKSS/PACKUS stages.
12822static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
12823 unsigned &PackOpcode, ArrayRef<int> TargetMask,
12824 const SelectionDAG &DAG,
12825 const X86Subtarget &Subtarget,
12826 unsigned MaxStages = 1) {
12827 unsigned NumElts = VT.getVectorNumElements();
12828 unsigned BitSize = VT.getScalarSizeInBits();
12829 assert(0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 &&
12830 "Illegal maximum compaction");
12831
12832 auto MatchPACK = [&](SDValue N1, SDValue N2, MVT PackVT) {
12833 unsigned NumSrcBits = PackVT.getScalarSizeInBits();
12834 unsigned NumPackedBits = NumSrcBits - BitSize;
12835 N1 = peekThroughBitcasts(N1);
12836 N2 = peekThroughBitcasts(N2);
12837 unsigned NumBits1 = N1.getScalarValueSizeInBits();
12838 unsigned NumBits2 = N2.getScalarValueSizeInBits();
12839 bool IsZero1 = llvm::isNullOrNullSplat(N1, /*AllowUndefs*/ false);
12840 bool IsZero2 = llvm::isNullOrNullSplat(N2, /*AllowUndefs*/ false);
12841 if ((!N1.isUndef() && !IsZero1 && NumBits1 != NumSrcBits) ||
12842 (!N2.isUndef() && !IsZero2 && NumBits2 != NumSrcBits))
12843 return false;
12844 if (Subtarget.hasSSE41() || BitSize == 8) {
12845 APInt ZeroMask = APInt::getHighBitsSet(NumSrcBits, NumPackedBits);
12846 if ((N1.isUndef() || IsZero1 || DAG.MaskedValueIsZero(N1, ZeroMask)) &&
12847 (N2.isUndef() || IsZero2 || DAG.MaskedValueIsZero(N2, ZeroMask))) {
12848 V1 = N1;
12849 V2 = N2;
12850 SrcVT = PackVT;
12851 PackOpcode = X86ISD::PACKUS;
12852 return true;
12853 }
12854 }
12855 bool IsAllOnes1 = llvm::isAllOnesOrAllOnesSplat(N1, /*AllowUndefs*/ false);
12856 bool IsAllOnes2 = llvm::isAllOnesOrAllOnesSplat(N2, /*AllowUndefs*/ false);
12857 if ((N1.isUndef() || IsZero1 || IsAllOnes1 ||
12858 DAG.ComputeNumSignBits(N1) > NumPackedBits) &&
12859 (N2.isUndef() || IsZero2 || IsAllOnes2 ||
12860 DAG.ComputeNumSignBits(N2) > NumPackedBits)) {
12861 V1 = N1;
12862 V2 = N2;
12863 SrcVT = PackVT;
12864 PackOpcode = X86ISD::PACKSS;
12865 return true;
12866 }
12867 return false;
12868 };
12869
12870 // Attempt to match against wider and wider compaction patterns.
12871 for (unsigned NumStages = 1; NumStages <= MaxStages; ++NumStages) {
12872 MVT PackSVT = MVT::getIntegerVT(BitSize << NumStages);
12873 MVT PackVT = MVT::getVectorVT(PackSVT, NumElts >> NumStages);
12874
12875 // Try binary shuffle.
12876 SmallVector<int, 32> BinaryMask;
12877 createPackShuffleMask(VT, BinaryMask, false, NumStages);
12878 if (isTargetShuffleEquivalent(VT, TargetMask, BinaryMask, DAG, V1, V2))
12879 if (MatchPACK(V1, V2, PackVT))
12880 return true;
12881
12882 // Try unary shuffle.
12883 SmallVector<int, 32> UnaryMask;
12884 createPackShuffleMask(VT, UnaryMask, true, NumStages);
12885 if (isTargetShuffleEquivalent(VT, TargetMask, UnaryMask, DAG, V1))
12886 if (MatchPACK(V1, V1, PackVT))
12887 return true;
12888 }
12889
12890 return false;
12891}
12892
12893static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
12894 SDValue V1, SDValue V2, SelectionDAG &DAG,
12895 const X86Subtarget &Subtarget) {
12896 MVT PackVT;
12897 unsigned PackOpcode;
12898 unsigned SizeBits = VT.getSizeInBits();
12899 unsigned EltBits = VT.getScalarSizeInBits();
12900 unsigned MaxStages = Log2_32(64 / EltBits);
12901 if (!matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
12902 Subtarget, MaxStages))
12903 return SDValue();
12904
12905 unsigned CurrentEltBits = PackVT.getScalarSizeInBits();
12906 unsigned NumStages = Log2_32(CurrentEltBits / EltBits);
12907
12908 // Don't lower multi-stage packs on AVX512, truncation is better.
12909 if (NumStages != 1 && SizeBits == 128 && Subtarget.hasVLX())
12910 return SDValue();
12911
12912 // Pack to the largest type possible:
12913 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
12914 unsigned MaxPackBits = 16;
12915 if (CurrentEltBits > 16 &&
12916 (PackOpcode == X86ISD::PACKSS || Subtarget.hasSSE41()))
12917 MaxPackBits = 32;
12918
12919 // Repeatedly pack down to the target size.
12920 SDValue Res;
12921 for (unsigned i = 0; i != NumStages; ++i) {
12922 unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits);
12923 unsigned NumSrcElts = SizeBits / SrcEltBits;
12924 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
12925 MVT DstSVT = MVT::getIntegerVT(SrcEltBits / 2);
12926 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
12927 MVT DstVT = MVT::getVectorVT(DstSVT, NumSrcElts * 2);
12928 Res = DAG.getNode(PackOpcode, DL, DstVT, DAG.getBitcast(SrcVT, V1),
12929 DAG.getBitcast(SrcVT, V2));
12930 V1 = V2 = Res;
12931 CurrentEltBits /= 2;
12932 }
12933 assert(Res && Res.getValueType() == VT &&
12934 "Failed to lower compaction shuffle");
12935 return Res;
12936}
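// Editorial illustration (not part of X86ISelLowering.cpp): a hedged sketch of
// the staging arithmetic above, using hypothetical names. For a v16i8 result
// packed from 32-bit source elements, NumStages = log2(32 / 8) = 2:
// v4i32 -> v8i16 -> v16i8, i.e. one PACK*DW followed by one PACK*WB.
#include <cstdio>

static void printPackStages(unsigned SizeBits, unsigned DstEltBits,
                            unsigned SrcEltBits) {
  unsigned CurrentEltBits = SrcEltBits;
  while (CurrentEltBits > DstEltBits) {
    unsigned NumSrcElts = SizeBits / CurrentEltBits;
    std::printf("pack v%ui%u -> v%ui%u\n", NumSrcElts, CurrentEltBits,
                NumSrcElts * 2, CurrentEltBits / 2);
    CurrentEltBits /= 2;
  }
}
// printPackStages(128, 8, 32) prints:
//   pack v4i32 -> v8i16
//   pack v8i16 -> v16i8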
12937
12938/// Try to emit a bitmask instruction for a shuffle.
12939///
12940/// This handles cases where we can model a blend exactly as a bitmask due to
12941/// one of the inputs being zeroable.
12942static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
12943 SDValue V2, ArrayRef<int> Mask,
12944 const APInt &Zeroable,
12945 const X86Subtarget &Subtarget,
12946 SelectionDAG &DAG) {
12947 MVT MaskVT = VT;
12948 MVT EltVT = VT.getVectorElementType();
12949 SDValue Zero, AllOnes;
12950 // Use f64 if i64 isn't legal.
12951 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
12952 EltVT = MVT::f64;
12953 MaskVT = MVT::getVectorVT(EltVT, Mask.size());
12954 }
12955
12956 MVT LogicVT = VT;
12957 if (EltVT == MVT::f32 || EltVT == MVT::f64) {
12958 Zero = DAG.getConstantFP(0.0, DL, EltVT);
12959 APFloat AllOnesValue =
12960 APFloat::getAllOnesValue(SelectionDAG::EVTToAPFloatSemantics(EltVT));
12961 AllOnes = DAG.getConstantFP(AllOnesValue, DL, EltVT);
12962 LogicVT =
12963 MVT::getVectorVT(EltVT == MVT::f64 ? MVT::i64 : MVT::i32, Mask.size());
12964 } else {
12965 Zero = DAG.getConstant(0, DL, EltVT);
12966 AllOnes = DAG.getAllOnesConstant(DL, EltVT);
12967 }
12968
12969 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
12970 SDValue V;
12971 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
12972 if (Zeroable[i])
12973 continue;
12974 if (Mask[i] % Size != i)
12975 return SDValue(); // Not a blend.
12976 if (!V)
12977 V = Mask[i] < Size ? V1 : V2;
12978 else if (V != (Mask[i] < Size ? V1 : V2))
12979 return SDValue(); // Can only let one input through the mask.
12980
12981 VMaskOps[i] = AllOnes;
12982 }
12983 if (!V)
12984 return SDValue(); // No non-zeroable elements!
12985
12986 SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);
12987 VMask = DAG.getBitcast(LogicVT, VMask);
12988 V = DAG.getBitcast(LogicVT, V);
12989 SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask);
12990 return DAG.getBitcast(VT, And);
12991}
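// Editorial illustration (not part of X86ISelLowering.cpp): a scalar model of
// the bitmask lowering above, with hypothetical names. A blend against zero
// becomes a single AND with a constant whose lanes are all-ones or all-zero.
#include <cstddef>
#include <cstdint>
#include <vector>

static std::vector<uint32_t>
blendWithZeroAsAnd(const std::vector<uint32_t> &V,
                   const std::vector<bool> &Zeroable) {
  std::vector<uint32_t> Out(V.size());
  for (size_t i = 0; i < V.size(); ++i) {
    uint32_t LaneMask = Zeroable[i] ? 0u : ~0u; // constant vector VMask
    Out[i] = V[i] & LaneMask;                   // the ISD::AND does the blend
  }
  return Out;
}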
12992
12993/// Try to emit a blend instruction for a shuffle using bit math.
12994///
12995/// This is used as a fallback approach when first class blend instructions are
12996/// unavailable. Currently it is only suitable for integer vectors, but could
12997/// be generalized for floating point vectors if desirable.
12998static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
12999 SDValue V2, ArrayRef<int> Mask,
13000 SelectionDAG &DAG) {
13001 assert(VT.isInteger() && "Only supports integer vector types!");
13002 MVT EltVT = VT.getVectorElementType();
13003 SDValue Zero = DAG.getConstant(0, DL, EltVT);
13004 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
13005 SmallVector<SDValue, 16> MaskOps;
13006 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
13007 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
13008 return SDValue(); // Shuffled input!
13009 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
13010 }
13011
13012 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
13013 V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
13014 V2 = DAG.getNode(X86ISD::ANDNP, DL, VT, V1Mask, V2);
13015 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
13016}
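// Editorial illustration (not part of X86ISelLowering.cpp): the bit-math blend
// above in scalar form, with hypothetical names. Lane i takes V1 where the
// mask lane is all-ones and V2 where it is zero: (V1 & M) | (~M & V2), which
// maps onto the AND + ANDNP + OR sequence emitted above.
#include <cstddef>
#include <cstdint>
#include <vector>

static std::vector<uint32_t> bitBlend(const std::vector<uint32_t> &V1,
                                      const std::vector<uint32_t> &V2,
                                      const std::vector<uint32_t> &M) {
  std::vector<uint32_t> Out(V1.size());
  for (size_t i = 0; i < V1.size(); ++i)
    Out[i] = (V1[i] & M[i]) | (~M[i] & V2[i]);
  return Out;
}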
13017
13018static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
13019 SDValue PreservedSrc,
13020 const X86Subtarget &Subtarget,
13021 SelectionDAG &DAG);
13022
13023static bool matchShuffleAsBlend(MVT VT, SDValue V1, SDValue V2,
13024 MutableArrayRef<int> Mask,
13025 const APInt &Zeroable, bool &ForceV1Zero,
13026 bool &ForceV2Zero, uint64_t &BlendMask) {
13027 bool V1IsZeroOrUndef =
13028 V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
13029 bool V2IsZeroOrUndef =
13030 V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
13031
13032 BlendMask = 0;
13033 ForceV1Zero = false, ForceV2Zero = false;
13034 assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask");
13035
13036 int NumElts = Mask.size();
13037 int NumLanes = VT.getSizeInBits() / 128;
13038 int NumEltsPerLane = NumElts / NumLanes;
13039 assert((NumLanes * NumEltsPerLane) == NumElts && "Value type mismatch");
13040
13041 // For 32/64-bit elements, if we only reference one input (plus any undefs),
13042 // then ensure the blend mask part for that lane just references that input.
13043 bool ForceWholeLaneMasks =
13044 VT.is256BitVector() && VT.getScalarSizeInBits() >= 32;
13045
13046 // Attempt to generate the binary blend mask. If an input is zero then
13047 // we can use any lane.
13048 for (int Lane = 0; Lane != NumLanes; ++Lane) {
13049 // Keep track of the inputs used per lane.
13050 bool LaneV1InUse = false;
13051 bool LaneV2InUse = false;
13052 uint64_t LaneBlendMask = 0;
13053 for (int LaneElt = 0; LaneElt != NumEltsPerLane; ++LaneElt) {
13054 int Elt = (Lane * NumEltsPerLane) + LaneElt;
13055 int M = Mask[Elt];
13056 if (M == SM_SentinelUndef)
13057 continue;
13058 if (M == Elt || (0 <= M && M < NumElts &&
13059 IsElementEquivalent(NumElts, V1, V1, M, Elt))) {
13060 Mask[Elt] = Elt;
13061 LaneV1InUse = true;
13062 continue;
13063 }
13064 if (M == (Elt + NumElts) ||
13065 (NumElts <= M &&
13066 IsElementEquivalent(NumElts, V2, V2, M - NumElts, Elt))) {
13067 LaneBlendMask |= 1ull << LaneElt;
13068 Mask[Elt] = Elt + NumElts;
13069 LaneV2InUse = true;
13070 continue;
13071 }
13072 if (Zeroable[Elt]) {
13073 if (V1IsZeroOrUndef) {
13074 ForceV1Zero = true;
13075 Mask[Elt] = Elt;
13076 LaneV1InUse = true;
13077 continue;
13078 }
13079 if (V2IsZeroOrUndef) {
13080 ForceV2Zero = true;
13081 LaneBlendMask |= 1ull << LaneElt;
13082 Mask[Elt] = Elt + NumElts;
13083 LaneV2InUse = true;
13084 continue;
13085 }
13086 }
13087 return false;
13088 }
13089
13090 // If we only used V2 then splat the lane blend mask to avoid any demanded
13091 // elts from V1 in this lane (the V1 equivalent is implicit with a zero
13092 // blend mask bit).
13093 if (ForceWholeLaneMasks && LaneV2InUse && !LaneV1InUse)
13094 LaneBlendMask = (1ull << NumEltsPerLane) - 1;
13095
13096 BlendMask |= LaneBlendMask << (Lane * NumEltsPerLane);
13097 }
13098 return true;
13099}
13100
13101static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size,
13102 int Scale) {
13103 uint64_t ScaledMask = 0;
13104 for (int i = 0; i != Size; ++i)
13105 if (BlendMask & (1ull << i))
13106 ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
13107 return ScaledMask;
13108}
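// Editorial illustration (not part of X86ISelLowering.cpp): what the scaling
// above does. Widening a per-element blend mask by Scale repeats each bit
// Scale times, e.g. 0b0101 over 4 elements scaled by 2 becomes 0b00110011.
#include <cstdint>

constexpr uint64_t scaleBlendMask(uint64_t BlendMask, int Size, int Scale) {
  uint64_t ScaledMask = 0;
  for (int i = 0; i != Size; ++i)
    if (BlendMask & (1ull << i))
      ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
  return ScaledMask;
}
static_assert(scaleBlendMask(0b0101, 4, 2) == 0b00110011,
              "each set mask bit is repeated Scale times");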
13109
13110/// Try to emit a blend instruction for a shuffle.
13111///
13112/// This doesn't do any checks for the availability of instructions for blending
13113/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
13114/// be matched in the backend with the type given. What it does check for is
13115/// that the shuffle mask is a blend, or convertible into a blend with zero.
13116static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
13117 SDValue V2, ArrayRef<int> Original,
13118 const APInt &Zeroable,
13119 const X86Subtarget &Subtarget,
13120 SelectionDAG &DAG) {
13121 uint64_t BlendMask = 0;
13122 bool ForceV1Zero = false, ForceV2Zero = false;
13123 SmallVector<int, 64> Mask(Original);
13124 if (!matchShuffleAsBlend(VT, V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
13125 BlendMask))
13126 return SDValue();
13127
13128 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
13129 if (ForceV1Zero)
13130 V1 = getZeroVector(VT, Subtarget, DAG, DL);
13131 if (ForceV2Zero)
13132 V2 = getZeroVector(VT, Subtarget, DAG, DL);
13133
13134 unsigned NumElts = VT.getVectorNumElements();
13135
13136 switch (VT.SimpleTy) {
13137 case MVT::v4i64:
13138 case MVT::v8i32:
13139 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
13140 [[fallthrough]];
13141 case MVT::v4f64:
13142 case MVT::v8f32:
13143 assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
13144 [[fallthrough]];
13145 case MVT::v2f64:
13146 case MVT::v2i64:
13147 case MVT::v4f32:
13148 case MVT::v4i32:
13149 case MVT::v8i16:
13150 assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
13151 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
13152 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
13153 case MVT::v16i16: {
13154 assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
13155 SmallVector<int, 8> RepeatedMask;
13156 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
13157 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
13158 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
13159 BlendMask = 0;
13160 for (int i = 0; i < 8; ++i)
13161 if (RepeatedMask[i] >= 8)
13162 BlendMask |= 1ull << i;
13163 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
13164 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
13165 }
13166 // Use PBLENDW for lower/upper lanes and then blend lanes.
13167 // TODO - we should allow 2 PBLENDW here and leave shuffle combine to
13168 // merge to VSELECT where useful.
13169 uint64_t LoMask = BlendMask & 0xFF;
13170 uint64_t HiMask = (BlendMask >> 8) & 0xFF;
13171 if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
13172 SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
13173 DAG.getTargetConstant(LoMask, DL, MVT::i8));
13174 SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
13175 DAG.getTargetConstant(HiMask, DL, MVT::i8));
13176 return DAG.getVectorShuffle(
13177 MVT::v16i16, DL, Lo, Hi,
13178 {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
13179 }
13180 [[fallthrough]];
13181 }
13182 case MVT::v32i8:
13183 assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
13184 [[fallthrough]];
13185 case MVT::v16i8: {
13186 assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
13187
13188 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
13189 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
13190 Subtarget, DAG))
13191 return Masked;
13192
13193 if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
13194 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
13195 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
13196 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
13197 }
13198
13199 // If we have VPTERNLOG, we can use that as a bit blend.
13200 if (Subtarget.hasVLX())
13201 if (SDValue BitBlend =
13202 lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
13203 return BitBlend;
13204
13205 // Scale the blend by the number of bytes per element.
13206 int Scale = VT.getScalarSizeInBits() / 8;
13207
13208 // This form of blend is always done on bytes. Compute the byte vector
13209 // type.
13210 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
13211
13212 // x86 allows load folding with blendvb from the 2nd source operand. But
13213 // we are still using LLVM select here (see comment below), so that's V1.
13214 // If V2 can be load-folded and V1 cannot be load-folded, then commute to
13215 // allow that load-folding possibility.
13216 if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
13217 ShuffleVectorSDNode::commuteMask(Mask);
13218 std::swap(V1, V2);
13219 }
13220
13221 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
13222 // mix of LLVM's code generator and the x86 backend. We tell the code
13223 // generator that boolean values in the elements of an x86 vector register
13224 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
13225 // mapping a select to operand #1, and 'false' mapping to operand #2. The
13226 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
13227 // of the element (the remaining are ignored) and 0 in that high bit would
13228 // mean operand #1 while 1 in the high bit would mean operand #2. So while
13229 // the LLVM model for boolean values in vector elements gets the relevant
13230 // bit set, it is set backwards and over constrained relative to x86's
13231 // actual model.
13232 SmallVector<SDValue, 32> VSELECTMask;
13233 for (int i = 0, Size = Mask.size(); i < Size; ++i)
13234 for (int j = 0; j < Scale; ++j)
13235 VSELECTMask.push_back(
13236 Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
13237 : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
13238 MVT::i8));
13239
13240 V1 = DAG.getBitcast(BlendVT, V1);
13241 V2 = DAG.getBitcast(BlendVT, V2);
13242 return DAG.getBitcast(
13243 VT,
13244 DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
13245 V1, V2));
13246 }
13247 case MVT::v16f32:
13248 case MVT::v8f64:
13249 case MVT::v8i64:
13250 case MVT::v16i32:
13251 case MVT::v32i16:
13252 case MVT::v64i8: {
13253 // Attempt to lower to a bitmask if we can. Only if not optimizing for size.
13254 bool OptForSize = DAG.shouldOptForSize();
13255 if (!OptForSize) {
13256 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
13257 Subtarget, DAG))
13258 return Masked;
13259 }
13260
13261 // Otherwise load an immediate into a GPR, cast to k-register, and use a
13262 // masked move.
13263 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
13264 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
13265 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
13266 }
13267 default:
13268 llvm_unreachable("Not a supported integer vector type!");
13269 }
13270}
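// Editorial illustration (not part of X86ISelLowering.cpp): how the byte-blend
// path above expands an element blend mask into the per-byte VSELECT mask,
// with hypothetical names and undef handling omitted. Lanes taken from V1
// (index < Size) get -1 bytes (all bits set) and lanes taken from V2 get 0.
#include <cstdint>
#include <vector>

static std::vector<int8_t> buildVSelectByteMask(const std::vector<int> &Mask,
                                                int BytesPerElt) {
  std::vector<int8_t> Bytes;
  int Size = (int)Mask.size();
  for (int M : Mask)
    for (int j = 0; j < BytesPerElt; ++j)
      Bytes.push_back(M < Size ? -1 : 0); // -1 selects V1, 0 selects V2
  return Bytes;
}
// buildVSelectByteMask({0, 5, 2, 7}, 4) -> 4 bytes of -1, 4 of 0, 4 of -1, 4 of 0.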
13271
13272/// Try to lower as a blend of elements from two inputs followed by
13273/// a single-input permutation.
13274///
13275/// This matches the pattern where we can blend elements from two inputs and
13276/// then reduce the shuffle to a single-input permutation.
13277static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
13278 SDValue V1, SDValue V2,
13279 ArrayRef<int> Mask,
13280 SelectionDAG &DAG,
13281 bool ImmBlends = false) {
13282 // We build up the blend mask while checking whether a blend is a viable way
13283 // to reduce the shuffle.
13284 SmallVector<int, 32> BlendMask(Mask.size(), -1);
13285 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
13286
13287 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
13288 if (Mask[i] < 0)
13289 continue;
13290
13291 assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
13292
13293 if (BlendMask[Mask[i] % Size] < 0)
13294 BlendMask[Mask[i] % Size] = Mask[i];
13295 else if (BlendMask[Mask[i] % Size] != Mask[i])
13296 return SDValue(); // Can't blend in the needed input!
13297
13298 PermuteMask[i] = Mask[i] % Size;
13299 }
13300
13301 // If only immediate blends, then bail if the blend mask can't be widened to
13302 // i16.
13303 unsigned EltSize = VT.getScalarSizeInBits();
13304 if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask))
13305 return SDValue();
13306
13307 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
13308 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
13309}
13310
13311/// Try to lower as an unpack of elements from two inputs followed by
13312/// a single-input permutation.
13313///
13314/// This matches the pattern where we can unpack elements from two inputs and
13315/// then reduce the shuffle to a single-input (wider) permutation.
13316static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
13317 SDValue V1, SDValue V2,
13318 ArrayRef<int> Mask,
13319 SelectionDAG &DAG) {
13320 int NumElts = Mask.size();
13321 int NumLanes = VT.getSizeInBits() / 128;
13322 int NumLaneElts = NumElts / NumLanes;
13323 int NumHalfLaneElts = NumLaneElts / 2;
13324
13325 bool MatchLo = true, MatchHi = true;
13326 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
13327
13328 // Determine UNPCKL/UNPCKH type and operand order.
13329 for (int Elt = 0; Elt != NumElts; ++Elt) {
13330 int M = Mask[Elt];
13331 if (M < 0)
13332 continue;
13333
13334 // Normalize the mask value depending on whether it's V1 or V2.
13335 int NormM = M;
13336 SDValue &Op = Ops[Elt & 1];
13337 if (M < NumElts && (Op.isUndef() || Op == V1))
13338 Op = V1;
13339 else if (NumElts <= M && (Op.isUndef() || Op == V2)) {
13340 Op = V2;
13341 NormM -= NumElts;
13342 } else
13343 return SDValue();
13344
13345 bool MatchLoAnyLane = false, MatchHiAnyLane = false;
13346 for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
13347 int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;
13348 MatchLoAnyLane |= isUndefOrInRange(NormM, Lo, Mid);
13349 MatchHiAnyLane |= isUndefOrInRange(NormM, Mid, Hi);
13350 if (MatchLoAnyLane || MatchHiAnyLane) {
13351 assert((MatchLoAnyLane ^ MatchHiAnyLane) &&
13352 "Failed to match UNPCKLO/UNPCKHI");
13353 break;
13354 }
13355 }
13356 MatchLo &= MatchLoAnyLane;
13357 MatchHi &= MatchHiAnyLane;
13358 if (!MatchLo && !MatchHi)
13359 return SDValue();
13360 }
13361 assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");
13362
13363 // Element indices have changed after unpacking. Calculate permute mask
13364 // so that they will be put back to the position as dictated by the
13365 // original shuffle mask indices.
13366 SmallVector<int, 32> PermuteMask(NumElts, -1);
13367 for (int Elt = 0; Elt != NumElts; ++Elt) {
13368 int M = Mask[Elt];
13369 if (M < 0)
13370 continue;
13371 int NormM = M;
13372 if (NumElts <= M)
13373 NormM -= NumElts;
13374 bool IsFirstOp = M < NumElts;
13375 int BaseMaskElt =
13376 NumLaneElts * (NormM / NumLaneElts) + (2 * (NormM % NumHalfLaneElts));
13377 if ((IsFirstOp && V1 == Ops[0]) || (!IsFirstOp && V2 == Ops[0]))
13378 PermuteMask[Elt] = BaseMaskElt;
13379 else if ((IsFirstOp && V1 == Ops[1]) || (!IsFirstOp && V2 == Ops[1]))
13380 PermuteMask[Elt] = BaseMaskElt + 1;
13381 assert(PermuteMask[Elt] != -1 &&
13382 "Input mask element is defined but failed to assign permute mask");
13383 }
13384
13385 unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
13386 SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops);
13387 return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
13388}
13389
13390/// Try to lower a shuffle as a permute of the inputs followed by an
13391/// UNPCK instruction.
13392///
13393/// This specifically targets cases where we end up with alternating between
13394/// the two inputs, and so can permute them into something that feeds a single
13395/// UNPCK instruction. Note that this routine only targets integer vectors
13396/// because for floating point vectors we have a generalized SHUFPS lowering
13397/// strategy that handles everything that doesn't *exactly* match an unpack,
13398/// making this clever lowering unnecessary.
13399static SDValue lowerShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
13400 SDValue V1, SDValue V2,
13401 ArrayRef<int> Mask,
13402 const X86Subtarget &Subtarget,
13403 SelectionDAG &DAG) {
13404 int Size = Mask.size();
13405 assert(Mask.size() >= 2 && "Single element masks are invalid.");
13406
13407 // This routine only supports 128-bit integer dual input vectors.
13408 if (VT.isFloatingPoint() || !VT.is128BitVector() || V2.isUndef())
13409 return SDValue();
13410
13411 int NumLoInputs =
13412 count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
13413 int NumHiInputs =
13414 count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
13415
13416 bool UnpackLo = NumLoInputs >= NumHiInputs;
13417
13418 auto TryUnpack = [&](int ScalarSize, int Scale) {
13419 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
13420 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
13421
13422 for (int i = 0; i < Size; ++i) {
13423 if (Mask[i] < 0)
13424 continue;
13425
13426 // Each element of the unpack contains Scale elements from this mask.
13427 int UnpackIdx = i / Scale;
13428
13429 // We only handle the case where V1 feeds the first slots of the unpack.
13430 // We rely on canonicalization to ensure this is the case.
13431 if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
13432 return SDValue();
13433
13434 // Setup the mask for this input. The indexing is tricky as we have to
13435 // handle the unpack stride.
13436 SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
13437 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
13438 Mask[i] % Size;
13439 }
13440
13441 // If we will have to shuffle both inputs to use the unpack, check whether
13442 // we can just unpack first and shuffle the result. If so, skip this unpack.
13443 if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
13444 !isNoopShuffleMask(V2Mask))
13445 return SDValue();
13446
13447 // Shuffle the inputs into place.
13448 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
13449 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
13450
13451 // Cast the inputs to the type we will use to unpack them.
13452 MVT UnpackVT =
13453 MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
13454 V1 = DAG.getBitcast(UnpackVT, V1);
13455 V2 = DAG.getBitcast(UnpackVT, V2);
13456
13457 // Unpack the inputs and cast the result back to the desired type.
13458 return DAG.getBitcast(
13459 VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
13460 UnpackVT, V1, V2));
13461 };
13462
13463 // We try each unpack from the largest to the smallest to try and find one
13464 // that fits this mask.
13465 int OrigScalarSize = VT.getScalarSizeInBits();
13466 for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
13467 if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
13468 return Unpack;
13469
13470 // If we're shuffling with a zero vector then we're better off not doing
13471 // VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements.
13472 if (ISD::isBuildVectorAllZeros(V1.getNode()) ||
13473 ISD::isBuildVectorAllZeros(V2.getNode()))
13474 return SDValue();
13475
13476 // If none of the unpack-rooted lowerings worked (or were profitable) try an
13477 // initial unpack.
13478 if (NumLoInputs == 0 || NumHiInputs == 0) {
13479 assert((NumLoInputs > 0 || NumHiInputs > 0) &&
13480 "We have to have *some* inputs!");
13481 int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
13482
13483 // FIXME: We could consider the total complexity of the permute of each
13484 // possible unpacking. Or at the least we should consider how many
13485 // half-crossings are created.
13486 // FIXME: We could consider commuting the unpacks.
13487
13488 SmallVector<int, 32> PermMask((unsigned)Size, -1);
13489 for (int i = 0; i < Size; ++i) {
13490 if (Mask[i] < 0)
13491 continue;
13492
13493 assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
13494
13495 PermMask[i] =
13496 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
13497 }
13498 return DAG.getVectorShuffle(
13499 VT, DL,
13500 DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL, DL, VT,
13501 V1, V2),
13502 DAG.getUNDEF(VT), PermMask);
13503 }
13504
13505 return SDValue();
13506}
13507
13508/// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
13509/// permuting the elements of the result in place.
13510static SDValue lowerShuffleAsByteRotateAndPermute(
13511 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13512 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13513 if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
13514 (VT.is256BitVector() && !Subtarget.hasAVX2()) ||
13515 (VT.is512BitVector() && !Subtarget.hasBWI()))
13516 return SDValue();
13517
13518 // We don't currently support lane crossing permutes.
13519 if (is128BitLaneCrossingShuffleMask(VT, Mask))
13520 return SDValue();
13521
13522 int Scale = VT.getScalarSizeInBits() / 8;
13523 int NumLanes = VT.getSizeInBits() / 128;
13524 int NumElts = VT.getVectorNumElements();
13525 int NumEltsPerLane = NumElts / NumLanes;
13526
13527 // Determine range of mask elts.
13528 bool Blend1 = true;
13529 bool Blend2 = true;
13530 std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
13531 std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
13532 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
13533 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
13534 int M = Mask[Lane + Elt];
13535 if (M < 0)
13536 continue;
13537 if (M < NumElts) {
13538 Blend1 &= (M == (Lane + Elt));
13539 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
13540 M = M % NumEltsPerLane;
13541 Range1.first = std::min(Range1.first, M);
13542 Range1.second = std::max(Range1.second, M);
13543 } else {
13544 M -= NumElts;
13545 Blend2 &= (M == (Lane + Elt));
13546 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
13547 M = M % NumEltsPerLane;
13548 Range2.first = std::min(Range2.first, M);
13549 Range2.second = std::max(Range2.second, M);
13550 }
13551 }
13552 }
13553
13554 // Bail if we don't need both elements.
13555 // TODO - it might be worth doing this for unary shuffles if the permute
13556 // can be widened.
13557 if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
13558 !(0 <= Range2.first && Range2.second < NumEltsPerLane))
13559 return SDValue();
13560
13561 if (VT.getSizeInBits() > 128 && (Blend1 || Blend2))
13562 return SDValue();
13563
13564 // Rotate the 2 ops so we can access both ranges, then permute the result.
13565 auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) {
13566 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
13567 SDValue Rotate = DAG.getBitcast(
13568 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
13569 DAG.getBitcast(ByteVT, Lo),
13570 DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8)));
13571 SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
13572 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
13573 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
13574 int M = Mask[Lane + Elt];
13575 if (M < 0)
13576 continue;
13577 if (M < NumElts)
13578 PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
13579 else
13580 PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
13581 }
13582 }
13583 return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask);
13584 };
13585
13586 // Check if the ranges are small enough to rotate from either direction.
13587 if (Range2.second < Range1.first)
13588 return RotateAndPermute(V1, V2, Range1.first, 0);
13589 if (Range1.second < Range2.first)
13590 return RotateAndPermute(V2, V1, Range2.first, NumElts);
13591 return SDValue();
13592}
13593
13594static bool isBroadcastShuffleMask(ArrayRef<int> Mask) {
13595 return isUndefOrEqual(Mask, 0);
13596}
13597
13598static bool isNoopOrBroadcastShuffleMask(ArrayRef<int> Mask) {
13599 return isNoopShuffleMask(Mask) || isBroadcastShuffleMask(Mask);
13600}
13601
13602/// Check if the Mask consists of the same element repeated multiple times.
13603static bool isSingleElementRepeatedMask(ArrayRef<int> Mask) {
13604 size_t NumUndefs = 0;
13605 std::optional<int> UniqueElt;
13606 for (int Elt : Mask) {
13607 if (Elt == SM_SentinelUndef) {
13608 NumUndefs++;
13609 continue;
13610 }
13611 if (UniqueElt.has_value() && UniqueElt.value() != Elt)
13612 return false;
13613 UniqueElt = Elt;
13614 }
13615 // Make sure the element is repeated enough times by checking the number of
13616 // undefs is small.
13617 return NumUndefs <= Mask.size() / 2 && UniqueElt.has_value();
13618}
13619
13620/// Generic routine to decompose a shuffle and blend into independent
13621/// blends and permutes.
13622///
13623/// This matches the extremely common pattern for handling combined
13624/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
13625/// operations. It will try to pick the best arrangement of shuffles and
13626/// blends. For vXi8/vXi16 shuffles we may use unpack instead of blend.
13627static SDValue lowerShuffleAsDecomposedShuffleMerge(
13628 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13629 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13630 int NumElts = Mask.size();
13631 int NumLanes = VT.getSizeInBits() / 128;
13632 int NumEltsPerLane = NumElts / NumLanes;
13633
13634 // Shuffle the input elements into the desired positions in V1 and V2 and
13635 // unpack/blend them together.
13636 bool IsAlternating = true;
13637 SmallVector<int, 32> V1Mask(NumElts, -1);
13638 SmallVector<int, 32> V2Mask(NumElts, -1);
13639 SmallVector<int, 32> FinalMask(NumElts, -1);
13640 for (int i = 0; i < NumElts; ++i) {
13641 int M = Mask[i];
13642 if (M >= 0 && M < NumElts) {
13643 V1Mask[i] = M;
13644 FinalMask[i] = i;
13645 IsAlternating &= (i & 1) == 0;
13646 } else if (M >= NumElts) {
13647 V2Mask[i] = M - NumElts;
13648 FinalMask[i] = i + NumElts;
13649 IsAlternating &= (i & 1) == 1;
13650 }
13651 }
13652
13653 // If we effectively demand only the 0'th element of \p Input, but not just
13654 // at the 0'th position, then broadcast said input,
13655 // and change \p InputMask to be a no-op (identity) mask.
13656 auto canonicalizeBroadcastableInput = [DL, VT, &Subtarget,
13657 &DAG](SDValue &Input,
13658 MutableArrayRef<int> InputMask) {
13659 unsigned EltSizeInBits = Input.getScalarValueSizeInBits();
13660 if (!Subtarget.hasAVX2() && (!Subtarget.hasAVX() || EltSizeInBits < 32 ||
13661 !X86::mayFoldLoad(Input, Subtarget)))
13662 return;
13663 if (isNoopShuffleMask(InputMask))
13664 return;
13665 assert(isBroadcastShuffleMask(InputMask) &&
13666 "Expected to demand only the 0'th element.");
13667 Input = DAG.getNode(X86ISD::VBROADCAST, DL, VT, Input);
13668 for (auto I : enumerate(InputMask)) {
13669 int &InputMaskElt = I.value();
13670 if (InputMaskElt >= 0)
13671 InputMaskElt = I.index();
13672 }
13673 };
13674
13675 // Currently, we may need to produce one shuffle per input, and blend results.
13676 // It is possible that the shuffle for one of the inputs is already a no-op.
13677 // See if we can simplify non-no-op shuffles into broadcasts,
13678 // which we consider to be strictly better than an arbitrary shuffle.
13679 if (isNoopOrBroadcastShuffleMask(V1Mask) &&
13680 isNoopOrBroadcastShuffleMask(V2Mask)) {
13681 canonicalizeBroadcastableInput(V1, V1Mask);
13682 canonicalizeBroadcastableInput(V2, V2Mask);
13683 }
13684
13685 // Try to lower with the simpler initial blend/unpack/rotate strategies unless
13686 // one of the input shuffles would be a no-op. We prefer to shuffle inputs as
13687 // the shuffle may be able to fold with a load or other benefit. However, when
13688 // we'll have to do 2x as many shuffles in order to achieve this, a 2-input
13689 // pre-shuffle first is a better strategy.
13690 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
13691 // Only prefer immediate blends to unpack/rotate.
13692 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
13693 DAG, true))
13694 return BlendPerm;
13695 // If either input vector provides only a single element which is repeated
13696 // multiple times, unpacking from both input vectors would generate worse
13697 // code. e.g. for
13698 // t5: v16i8 = vector_shuffle<16,0,16,1,16,2,16,3,16,4,16,5,16,6,16,7> t2, t4
13699 // it is better to process t4 first to create a vector of t4[0], then unpack
13700 // that vector with t2.
13701 if (!isSingleElementRepeatedMask(V1Mask) &&
13702 !isSingleElementRepeatedMask(V2Mask))
13703 if (SDValue UnpackPerm =
13704 lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask, DAG))
13705 return UnpackPerm;
13706 if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute(
13707 DL, VT, V1, V2, Mask, Subtarget, DAG))
13708 return RotatePerm;
13709 // Unpack/rotate failed - try again with variable blends.
13710 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
13711 DAG))
13712 return BlendPerm;
13713 if (VT.getScalarSizeInBits() >= 32)
13714 if (SDValue PermUnpack = lowerShuffleAsPermuteAndUnpack(
13715 DL, VT, V1, V2, Mask, Subtarget, DAG))
13716 return PermUnpack;
13717 }
13718
13719 // If the final mask is an alternating blend of vXi8/vXi16, convert to an
13720 // UNPCKL(SHUFFLE, SHUFFLE) pattern.
13721 // TODO: It doesn't have to be alternating - but each lane mustn't have more
13722 // than half the elements coming from each source.
13723 if (IsAlternating && VT.getScalarSizeInBits() < 32) {
13724 V1Mask.assign(NumElts, -1);
13725 V2Mask.assign(NumElts, -1);
13726 FinalMask.assign(NumElts, -1);
13727 for (int i = 0; i != NumElts; i += NumEltsPerLane)
13728 for (int j = 0; j != NumEltsPerLane; ++j) {
13729 int M = Mask[i + j];
13730 if (M >= 0 && M < NumElts) {
13731 V1Mask[i + (j / 2)] = M;
13732 FinalMask[i + j] = i + (j / 2);
13733 } else if (M >= NumElts) {
13734 V2Mask[i + (j / 2)] = M - NumElts;
13735 FinalMask[i + j] = i + (j / 2) + NumElts;
13736 }
13737 }
13738 }
13739
13740 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
13741 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
13742 return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask);
13743}
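// Editorial illustration (not part of X86ISelLowering.cpp): the decomposition
// above in standalone form, with hypothetical names. A two-input shuffle is
// split into one permute per input plus a final blend mask that only ever
// picks lane i from shuffled V1 or lane i from shuffled V2.
#include <vector>

struct DecomposedShuffle {
  std::vector<int> V1Mask, V2Mask, FinalMask;
};

static DecomposedShuffle decompose(const std::vector<int> &Mask) {
  int NumElts = (int)Mask.size();
  DecomposedShuffle D{std::vector<int>(NumElts, -1),
                      std::vector<int>(NumElts, -1),
                      std::vector<int>(NumElts, -1)};
  for (int i = 0; i < NumElts; ++i) {
    int M = Mask[i];
    if (M >= 0 && M < NumElts) {
      D.V1Mask[i] = M;    // shuffle V1 so element M lands in lane i
      D.FinalMask[i] = i; // then blend lane i from shuffled V1
    } else if (M >= NumElts) {
      D.V2Mask[i] = M - NumElts;
      D.FinalMask[i] = i + NumElts;
    }
  }
  return D;
}
// For Mask = <0,9,2,11,4,13,6,15> the final mask is a pure alternating blend.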
13744
13745/// Try to lower a vector shuffle as a bit rotation.
13746///
13747/// Look for a repeated rotation pattern in each sub group.
13748/// Returns a ISD::ROTL element rotation amount or -1 if failed.
13749static int matchShuffleAsBitRotate(ArrayRef<int> Mask, int NumSubElts) {
13750 int NumElts = Mask.size();
13751 assert((NumElts % NumSubElts) == 0 && "Illegal shuffle mask");
13752
13753 int RotateAmt = -1;
13754 for (int i = 0; i != NumElts; i += NumSubElts) {
13755 for (int j = 0; j != NumSubElts; ++j) {
13756 int M = Mask[i + j];
13757 if (M < 0)
13758 continue;
13759 if (!isInRange(M, i, i + NumSubElts))
13760 return -1;
13761 int Offset = (NumSubElts - (M - (i + j))) % NumSubElts;
13762 if (0 <= RotateAmt && Offset != RotateAmt)
13763 return -1;
13764 RotateAmt = Offset;
13765 }
13766 }
13767 return RotateAmt;
13768}
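// Editorial illustration (not part of X86ISelLowering.cpp): a worked example
// of the per-group rotation match above, with hypothetical names. For a vXi8
// shuffle whose mask is <3,0,1,2> repeated in every group of NumSubElts = 4,
// the offset computed for each defined element is (4 - (M - (i + j))) % 4 == 1,
// so the match returns 1 and the caller emits a 32-bit ROTL by 1 * 8 = 8 bits.
#include <vector>

static int matchGroupRotate(const std::vector<int> &Mask, int NumSubElts) {
  int RotateAmt = -1;
  for (int i = 0, e = (int)Mask.size(); i != e; i += NumSubElts)
    for (int j = 0; j != NumSubElts; ++j) {
      int M = Mask[i + j];
      if (M < 0)
        continue;
      if (M < i || M >= i + NumSubElts)
        return -1; // element crosses its sub-group
      int Offset = (NumSubElts - (M - (i + j))) % NumSubElts;
      if (RotateAmt >= 0 && Offset != RotateAmt)
        return -1; // inconsistent rotation amount
      RotateAmt = Offset;
    }
  return RotateAmt;
}
// matchGroupRotate({3,0,1,2, 7,4,5,6}, 4) == 1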
13769
13770static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits,
13771 const X86Subtarget &Subtarget,
13772 ArrayRef<int> Mask) {
13773 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
13774 assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
13775
13776 // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
13777 int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
13778 int MaxSubElts = 64 / EltSizeInBits;
13779 for (int NumSubElts = MinSubElts; NumSubElts <= MaxSubElts; NumSubElts *= 2) {
13780 int RotateAmt = matchShuffleAsBitRotate(Mask, NumSubElts);
13781 if (RotateAmt < 0)
13782 continue;
13783
13784 int NumElts = Mask.size();
13785 MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
13786 RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
13787 return RotateAmt * EltSizeInBits;
13788 }
13789
13790 return -1;
13791}
13792
13793/// Lower shuffle using X86ISD::VROTLI rotations.
13794static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1,
13795 ArrayRef<int> Mask,
13796 const X86Subtarget &Subtarget,
13797 SelectionDAG &DAG) {
13798 // Only XOP + AVX512 targets have bit rotation instructions.
13799 // If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this.
13800 bool IsLegal =
13801 (VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512();
13802 if (!IsLegal && Subtarget.hasSSE3())
13803 return SDValue();
13804
13805 MVT RotateVT;
13806 int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(),
13807 Subtarget, Mask);
13808 if (RotateAmt < 0)
13809 return SDValue();
13810
13811 // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
13812 // expanded to OR(SRL,SHL), will be more efficient, but if they can
13813 // widen to vXi16 or more then the existing lowering will be better.
13814 if (!IsLegal) {
13815 if ((RotateAmt % 16) == 0)
13816 return SDValue();
13817 // TODO: Use getTargetVShiftByConstNode.
13818 unsigned ShlAmt = RotateAmt;
13819 unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt;
13820 V1 = DAG.getBitcast(RotateVT, V1);
13821 SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,
13822 DAG.getTargetConstant(ShlAmt, DL, MVT::i8));
13823 SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,
13824 DAG.getTargetConstant(SrlAmt, DL, MVT::i8));
13825 SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
13826 return DAG.getBitcast(VT, Rot);
13827 }
13828
13829 SDValue Rot =
13830 DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
13831 DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
13832 return DAG.getBitcast(VT, Rot);
13833}
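// Editorial illustration (not part of X86ISelLowering.cpp): the pre-XOP/AVX512
// fallback above in scalar form, assuming 0 < RotateAmt < 16 as the lowering
// guarantees. A rotate-left on a RotateVT lane is built as OR(SHL, SRL) with
// complementary shift amounts.
#include <cstdint>

static uint16_t rotl16(uint16_t X, unsigned RotateAmt) {
  unsigned SrlAmt = 16 - RotateAmt;
  return (uint16_t)((X << RotateAmt) | (X >> SrlAmt)); // OR(VSHLI, VSRLI)
}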
13834
13835/// Try to match a vector shuffle as an element rotation.
13836///
13837/// This is used for support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
13838static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2,
13839 ArrayRef<int> Mask) {
13840 int NumElts = Mask.size();
13841
13842 // We need to detect various ways of spelling a rotation:
13843 // [11, 12, 13, 14, 15, 0, 1, 2]
13844 // [-1, 12, 13, 14, -1, -1, 1, -1]
13845 // [-1, -1, -1, -1, -1, -1, 1, 2]
13846 // [ 3, 4, 5, 6, 7, 8, 9, 10]
13847 // [-1, 4, 5, 6, -1, -1, 9, -1]
13848 // [-1, 4, 5, 6, -1, -1, -1, -1]
13849 int Rotation = 0;
13850 SDValue Lo, Hi;
13851 for (int i = 0; i < NumElts; ++i) {
13852 int M = Mask[i];
13853 assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
13854 "Unexpected mask index.");
13855 if (M < 0)
13856 continue;
13857
13858 // Determine where a rotated vector would have started.
13859 int StartIdx = i - (M % NumElts);
13860 if (StartIdx == 0)
13861 // The identity rotation isn't interesting, stop.
13862 return -1;
13863
13864 // If we found the tail of a vector the rotation must be the missing
13865 // front. If we found the head of a vector, it must be how much of the
13866 // head.
13867 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
13868
13869 if (Rotation == 0)
13870 Rotation = CandidateRotation;
13871 else if (Rotation != CandidateRotation)
13872 // The rotations don't match, so we can't match this mask.
13873 return -1;
13874
13875 // Compute which value this mask is pointing at.
13876 SDValue MaskV = M < NumElts ? V1 : V2;
13877
13878 // Compute which of the two target values this index should be assigned
13879 // to. This reflects whether the high elements are remaining or the low
13880 // elements are remaining.
13881 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
13882
13883 // Either set up this value if we've not encountered it before, or check
13884 // that it remains consistent.
13885 if (!TargetV)
13886 TargetV = MaskV;
13887 else if (TargetV != MaskV)
13888 // This may be a rotation, but it pulls from the inputs in some
13889 // unsupported interleaving.
13890 return -1;
13891 }
13892
13893 // Check that we successfully analyzed the mask, and normalize the results.
13894 assert(Rotation != 0 && "Failed to locate a viable rotation!");
13895 assert((Lo || Hi) && "Failed to find a rotated input vector!");
13896 if (!Lo)
13897 Lo = Hi;
13898 else if (!Hi)
13899 Hi = Lo;
13900
13901 V1 = Lo;
13902 V2 = Hi;
13903
13904 return Rotation;
13905}
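
Editor's aside (illustrative sketch, not part of X86ISelLowering.cpp): the standalone program below walks the detection loop above over the v8i16 mask [11, 12, 13, 14, 15, 0, 1, 2] quoted in the comments, assuming a plain int array in place of ArrayRef and SM_SentinelUndef == -1. Every defined element agrees on a rotation of 3; the tail elements (mask indices >= 8) come from V2 (Hi) and the head elements from V1 (Lo).

#include <cassert>

// Standalone sketch of the rotation-detection loop for one sample mask.
// Assumes SM_SentinelUndef == -1, as in the LLVM sources.
int main() {
  const int NumElts = 8;
  const int Mask[NumElts] = {11, 12, 13, 14, 15, 0, 1, 2};

  int Rotation = 0;
  for (int i = 0; i < NumElts; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;
    // Where would a rotated vector have started for this element?
    int StartIdx = i - (M % NumElts);
    int Candidate = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
    if (Rotation == 0)
      Rotation = Candidate;
    assert(Rotation == Candidate && "mask is not a single rotation");
  }

  // Every element agrees: the concatenated inputs are rotated by 3 lanes.
  assert(Rotation == 3);
  return 0;
}
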
13906
13907/// Try to lower a vector shuffle as a byte rotation.
13908///
13909/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
13910/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
13911/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
13912/// try to generically lower a vector shuffle through such a pattern. It
13913/// does not check for the profitability of lowering either as PALIGNR or
13914/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
13915/// This matches shuffle vectors that look like:
13916///
13917/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
13918///
13919/// Essentially it concatenates V1 and V2, shifts right by some number of
13920/// elements, and takes the low elements as the result. Note that while this is
13921/// specified as a *right shift* because x86 is little-endian, it is a *left
13922/// rotate* of the vector lanes.
13923static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
13924 ArrayRef<int> Mask) {
13925 // Don't accept any shuffles with zero elements.
13926 if (isAnyZero(Mask))
13927 return -1;
13928
13929 // PALIGNR works on 128-bit lanes.
13930 SmallVector<int, 16> RepeatedMask;
13931 if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
13932 return -1;
13933
13934 int Rotation = matchShuffleAsElementRotate(V1, V2, RepeatedMask);
13935 if (Rotation <= 0)
13936 return -1;
13937
13938 // PALIGNR rotates bytes, so we need to scale the
13939 // rotation based on how many bytes are in the vector lane.
13940 int NumElts = RepeatedMask.size();
13941 int Scale = 16 / NumElts;
13942 return Rotation * Scale;
13943}
13944
13945static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
13946 SDValue V2, ArrayRef<int> Mask,
13947 const X86Subtarget &Subtarget,
13948 SelectionDAG &DAG) {
13949 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
13950
13951 SDValue Lo = V1, Hi = V2;
13952 int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask);
13953 if (ByteRotation <= 0)
13954 return SDValue();
13955
13956 // Cast the inputs to i8 vector of correct length to match PALIGNR or
13957 // PSLLDQ/PSRLDQ.
13958 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
13959 Lo = DAG.getBitcast(ByteVT, Lo);
13960 Hi = DAG.getBitcast(ByteVT, Hi);
13961
13962 // SSSE3 targets can use the palignr instruction.
13963 if (Subtarget.hasSSSE3()) {
13964 assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
13965 "512-bit PALIGNR requires BWI instructions");
13966 return DAG.getBitcast(
13967 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
13968 DAG.getTargetConstant(ByteRotation, DL, MVT::i8)));
13969 }
13970
13971 assert(VT.is128BitVector() &&
13972 "Rotate-based lowering only supports 128-bit lowering!");
13973 assert(Mask.size() <= 16 &&
13974 "Can shuffle at most 16 bytes in a 128-bit vector!");
13975 assert(ByteVT == MVT::v16i8 &&
13976 "SSE2 rotate lowering only needed for v16i8!");
13977
13978 // Default SSE2 implementation
13979 int LoByteShift = 16 - ByteRotation;
13980 int HiByteShift = ByteRotation;
13981
13982 SDValue LoShift =
13983 DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
13984 DAG.getTargetConstant(LoByteShift, DL, MVT::i8));
13985 SDValue HiShift =
13986 DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
13987 DAG.getTargetConstant(HiByteShift, DL, MVT::i8));
13988 return DAG.getBitcast(VT,
13989 DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
13990}
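
Editor's aside (hedged sketch, not from the LLVM tree; the helpers palignr/pslldq/psrldq below are hypothetical models, not LLVM APIs): modelling the 16-byte registers as plain arrays, shifting Lo up by 16 - R bytes, shifting Hi down by R bytes, and OR-ing the results selects exactly bytes R..R+15 of the concatenation with Hi as the low 16 bytes, which is the same byte selection the PALIGNR path above performs.

#include <array>
#include <cassert>
#include <cstdint>

using Vec = std::array<uint8_t, 16>;

// Byte selection used above: take bytes R..R+15 of the 32-byte concatenation
// with Hi as the low 16 bytes and Lo as the high 16 bytes.
static Vec palignr(const Vec &Lo, const Vec &Hi, int R) {
  Vec Out{};
  for (int j = 0; j < 16; ++j)
    Out[j] = (j + R < 16) ? Hi[j + R] : Lo[j + R - 16];
  return Out;
}

// PSLLDQ moves byte i to i + N (zero filling); PSRLDQ moves byte i to i - N.
static Vec pslldq(const Vec &V, int N) {
  Vec Out{};
  for (int j = N; j < 16; ++j)
    Out[j] = V[j - N];
  return Out;
}
static Vec psrldq(const Vec &V, int N) {
  Vec Out{};
  for (int j = 0; j + N < 16; ++j)
    Out[j] = V[j + N];
  return Out;
}

int main() {
  Vec Lo, Hi;
  for (int i = 0; i < 16; ++i) {
    Lo[i] = uint8_t(0x40 + i); // distinct byte labels for Lo
    Hi[i] = uint8_t(0x80 + i); // distinct byte labels for Hi
  }
  const int ByteRotation = 5;
  Vec Ref = palignr(Lo, Hi, ByteRotation);
  Vec LoShift = pslldq(Lo, 16 - ByteRotation); // SSE2 fallback, step 1
  Vec HiShift = psrldq(Hi, ByteRotation);      // SSE2 fallback, step 2
  for (int j = 0; j < 16; ++j)                 // OR reproduces PALIGNR
    assert(Ref[j] == uint8_t(LoShift[j] | HiShift[j]));
  return 0;
}
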
13991
13992/// Try to lower a vector shuffle as a dword/qword rotation.
13993///
13994/// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
13995/// rotation of the concatenation of two vectors; this routine will
13996/// try to generically lower a vector shuffle through such a pattern.
13997///
13998/// Essentially it concatenates V1 and V2, shifts right by some number of
13999/// elements, and takes the low elements as the result. Note that while this is
14000/// specified as a *right shift* because x86 is little-endian, it is a *left
14001/// rotate* of the vector lanes.
14002static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1,
14003 SDValue V2, ArrayRef<int> Mask,
14004 const X86Subtarget &Subtarget,
14005 SelectionDAG &DAG) {
14006 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
14007 "Only 32-bit and 64-bit elements are supported!");
14008
14009 // 128/256-bit vectors are only supported with VLX.
14010 assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
14011 && "VLX required for 128/256-bit vectors");
14012
14013 SDValue Lo = V1, Hi = V2;
14014 int Rotation = matchShuffleAsElementRotate(Lo, Hi, Mask);
14015 if (Rotation <= 0)
14016 return SDValue();
14017
14018 return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
14019 DAG.getTargetConstant(Rotation, DL, MVT::i8));
14020}
14021
14022/// Try to lower a vector shuffle as a byte shift sequence.
14023static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1,
14024 SDValue V2, ArrayRef<int> Mask,
14025 const APInt &Zeroable,
14026 const X86Subtarget &Subtarget,
14027 SelectionDAG &DAG) {
14028 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
14029 assert(VT.is128BitVector() && "Only 128-bit vectors supported");
14030
14031 // We need a shuffle that has zeros at one/both ends and a sequential
14032 // shuffle from one source within.
14033 unsigned ZeroLo = Zeroable.countr_one();
14034 unsigned ZeroHi = Zeroable.countl_one();
14035 if (!ZeroLo && !ZeroHi)
14036 return SDValue();
14037
14038 unsigned NumElts = Mask.size();
14039 unsigned Len = NumElts - (ZeroLo + ZeroHi);
14040 if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))
14041 return SDValue();
14042
14043 unsigned Scale = VT.getScalarSizeInBits() / 8;
14044 ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);
14045 if (!isUndefOrInRange(StubMask, 0, NumElts) &&
14046 !isUndefOrInRange(StubMask, NumElts, 2 * NumElts))
14047 return SDValue();
14048
14049 SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
14050 Res = DAG.getBitcast(MVT::v16i8, Res);
14051
14052 // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an
14053 // inner sequential set of elements, possibly offset:
14054 // 01234567 --> zzzzzz01 --> 1zzzzzzz
14055 // 01234567 --> 4567zzzz --> zzzzz456
14056 // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
14057 if (ZeroLo == 0) {
14058 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
14059 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
14060 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
14061 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
14062 DAG.getTargetConstant(Scale * ZeroHi, DL, MVT::i8));
14063 } else if (ZeroHi == 0) {
14064 unsigned Shift = Mask[ZeroLo] % NumElts;
14065 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
14066 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
14067 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
14068 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
14069 } else if (!Subtarget.hasSSSE3()) {
14070 // If we don't have PSHUFB then it's worth avoiding an AND constant mask
14071 // by performing 3 byte shifts. Shuffle combining can kick in above that.
14072 // TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
14073 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
14074 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
14075 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
14076 Shift += Mask[ZeroLo] % NumElts;
14077 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
14078 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
14079 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
14080 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
14081 } else
14082 return SDValue();
14083
14084 return DAG.getBitcast(VT, Res);
14085}
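
Editor's aside (assumed example, not from the source): for a v8i16 shuffle of V1 with mask [2, 3, 4, 5, 6, zz, zz, zz], ZeroLo is 0 and ZeroHi is 3, so the routine above takes the ZeroLo == 0 branch with Shift = 1 element, i.e. PSLLDQ by Scale*1 = 2 bytes followed by PSRLDQ by Scale*3 = 6 bytes. The sketch below models the moves on element labels (Scale handled implicitly) and checks the result.

#include <array>
#include <cassert>

// Model a v8i16 register as 8 element labels; -1 stands for a zeroed lane.
using V8 = std::array<int, 8>;

static V8 shiftElemsUp(const V8 &V, int N) {   // PSLLDQ by N elements
  V8 Out; Out.fill(-1);
  for (int i = N; i < 8; ++i)
    Out[i] = V[i - N];
  return Out;
}
static V8 shiftElemsDown(const V8 &V, int N) { // PSRLDQ by N elements
  V8 Out; Out.fill(-1);
  for (int i = 0; i + N < 8; ++i)
    Out[i] = V[i + N];
  return Out;
}

int main() {
  V8 V1 = {0, 1, 2, 3, 4, 5, 6, 7};
  // ZeroLo = 0, ZeroHi = 3, Len = 5, Mask[Len - 1] = 6 -> Shift = 7 - 6 = 1.
  V8 Res = shiftElemsUp(V1, /*Shift=*/1);      // [-1, 0, 1, 2, 3, 4, 5, 6]
  Res = shiftElemsDown(Res, /*ZeroHi=*/3);     // [ 2, 3, 4, 5, 6,-1,-1,-1]
  V8 Expected = {2, 3, 4, 5, 6, -1, -1, -1};
  assert(Res == Expected);
  return 0;
}
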
14086
14087/// Try to lower a vector shuffle as a bit shift (shifts in zeros).
14088///
14089/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
14090/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
14091/// matches elements from one of the input vectors shuffled to the left or
14092/// right with zeroable elements 'shifted in'. It handles both the strictly
14093/// bit-wise element shifts and the byte shift across an entire 128-bit double
14094/// quad word lane.
14095///
14096/// PSLL : (little-endian) left bit shift.
14097/// [ zz, 0, zz, 2 ]
14098/// [ -1, 4, zz, -1 ]
14099/// PSRL : (little-endian) right bit shift.
14100/// [ 1, zz, 3, zz]
14101/// [ -1, -1, 7, zz]
14102/// PSLLDQ : (little-endian) left byte shift
14103/// [ zz, 0, 1, 2, 3, 4, 5, 6]
14104/// [ zz, zz, -1, -1, 2, 3, 4, -1]
14105/// [ zz, zz, zz, zz, zz, zz, -1, 1]
14106/// PSRLDQ : (little-endian) right byte shift
14107/// [ 5, 6, 7, zz, zz, zz, zz, zz]
14108/// [ -1, 5, 6, 7, zz, zz, zz, zz]
14109/// [ 1, 2, -1, -1, -1, -1, zz, zz]
14110static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
14111 unsigned ScalarSizeInBits, ArrayRef<int> Mask,
14112 int MaskOffset, const APInt &Zeroable,
14113 const X86Subtarget &Subtarget) {
14114 int Size = Mask.size();
14115 unsigned SizeInBits = Size * ScalarSizeInBits;
14116
14117 auto CheckZeros = [&](int Shift, int Scale, bool Left) {
14118 for (int i = 0; i < Size; i += Scale)
14119 for (int j = 0; j < Shift; ++j)
14120 if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
14121 return false;
14122
14123 return true;
14124 };
14125
14126 auto MatchShift = [&](int Shift, int Scale, bool Left) {
14127 for (int i = 0; i != Size; i += Scale) {
14128 unsigned Pos = Left ? i + Shift : i;
14129 unsigned Low = Left ? i : i + Shift;
14130 unsigned Len = Scale - Shift;
14131 if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
14132 return -1;
14133 }
14134
14135 int ShiftEltBits = ScalarSizeInBits * Scale;
14136 bool ByteShift = ShiftEltBits > 64;
14137 Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
14138 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
14139 int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
14140
14141 // Normalize the scale for byte shifts to still produce an i64 element
14142 // type.
14143 Scale = ByteShift ? Scale / 2 : Scale;
14144
14145 // We need to round trip through the appropriate type for the shift.
14146 MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
14147 ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
14148 : MVT::getVectorVT(ShiftSVT, Size / Scale);
14149 return (int)ShiftAmt;
14150 };
14151
14152 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
14153 // keep doubling the size of the integer elements up to that. We can
14154 // then shift the elements of the integer vector by whole multiples of
14155 // their width within the elements of the larger integer vector. Test each
14156 // multiple to see if we can find a match with the moved element indices
14157 // and that the shifted in elements are all zeroable.
14158 unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
14159 for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
14160 for (int Shift = 1; Shift != Scale; ++Shift)
14161 for (bool Left : {true, false})
14162 if (CheckZeros(Shift, Scale, Left)) {
14163 int ShiftAmt = MatchShift(Shift, Scale, Left);
14164 if (0 < ShiftAmt)
14165 return ShiftAmt;
14166 }
14167
14168 // no match
14169 return -1;
14170}
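
Editor's aside (my example, not part of the file): for a v4i32 shuffle with mask [zz, 0, zz, 2] the matcher above finds Scale = 2, Shift = 1, Left = true, so the opcode is X86ISD::VSHLI on v2i64 with ShiftAmt = 32. Modelling each 64-bit lane as two little-endian 32-bit halves confirms that shifting a lane left by 32 bits yields exactly [0, elt0, 0, elt2].

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t Elts[4] = {0x11111111u, 0x22222222u, 0x33333333u, 0x44444444u};

  uint32_t Out[4];
  for (int Lane = 0; Lane < 2; ++Lane) {
    // Pack two i32 elements into one little-endian i64 lane.
    uint64_t L = (uint64_t(Elts[2 * Lane + 1]) << 32) | Elts[2 * Lane];
    L <<= 32;                               // VSHLI v2i64 by 32 bits
    Out[2 * Lane]     = uint32_t(L);        // low half  -> shifted-in zeros
    Out[2 * Lane + 1] = uint32_t(L >> 32);  // high half -> old low element
  }

  // Matches the mask [zz, 0, zz, 2]: zeros in lanes 0/2, elements 0/2 moved up.
  assert(Out[0] == 0 && Out[1] == Elts[0]);
  assert(Out[2] == 0 && Out[3] == Elts[2]);
  return 0;
}
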
14171
14172static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
14173 SDValue V2, ArrayRef<int> Mask,
14174 const APInt &Zeroable,
14175 const X86Subtarget &Subtarget,
14176 SelectionDAG &DAG, bool BitwiseOnly) {
14177 int Size = Mask.size();
14178 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
14179
14180 MVT ShiftVT;
14181 SDValue V = V1;
14182 unsigned Opcode;
14183
14184 // Try to match shuffle against V1 shift.
14185 int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
14186 Mask, 0, Zeroable, Subtarget);
14187
14188 // If V1 failed, try to match shuffle against V2 shift.
14189 if (ShiftAmt < 0) {
14190 ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
14191 Mask, Size, Zeroable, Subtarget);
14192 V = V2;
14193 }
14194
14195 if (ShiftAmt < 0)
14196 return SDValue();
14197
14198 if (BitwiseOnly && (Opcode == X86ISD::VSHLDQ || Opcode == X86ISD::VSRLDQ))
14199 return SDValue();
14200
14201 assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
14202 "Illegal integer vector type");
14203 V = DAG.getBitcast(ShiftVT, V);
14204 V = DAG.getNode(Opcode, DL, ShiftVT, V,
14205 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
14206 return DAG.getBitcast(VT, V);
14207}
14208
14209// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
14210// Remainder of lower half result is zero and upper half is all undef.
14211static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
14212 ArrayRef<int> Mask, uint64_t &BitLen,
14213 uint64_t &BitIdx, const APInt &Zeroable) {
14214 int Size = Mask.size();
14215 int HalfSize = Size / 2;
14216 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
14217 assert(!Zeroable.isAllOnes() && "Fully zeroable shuffle mask");
14218
14219 // Upper half must be undefined.
14220 if (!isUndefUpperHalf(Mask))
14221 return false;
14222
14223 // Determine the extraction length from the part of the
14224 // lower half that isn't zeroable.
14225 int Len = HalfSize;
14226 for (; Len > 0; --Len)
14227 if (!Zeroable[Len - 1])
14228 break;
14229 assert(Len > 0 && "Zeroable shuffle mask");
14230
14231 // Attempt to match first Len sequential elements from the lower half.
14232 SDValue Src;
14233 int Idx = -1;
14234 for (int i = 0; i != Len; ++i) {
14235 int M = Mask[i];
14236 if (M == SM_SentinelUndef)
14237 continue;
14238 SDValue &V = (M < Size ? V1 : V2);
14239 M = M % Size;
14240
14241 // The extracted elements must start at a valid index and all mask
14242 // elements must be in the lower half.
14243 if (i > M || M >= HalfSize)
14244 return false;
14245
14246 if (Idx < 0 || (Src == V && Idx == (M - i))) {
14247 Src = V;
14248 Idx = M - i;
14249 continue;
14250 }
14251 return false;
14252 }
14253
14254 if (!Src || Idx < 0)
14255 return false;
14256
14257 assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
14258 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
14259 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
14260 V1 = Src;
14261 return true;
14262}
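
Editor's aside (hedged worked example; the mask and Zeroable pattern are assumptions of mine): for a v8i16 shuffle with mask [2, 3, zz, zz, u, u, u, u] the upper half is undef, Len shrinks to 2 because lane 1 is not zeroable, Idx resolves to 2 from V1, and the routine reports BitLen = BitIdx = 32, i.e. EXTRQ extracts 32 bits starting at bit 32.

#include <cassert>

int main() {
  const int HalfSize = 4, EltBits = 16;
  // -1 = undef; indices 8..15 would refer to V2 (unused in this example).
  const int Mask[8] = {2, 3, -1, -1, -1, -1, -1, -1};
  // Assumed zeroable lanes: the two explicit zeros plus the undef upper half.
  const bool Zeroable[8] = {false, false, true, true, true, true, true, true};

  // Extraction length: longest non-zeroable prefix of the lower half.
  int Len = HalfSize;
  while (Len > 0 && Zeroable[Len - 1])
    --Len;
  assert(Len == 2);

  // The first Len lanes must be sequential elements of one source's lower half.
  int Idx = -1;
  for (int i = 0; i != Len; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;
    assert(i <= M && M < HalfSize && "not an EXTRQ pattern");
    if (Idx < 0)
      Idx = M - i;
    assert(Idx == M - i && "not sequential");
  }
  assert(Idx == 2);

  unsigned BitLen = unsigned(Len * EltBits) & 0x3f;
  unsigned BitIdx = unsigned(Idx * EltBits) & 0x3f;
  assert(BitLen == 32 && BitIdx == 32);
  return 0;
}
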
14263
14264// INSERTQ: Extract lowest Len elements from lower half of second source and
14265// insert over first source, starting at Idx.
14266// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
14267static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
14268 ArrayRef<int> Mask, uint64_t &BitLen,
14269 uint64_t &BitIdx) {
14270 int Size = Mask.size();
14271 int HalfSize = Size / 2;
14272 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
14273
14274 // Upper half must be undefined.
14275 if (!isUndefUpperHalf(Mask))
14276 return false;
14277
14278 for (int Idx = 0; Idx != HalfSize; ++Idx) {
14279 SDValue Base;
14280
14281 // Attempt to match first source from mask before insertion point.
14282 if (isUndefInRange(Mask, 0, Idx)) {
14283 /* EMPTY */
14284 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
14285 Base = V1;
14286 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
14287 Base = V2;
14288 } else {
14289 continue;
14290 }
14291
14292 // Extend the extraction length looking to match both the insertion of
14293 // the second source and the remaining elements of the first.
14294 for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
14295 SDValue Insert;
14296 int Len = Hi - Idx;
14297
14298 // Match insertion.
14299 if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
14300 Insert = V1;
14301 } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
14302 Insert = V2;
14303 } else {
14304 continue;
14305 }
14306
14307 // Match the remaining elements of the lower half.
14308 if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
14309 /* EMPTY */
14310 } else if ((!Base || (Base == V1)) &&
14311 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
14312 Base = V1;
14313 } else if ((!Base || (Base == V2)) &&
14314 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
14315 Size + Hi)) {
14316 Base = V2;
14317 } else {
14318 continue;
14319 }
14320
14321 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
14322 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
14323 V1 = Base;
14324 V2 = Insert;
14325 return true;
14326 }
14327 }
14328
14329 return false;
14330}
14331
14332/// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
14333static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
14334 SDValue V2, ArrayRef<int> Mask,
14335 const APInt &Zeroable, SelectionDAG &DAG) {
14336 uint64_t BitLen, BitIdx;
14337 if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
14338 return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
14339 DAG.getTargetConstant(BitLen, DL, MVT::i8),
14340 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
14341
14342 if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
14343 return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
14344 V2 ? V2 : DAG.getUNDEF(VT),
14345 DAG.getTargetConstant(BitLen, DL, MVT::i8),
14346 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
14347
14348 return SDValue();
14349}
14350
14351/// Lower a vector shuffle as a zero or any extension.
14352///
14353/// Given a specific number of elements, element bit width, and extension
14354/// stride, produce either a zero or any extension based on the available
14355/// features of the subtarget. The extended elements are consecutive and
14356/// can start from an offset element index in the input; to avoid excess
14357/// shuffling, the offset must either be in the bottom lane or at the start
14358/// of a higher lane. All extended elements must be from
14359/// the same lane.
14360static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(
14361 const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
14362 ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
14363 assert(Scale > 1 && "Need a scale to extend.");
14364 int EltBits = VT.getScalarSizeInBits();
14365 int NumElements = VT.getVectorNumElements();
14366 int NumEltsPerLane = 128 / EltBits;
14367 int OffsetLane = Offset / NumEltsPerLane;
14368 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
14369 "Only 8, 16, and 32 bit elements can be extended.");
14370 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
14371 assert(0 <= Offset && "Extension offset must be positive.");
14372 assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
14373 "Extension offset must be in the first lane or start an upper lane.");
14374
14375 // Check that an index is in same lane as the base offset.
14376 auto SafeOffset = [&](int Idx) {
14377 return OffsetLane == (Idx / NumEltsPerLane);
14378 };
14379
14380 // Shift along an input so that the offset base moves to the first element.
14381 auto ShuffleOffset = [&](SDValue V) {
14382 if (!Offset)
14383 return V;
14384
14385 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
14386 for (int i = 0; i * Scale < NumElements; ++i) {
14387 int SrcIdx = i + Offset;
14388 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
14389 }
14390 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
14391 };
14392
14393 // Found a valid a/zext mask! Try various lowering strategies based on the
14394 // input type and available ISA extensions.
14395 if (Subtarget.hasSSE41()) {
14396 // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
14397 // PUNPCK will catch this in a later shuffle match.
14398 if (Offset && Scale == 2 && VT.is128BitVector())
14399 return SDValue();
14400 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
14401 NumElements / Scale);
14402 InputV = DAG.getBitcast(VT, InputV);
14403 InputV = ShuffleOffset(InputV);
14404 InputV = getEXTEND_VECTOR_INREG(AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND,
14405 DL, ExtVT, InputV, DAG);
14406 return DAG.getBitcast(VT, InputV);
14407 }
14408
14409 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
14410 InputV = DAG.getBitcast(VT, InputV);
14411
14412 // For any extends we can cheat for larger element sizes and use shuffle
14413 // instructions that can fold with a load and/or copy.
14414 if (AnyExt && EltBits == 32) {
14415 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
14416 -1};
14417 return DAG.getBitcast(
14418 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
14419 DAG.getBitcast(MVT::v4i32, InputV),
14420 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
14421 }
14422 if (AnyExt && EltBits == 16 && Scale > 2) {
14423 int PSHUFDMask[4] = {Offset / 2, -1,
14424 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
14425 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
14426 DAG.getBitcast(MVT::v4i32, InputV),
14427 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
14428 int PSHUFWMask[4] = {1, -1, -1, -1};
14429 unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
14430 return DAG.getBitcast(
14431 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
14432 DAG.getBitcast(MVT::v8i16, InputV),
14433 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
14434 }
14435
14436 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
14437 // to 64-bits.
14438 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
14439 assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
14440 assert(VT.is128BitVector() && "Unexpected vector width!");
14441
14442 int LoIdx = Offset * EltBits;
14443 SDValue Lo = DAG.getBitcast(
14444 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
14445 DAG.getTargetConstant(EltBits, DL, MVT::i8),
14446 DAG.getTargetConstant(LoIdx, DL, MVT::i8)));
14447
14448 if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1))
14449 return DAG.getBitcast(VT, Lo);
14450
14451 int HiIdx = (Offset + 1) * EltBits;
14452 SDValue Hi = DAG.getBitcast(
14453 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
14454 DAG.getTargetConstant(EltBits, DL, MVT::i8),
14455 DAG.getTargetConstant(HiIdx, DL, MVT::i8)));
14456 return DAG.getBitcast(VT,
14457 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
14458 }
14459
14460 // If this would require more than 2 unpack instructions to expand, use
14461 // pshufb when available. We can only use more than 2 unpack instructions
14462 // when zero extending i8 elements which also makes it easier to use pshufb.
14463 if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
14464 assert(NumElements == 16 && "Unexpected byte vector width!");
14465 SDValue PSHUFBMask[16];
14466 for (int i = 0; i < 16; ++i) {
14467 int Idx = Offset + (i / Scale);
14468 if ((i % Scale == 0 && SafeOffset(Idx))) {
14469 PSHUFBMask[i] = DAG.getConstant(Idx, DL, MVT::i8);
14470 continue;
14471 }
14472 PSHUFBMask[i] =
14473 AnyExt ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(0x80, DL, MVT::i8);
14474 }
14475 InputV = DAG.getBitcast(MVT::v16i8, InputV);
14476 return DAG.getBitcast(
14477 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
14478 DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
14479 }
14480
14481 // If we are extending from an offset, ensure we start on a boundary that
14482 // we can unpack from.
14483 int AlignToUnpack = Offset % (NumElements / Scale);
14484 if (AlignToUnpack) {
14485 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
14486 for (int i = AlignToUnpack; i < NumElements; ++i)
14487 ShMask[i - AlignToUnpack] = i;
14488 InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
14489 Offset -= AlignToUnpack;
14490 }
14491
14492 // Otherwise emit a sequence of unpacks.
14493 do {
14494 unsigned UnpackLoHi = X86ISD::UNPCKL;
14495 if (Offset >= (NumElements / 2)) {
14496 UnpackLoHi = X86ISD::UNPCKH;
14497 Offset -= (NumElements / 2);
14498 }
14499
14500 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
14501 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
14502 : getZeroVector(InputVT, Subtarget, DAG, DL);
14503 InputV = DAG.getBitcast(InputVT, InputV);
14504 InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
14505 Scale /= 2;
14506 EltBits *= 2;
14507 NumElements /= 2;
14508 } while (Scale > 1);
14509 return DAG.getBitcast(VT, InputV);
14510}
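
Editor's aside (illustrative sketch of the final unpack loop, for the simplest case Scale == 2; not part of the source): zero-extending the low eight i8 elements of a v16i8 to v8i16 is a single PUNPCKLBW with a zero register, because interleaving value bytes with zero bytes is exactly a little-endian 8-to-16-bit zero extension. The model below checks that.

#include <array>
#include <cassert>
#include <cstdint>

int main() {
  std::array<uint8_t, 16> V{};
  for (int i = 0; i < 16; ++i)
    V[i] = uint8_t(0xA0 + i);
  std::array<uint8_t, 16> Zero{};   // models the getZeroVector operand

  // PUNPCKLBW: interleave the low 8 bytes of the two sources.
  std::array<uint8_t, 16> Unpck{};
  for (int i = 0; i < 8; ++i) {
    Unpck[2 * i] = V[i];
    Unpck[2 * i + 1] = Zero[i];
  }

  // Reinterpreted as v8i16 (little-endian), each lane is the zero-extended byte.
  for (int i = 0; i < 8; ++i) {
    uint16_t Lane = uint16_t(Unpck[2 * i]) | uint16_t(Unpck[2 * i + 1] << 8);
    assert(Lane == uint16_t(V[i]));
  }
  return 0;
}
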
14511
14512/// Try to lower a vector shuffle as a zero extension on any microarch.
14513///
14514/// This routine will try to do everything in its power to cleverly lower
14515/// a shuffle which happens to match the pattern of a zero extend. It doesn't
14516/// check for the profitability of this lowering, it tries to aggressively
14517/// match this pattern. It will use all of the micro-architectural details it
14518/// can to emit an efficient lowering. It handles both blends with all-zero
14519/// inputs (to explicitly zero-extend) and undef lanes (sometimes undef due to
14520/// masking out later).
14521///
14522/// The reason we have dedicated lowering for zext-style shuffles is that they
14523/// are both incredibly common and often quite performance sensitive.
14524static SDValue lowerShuffleAsZeroOrAnyExtend(
14525 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14526 const APInt &Zeroable, const X86Subtarget &Subtarget,
14527 SelectionDAG &DAG) {
14528 int Bits = VT.getSizeInBits();
14529 int NumLanes = Bits / 128;
14530 int NumElements = VT.getVectorNumElements();
14531 int NumEltsPerLane = NumElements / NumLanes;
14532 assert(VT.getScalarSizeInBits() <= 32 &&
14533 "Exceeds 32-bit integer zero extension limit");
14534 assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
14535
14536 // Define a helper function to check a particular ext-scale and lower to it if
14537 // valid.
14538 auto Lower = [&](int Scale) -> SDValue {
14539 SDValue InputV;
14540 bool AnyExt = true;
14541 int Offset = 0;
14542 int Matches = 0;
14543 for (int i = 0; i < NumElements; ++i) {
14544 int M = Mask[i];
14545 if (M < 0)
14546 continue; // Valid anywhere but doesn't tell us anything.
14547 if (i % Scale != 0) {
14548 // Each of the extended elements need to be zeroable.
14549 if (!Zeroable[i])
14550 return SDValue();
14551
14552 // We no longer are in the anyext case.
14553 AnyExt = false;
14554 continue;
14555 }
14556
14557 // Each of the base elements needs to be consecutive indices into the
14558 // same input vector.
14559 SDValue V = M < NumElements ? V1 : V2;
14560 M = M % NumElements;
14561 if (!InputV) {
14562 InputV = V;
14563 Offset = M - (i / Scale);
14564 } else if (InputV != V)
14565 return SDValue(); // Flip-flopping inputs.
14566
14567 // Offset must start in the lowest 128-bit lane or at the start of an
14568 // upper lane.
14569 // FIXME: Is it ever worth allowing a negative base offset?
14570 if (!((0 <= Offset && Offset < NumEltsPerLane) ||
14571 (Offset % NumEltsPerLane) == 0))
14572 return SDValue();
14573
14574 // If we are offsetting, all referenced entries must come from the same
14575 // lane.
14576 if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
14577 return SDValue();
14578
14579 if ((M % NumElements) != (Offset + (i / Scale)))
14580 return SDValue(); // Non-consecutive strided elements.
14581 Matches++;
14582 }
14583
14584 // If we fail to find an input, we have a zero-shuffle which should always
14585 // have already been handled.
14586 // FIXME: Maybe handle this here in case during blending we end up with one?
14587 if (!InputV)
14588 return SDValue();
14589
14590 // If we are offsetting, don't extend if we only match a single input, we
14591 // can always do better by using a basic PSHUF or PUNPCK.
14592 if (Offset != 0 && Matches < 2)
14593 return SDValue();
14594
14595 return lowerShuffleAsSpecificZeroOrAnyExtend(DL, VT, Scale, Offset, AnyExt,
14596 InputV, Mask, Subtarget, DAG);
14597 };
14598
14599 // The widest scale possible for extending is to a 64-bit integer.
14600 assert(Bits % 64 == 0 &&
14601 "The number of bits in a vector must be divisible by 64 on x86!");
14602 int NumExtElements = Bits / 64;
14603
14604 // Each iteration, try extending the elements half as much, but into twice as
14605 // many elements.
14606 for (; NumExtElements < NumElements; NumExtElements *= 2) {
14607 assert(NumElements % NumExtElements == 0 &&
14608 "The input vector size must be divisible by the extended size.");
14609 if (SDValue V = Lower(NumElements / NumExtElements))
14610 return V;
14611 }
14612
14613 // General extends failed, but 128-bit vectors may be able to use MOVQ.
14614 if (Bits != 128)
14615 return SDValue();
14616
14617 // Returns one of the source operands if the shuffle can be reduced to a
14618 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
14619 auto CanZExtLowHalf = [&]() {
14620 for (int i = NumElements / 2; i != NumElements; ++i)
14621 if (!Zeroable[i])
14622 return SDValue();
14623 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
14624 return V1;
14625 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
14626 return V2;
14627 return SDValue();
14628 };
14629
14630 if (SDValue V = CanZExtLowHalf()) {
14631 V = DAG.getBitcast(MVT::v2i64, V);
14632 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
14633 return DAG.getBitcast(VT, V);
14634 }
14635
14636 // No viable ext lowering found.
14637 return SDValue();
14638}
14639
14640/// Try to get a scalar value for a specific element of a vector.
14641///
14642/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
14643static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
14644 SelectionDAG &DAG) {
14645 MVT VT = V.getSimpleValueType();
14646 MVT EltVT = VT.getVectorElementType();
14647 V = peekThroughBitcasts(V);
14648
14649 // If the bitcasts shift the element size, we can't extract an equivalent
14650 // element from it.
14651 MVT NewVT = V.getSimpleValueType();
14652 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
14653 return SDValue();
14654
14655 if (V.getOpcode() == ISD::BUILD_VECTOR ||
14656 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
14657 // Ensure the scalar operand is the same size as the destination.
14658 // FIXME: Add support for scalar truncation where possible.
14659 SDValue S = V.getOperand(Idx);
14660 if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
14661 return DAG.getBitcast(EltVT, S);
14662 }
14663
14664 return SDValue();
14665}
14666
14667/// Helper to test for a load that can be folded with x86 shuffles.
14668///
14669/// This is particularly important because the set of instructions varies
14670/// significantly based on whether the operand is a load or not.
14671static bool isShuffleFoldableLoad(SDValue V) {
14672 return V->hasOneUse() &&
14673 ISD::isNON_EXTLoad(peekThroughOneUseBitcasts(V).getNode());
14674}
14675
14676template<typename T>
14677static bool isSoftFP16(T VT, const X86Subtarget &Subtarget) {
14678 return VT.getScalarType() == MVT::f16 && !Subtarget.hasFP16();
14679}
14680
14681template<typename T>
14682bool X86TargetLowering::isSoftFP16(T VT) const {
14683 return ::isSoftFP16(VT, Subtarget);
14684}
14685
14686/// Try to lower insertion of a single element into a zero vector.
14687///
14688/// This is a common pattern that we have especially efficient patterns to lower
14689/// across all subtarget feature sets.
14690static SDValue lowerShuffleAsElementInsertion(
14691 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14692 const APInt &Zeroable, const X86Subtarget &Subtarget,
14693 SelectionDAG &DAG) {
14694 MVT ExtVT = VT;
14695 MVT EltVT = VT.getVectorElementType();
14696 unsigned NumElts = VT.getVectorNumElements();
14697 unsigned EltBits = VT.getScalarSizeInBits();
14698
14699 if (isSoftFP16(EltVT, Subtarget))
14700 return SDValue();
14701
14702 int V2Index =
14703 find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
14704 Mask.begin();
14705 bool IsV1Constant = getTargetConstantFromNode(V1) != nullptr;
14706 bool IsV1Zeroable = true;
14707 for (int i = 0, Size = Mask.size(); i < Size; ++i)
14708 if (i != V2Index && !Zeroable[i]) {
14709 IsV1Zeroable = false;
14710 break;
14711 }
14712
14713 // Bail if a non-zero V1 isn't used in place.
14714 if (!IsV1Zeroable) {
14715 SmallVector<int, 8> V1Mask(Mask);
14716 V1Mask[V2Index] = -1;
14717 if (!isNoopShuffleMask(V1Mask))
14718 return SDValue();
14719 }
14720
14721 // Check for a single input from a SCALAR_TO_VECTOR node.
14722 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
14723 // all the smarts here sunk into that routine. However, the current
14724 // lowering of BUILD_VECTOR makes that nearly impossible until the old
14725 // vector shuffle lowering is dead.
14726 SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
14727 DAG);
14728 if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
14729 // We need to zext the scalar if it is smaller than an i32.
14730 V2S = DAG.getBitcast(EltVT, V2S);
14731 if (EltVT == MVT::i8 || (EltVT == MVT::i16 && !Subtarget.hasFP16())) {
14732 // Using zext to expand a narrow element won't work for non-zero
14733 // insertions. But we can use a masked constant vector if we're
14734 // inserting V2 into the bottom of V1.
14735 if (!IsV1Zeroable && !(IsV1Constant && V2Index == 0))
14736 return SDValue();
14737
14738 // Zero-extend directly to i32.
14739 ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
14740 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
14741
14742 // If we're inserting into a constant, mask off the inserted index
14743 // and OR with the zero-extended scalar.
14744 if (!IsV1Zeroable) {
14745 SmallVector<APInt> Bits(NumElts, APInt::getAllOnes(EltBits));
14746 Bits[V2Index] = APInt::getZero(EltBits);
14747 SDValue BitMask = getConstVector(Bits, VT, DAG, DL);
14748 V1 = DAG.getNode(ISD::AND, DL, VT, V1, BitMask);
14749 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
14750 V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2));
14751 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
14752 }
14753 }
14754 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
14755 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
14756 EltVT == MVT::i16) {
14757 // Either not inserting from the low element of the input or the input
14758 // element size is too small to use VZEXT_MOVL to clear the high bits.
14759 return SDValue();
14760 }
14761
14762 if (!IsV1Zeroable) {
14763 // If V1 can't be treated as a zero vector we have fewer options to lower
14764 // this. We can't support integer vectors or non-zero targets cheaply.
14765 assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
14766 if (!VT.isFloatingPoint() || V2Index != 0)
14767 return SDValue();
14768 if (!VT.is128BitVector())
14769 return SDValue();
14770
14771 // Otherwise, use MOVSD, MOVSS or MOVSH.
14772 unsigned MovOpc = 0;
14773 if (EltVT == MVT::f16)
14774 MovOpc = X86ISD::MOVSH;
14775 else if (EltVT == MVT::f32)
14776 MovOpc = X86ISD::MOVSS;
14777 else if (EltVT == MVT::f64)
14778 MovOpc = X86ISD::MOVSD;
14779 else
14780 llvm_unreachable("Unsupported floating point element type to handle!");
14781 return DAG.getNode(MovOpc, DL, ExtVT, V1, V2);
14782 }
14783
14784 // This lowering only works for the low element with floating point vectors.
14785 if (VT.isFloatingPoint() && V2Index != 0)
14786 return SDValue();
14787
14788 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
14789 if (ExtVT != VT)
14790 V2 = DAG.getBitcast(VT, V2);
14791
14792 if (V2Index != 0) {
14793 // If we have 4 or fewer lanes we can cheaply shuffle the element into
14794 // the desired position. Otherwise it is more efficient to do a vector
14795 // shift left. We know that we can do a vector shift left because all
14796 // the inputs are zero.
14797 if (VT.isFloatingPoint() || NumElts <= 4) {
14798 SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
14799 V2Shuffle[V2Index] = 0;
14800 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
14801 } else {
14802 V2 = DAG.getBitcast(MVT::v16i8, V2);
14803 V2 = DAG.getNode(
14804 X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
14805 DAG.getTargetConstant(V2Index * EltBits / 8, DL, MVT::i8));
14806 V2 = DAG.getBitcast(VT, V2);
14807 }
14808 }
14809 return V2;
14810}
14811
14812/// Try to lower broadcast of a single - truncated - integer element,
14813/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
14814///
14815/// This assumes we have AVX2.
14816static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0,
14817 int BroadcastIdx,
14818 const X86Subtarget &Subtarget,
14819 SelectionDAG &DAG) {
14820 assert(Subtarget.hasAVX2() &&
14821 "We can only lower integer broadcasts with AVX2!");
14822
14823 MVT EltVT = VT.getVectorElementType();
14824 MVT V0VT = V0.getSimpleValueType();
14825
14826 assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
14827 assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
14828
14829 MVT V0EltVT = V0VT.getVectorElementType();
14830 if (!V0EltVT.isInteger())
14831 return SDValue();
14832
14833 const unsigned EltSize = EltVT.getSizeInBits();
14834 const unsigned V0EltSize = V0EltVT.getSizeInBits();
14835
14836 // This is only a truncation if the original element type is larger.
14837 if (V0EltSize <= EltSize)
14838 return SDValue();
14839
14840 assert(((V0EltSize % EltSize) == 0) &&
14841 "Scalar type sizes must all be powers of 2 on x86!");
14842
14843 const unsigned V0Opc = V0.getOpcode();
14844 const unsigned Scale = V0EltSize / EltSize;
14845 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
14846
14847 if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
14848 V0Opc != ISD::BUILD_VECTOR)
14849 return SDValue();
14850
14851 SDValue Scalar = V0.getOperand(V0BroadcastIdx);
14852
14853 // If we're extracting non-least-significant bits, shift so we can truncate.
14854 // Hopefully, we can fold away the trunc/srl/load into the broadcast.
14855 // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
14856 // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
14857 if (const int OffsetIdx = BroadcastIdx % Scale)
14858 Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
14859 DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8));
14860
14861 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
14862 DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
14863}
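
Editor's aside (minimal sketch with hypothetical values, not from the source): broadcasting byte element 2 of a vector whose source node V0 is built from i32 scalars gives Scale = 4, V0BroadcastIdx = 0 and OffsetIdx = 2, so the scalar is shifted right by OffsetIdx * EltSize = 16 bits before the truncate and VBROADCAST. Plain integer arithmetic shows the shift really exposes byte 2.

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t Scalar = 0xDDCCBBAAu;  // little-endian bytes: AA BB CC DD
  const unsigned EltSize = 8, V0EltSize = 32;
  const unsigned BroadcastIdx = 2;      // want byte 2 of the i32, i.e. 0xCC

  const unsigned Scale = V0EltSize / EltSize;           // 4
  const unsigned V0BroadcastIdx = BroadcastIdx / Scale; // 0 -> the scalar itself
  const unsigned OffsetIdx = BroadcastIdx % Scale;      // 2 -> needs a shift
  assert(V0BroadcastIdx == 0 && OffsetIdx == 2);

  // SRL by OffsetIdx * EltSize, then truncate to i8: the value VBROADCAST splats.
  uint8_t Splat = uint8_t(Scalar >> (OffsetIdx * EltSize));
  assert(Splat == 0xCC);
  return 0;
}
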
14864
14865/// Test whether this can be lowered with a single SHUFPS instruction.
14866///
14867/// This is used to disable more specialized lowerings when the shufps lowering
14868/// will happen to be efficient.
14869static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
14870 // This routine only handles 128-bit shufps.
14871 assert(Mask.size() == 4 && "Unsupported mask size!");
14872 assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
14873 assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
14874 assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
14875 assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
14876
14877 // To lower with a single SHUFPS we need to have the low half and high half
14878 // each requiring a single input.
14879 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
14880 return false;
14881 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
14882 return false;
14883
14884 return true;
14885}
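
Editor's aside (standalone check of the rule above; the helper name is mine): a mask whose low half reads from only one input and whose high half reads from only one input is accepted, while a mask that mixes inputs within a half is rejected.

#include <cassert>
#include <vector>

// Mirrors the test above: each half may use at most one of the two inputs.
static bool singleSHUFPS(const std::vector<int> &Mask) {
  if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
    return false;
  if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
    return false;
  return true;
}

int main() {
  assert(singleSHUFPS({0, 1, 4, 5}));   // low half from V1, high half from V2
  assert(!singleSHUFPS({0, 4, 1, 5}));  // low half mixes V1 and V2
  return 0;
}
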
14886
14887/// Test whether the specified input (0 or 1) is in-place blended by the
14888/// given mask.
14889///
14890/// This returns true if the elements from a particular input are already in the
14891/// slot required by the given mask and require no permutation.
14892static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
14893 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
14894 int Size = Mask.size();
14895 for (int i = 0; i < Size; ++i)
14896 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
14897 return false;
14898
14899 return true;
14900}
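// ---- Illustrative sketch (editor's addition, not part of the LLVM source) --
// The same "in place" test as isShuffleMaskInputInPlace, specialized to a
// 4-element mask: input 0 owns indices 0..3 and input 1 owns 4..7, and an
// input is in place when every lane it supplies already sits at its natural
// position (undef lanes, -1, are ignored). Names are invented for the example.
constexpr bool inputInPlace4(int Input, const int (&Mask)[4]) {
  for (int i = 0; i < 4; ++i)
    if (Mask[i] >= 0 && Mask[i] / 4 == Input && Mask[i] % 4 != i)
      return false;
  return true;
}

constexpr int kBlendMask[4] = {0, 5, 2, 7}; // pure blend: nothing moves
constexpr int kSwapMask[4] = {1, 0, 2, 3};  // V1 lanes 0 and 1 are swapped
static_assert(inputInPlace4(0, kBlendMask) && inputInPlace4(1, kBlendMask), "");
static_assert(!inputInPlace4(0, kSwapMask), "");
// ----------------------------------------------------------------------------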
14901
14902/// If we are extracting two 128-bit halves of a vector and shuffling the
14903/// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
14904/// multi-shuffle lowering.
14905static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
14906 SDValue N1, ArrayRef<int> Mask,
14907 SelectionDAG &DAG) {
14908 MVT VT = N0.getSimpleValueType();
14909 assert((VT.is128BitVector() &&
14910 (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
14911 "VPERM* family of shuffles requires 32-bit or 64-bit elements");
14912
14913 // Check that both sources are extracts of the same source vector.
14914 if (N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
14915 N1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
14916 N0.getOperand(0) != N1.getOperand(0) ||
14917 !N0.hasOneUse() || !N1.hasOneUse())
14918 return SDValue();
14919
14920 SDValue WideVec = N0.getOperand(0);
14921 MVT WideVT = WideVec.getSimpleValueType();
14922 if (!WideVT.is256BitVector())
14923 return SDValue();
14924
14925 // Match extracts of each half of the wide source vector. Commute the shuffle
14926 // if the extract of the low half is N1.
14927 unsigned NumElts = VT.getVectorNumElements();
14928 SmallVector<int, 4> NewMask(Mask);
14929 const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1);
14930 const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1);
14931 if (ExtIndex1 == 0 && ExtIndex0 == NumElts)
14932 ShuffleVectorSDNode::commuteMask(NewMask);
14933 else if (ExtIndex0 != 0 || ExtIndex1 != NumElts)
14934 return SDValue();
14935
14936 // Final bailout: if the mask is simple, we are better off using an extract
14937 // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
14938 // because that avoids a constant load from memory.
14939 if (NumElts == 4 &&
14940 (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask, DAG)))
14941 return SDValue();
14942
14943 // Extend the shuffle mask with undef elements.
14944 NewMask.append(NumElts, -1);
14945
14946 // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
14947 SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT),
14948 NewMask);
14949 // This is free: ymm -> xmm.
14950 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf,
14951 DAG.getIntPtrConstant(0, DL));
14952}
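// ---- Illustrative sketch (editor's addition, not part of the LLVM source) --
// The rewrite performed above turns
//   shuffle (extract_subvector X, 0), (extract_subvector X, 4), M
// into
//   extract_subvector (shuffle X, undef, M'), 0
// where M' is simply M padded with undef (-1) out to the wide element count,
// since indices 0..3 of M already address the low half of X and 4..7 the high
// half. Standalone sketch of that mask extension; the helper name is invented.
#include <array>

constexpr std::array<int, 8> widenExtractShuffleMask(const std::array<int, 4> &M) {
  std::array<int, 8> Wide{};
  for (int i = 0; i < 4; ++i)
    Wide[i] = M[i];   // narrow mask indices are already valid wide indices
  for (int i = 4; i < 8; ++i)
    Wide[i] = -1;     // the upper half of the wide result is never used
  return Wide;
}

// {1, 6, 3, 4} over the two xmm halves of a ymm becomes {1, 6, 3, 4, -1, -1, -1, -1}.
static_assert(widenExtractShuffleMask({1, 6, 3, 4})[1] == 6, "");
static_assert(widenExtractShuffleMask({1, 6, 3, 4})[7] == -1, "");
// ----------------------------------------------------------------------------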
14953
14954/// Try to lower broadcast of a single element.
14955///
14956/// For convenience, this code also bundles all of the subtarget feature set
14957/// filtering. While a little annoying to re-dispatch on type here, there isn't
14958/// a convenient way to factor it out.
14959static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
14960 SDValue V2, ArrayRef<int> Mask,
14961 const X86Subtarget &Subtarget,
14962 SelectionDAG &DAG) {
14963 if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
14964 (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
14965 (Subtarget.hasAVX2() && VT.isInteger())))
14966 return SDValue();
14967
14968 // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
14969 // we can only broadcast from a register with AVX2.
14970 unsigned NumEltBits = VT.getScalarSizeInBits();
14971 unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
14972 ? X86ISD::MOVDDUP
14973 : X86ISD::VBROADCAST;
14974 bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
14975
14976 // Check that the mask is a broadcast.
14977 int BroadcastIdx = getSplatIndex(Mask);
14978 if (BroadcastIdx < 0)
14979 return SDValue();
14980 assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
14981 "a sorted mask where the broadcast "
14982 "comes from V1.");
14983
14984 // Go up the chain of (vector) values to find a scalar load that we can
14985 // combine with the broadcast.
14986 // TODO: Combine this logic with findEltLoadSrc() used by
14987 // EltsFromConsecutiveLoads().
14988 int BitOffset = BroadcastIdx * NumEltBits;
14989 SDValue V = V1;
14990 for (;;) {
14991 switch (V.getOpcode()) {
14992 case ISD::BITCAST: {
14993 V = V.getOperand(0);
14994 continue;
14995 }
14996 case ISD::CONCAT_VECTORS: {
14997 int OpBitWidth = V.getOperand(0).getValueSizeInBits();
14998 int OpIdx = BitOffset / OpBitWidth;
14999 V = V.getOperand(OpIdx);
15000 BitOffset %= OpBitWidth;
15001 continue;
15002 }
15003 case ISD::EXTRACT_SUBVECTOR: {
15004 // The extraction index adds to the existing offset.
15005 unsigned EltBitWidth = V.getScalarValueSizeInBits();
15006 unsigned Idx = V.getConstantOperandVal(1);
15007 unsigned BeginOffset = Idx * EltBitWidth;
15008 BitOffset += BeginOffset;
15009 V = V.getOperand(0);
15010 continue;
15011 }
15012 case ISD::INSERT_SUBVECTOR: {
15013 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
15014 int EltBitWidth = VOuter.getScalarValueSizeInBits();
15015 int Idx = (int)V.getConstantOperandVal(2);
15016 int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
15017 int BeginOffset = Idx * EltBitWidth;
15018 int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
15019 if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
15020 BitOffset -= BeginOffset;
15021 V = VInner;
15022 } else {
15023 V = VOuter;
15024 }
15025 continue;
15026 }
15027 }
15028 break;
15029 }
15030 assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
15031 BroadcastIdx = BitOffset / NumEltBits;
15032
15033 // Do we need to bitcast the source to retrieve the original broadcast index?
15034 bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;
15035
15036 // Check if this is a broadcast of a scalar. We special case lowering
15037 // for scalars so that we can more effectively fold with loads.
15038 // If the original value has a larger element type than the shuffle, the
15039 // broadcast element is in essence truncated. Make that explicit to ease
15040 // folding.
15041 if (BitCastSrc && VT.isInteger())
15042 if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast(
15043 DL, VT, V, BroadcastIdx, Subtarget, DAG))
15044 return TruncBroadcast;
15045
15046 // Also check the simpler case, where we can directly reuse the scalar.
15047 if (!BitCastSrc &&
15048 ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
15049 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {
15050 V = V.getOperand(BroadcastIdx);
15051
15052 // If we can't broadcast from a register, check that the input is a load.
15053 if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
15054 return SDValue();
15055 } else if (ISD::isNormalLoad(V.getNode()) &&
15056 cast<LoadSDNode>(V)->isSimple()) {
15057 // We do not check for one-use of the vector load because a broadcast load
15058 // is expected to be a win for code size, register pressure, and possibly
15059 // uops even if the original vector load is not eliminated.
15060
15061 // Reduce the vector load and shuffle to a broadcasted scalar load.
15062 LoadSDNode *Ld = cast<LoadSDNode>(V);
15063 SDValue BaseAddr = Ld->getOperand(1);
15064 MVT SVT = VT.getScalarType();
15065 unsigned Offset = BroadcastIdx * SVT.getStoreSize();
15066 assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
15067 SDValue NewAddr =
15068 DAG.getMemBasePlusOffset(BaseAddr, TypeSize::Fixed(Offset), DL);
15069
15070 // Directly form VBROADCAST_LOAD if we're using VBROADCAST opcode rather
15071 // than MOVDDUP.
15072 // FIXME: Should we add VBROADCAST_LOAD isel patterns for pre-AVX?
15073 if (Opcode == X86ISD::VBROADCAST) {
15074 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
15075 SDValue Ops[] = {Ld->getChain(), NewAddr};
15076 V = DAG.getMemIntrinsicNode(
15077 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SVT,
15078 DAG.getMachineFunction().getMachineMemOperand(
15079 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
15080 DAG.makeEquivalentMemoryOrdering(Ld, V);
15081 return DAG.getBitcast(VT, V);
15082 }
15083 assert(SVT == MVT::f64 && "Unexpected VT!");
15084 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
15085 DAG.getMachineFunction().getMachineMemOperand(
15086 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
15087 DAG.makeEquivalentMemoryOrdering(Ld, V);
15088 } else if (!BroadcastFromReg) {
15089 // We can't broadcast from a vector register.
15090 return SDValue();
15091 } else if (BitOffset != 0) {
15092 // We can only broadcast from the zero-element of a vector register,
15093 // but it can be advantageous to broadcast from the zero-element of a
15094 // subvector.
15095 if (!VT.is256BitVector() && !VT.is512BitVector())
15096 return SDValue();
15097
15098 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
15099 if (VT == MVT::v4f64 || VT == MVT::v4i64)
15100 return SDValue();
15101
15102 // Only broadcast the zero-element of a 128-bit subvector.
15103 if ((BitOffset % 128) != 0)
15104 return SDValue();
15105
15106 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
15107 "Unexpected bit-offset");
15108 assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
15109 "Unexpected vector size");
15110 unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
15111 V = extract128BitVector(V, ExtractIdx, DAG, DL);
15112 }
15113
15114 // On AVX we can use VBROADCAST directly for scalar sources.
15115 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) {
15116 V = DAG.getBitcast(MVT::f64, V);
15117 if (Subtarget.hasAVX()) {
15118 V = DAG.getNode(X86ISD::VBROADCAST, DL, MVT::v2f64, V);
15119 return DAG.getBitcast(VT, V);
15120 }
15121 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V);
15122 }
15123
15124 // If this is a scalar, do the broadcast on this type and bitcast.
15125 if (!V.getValueType().isVector()) {
15126 assert(V.getScalarValueSizeInBits() == NumEltBits &&
15127 "Unexpected scalar size");
15128 MVT BroadcastVT = MVT::getVectorVT(V.getSimpleValueType(),
15129 VT.getVectorNumElements());
15130 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
15131 }
15132
15133 // We only support broadcasting from 128-bit vectors to minimize the
15134 // number of patterns we need to deal with in isel. So extract down to
15135 // 128-bits, removing as many bitcasts as possible.
15136 if (V.getValueSizeInBits() > 128)
15137 V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);
15138
15139 // Otherwise cast V to a vector with the same element type as VT, but
15140 // possibly narrower than VT. Then perform the broadcast.
15141 unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
15142 MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), NumSrcElts);
15143 return DAG.getNode(Opcode, DL, VT, DAG.getBitcast(CastVT, V));
15144}
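// ---- Illustrative sketch (editor's addition, not part of the LLVM source) --
// The loop in lowerShuffleAsBroadcast walks through vector-reassembly nodes
// while tracking the bit offset of the broadcast element. The two simplest
// updates are shown here in isolation: stepping into one operand of a
// CONCAT_VECTORS keeps the offset modulo that operand's width, while stepping
// through an EXTRACT_SUBVECTOR adds the extract's starting bit position.
// Helper names are invented for the example.
constexpr int concatStepOffset(int BitOffset, int OpBitWidth) {
  return BitOffset % OpBitWidth;            // offset within the chosen operand
}
constexpr int extractStepOffset(int BitOffset, int ExtractIdx, int EltBits) {
  return BitOffset + ExtractIdx * EltBits;  // extract shifts the origin upward
}

// Bit 160 of a 256-bit concat of two 128-bit operands lies at bit 32 of the
// second operand; an extract starting at i32 element 4 contributes 128 bits.
static_assert(concatStepOffset(160, 128) == 32, "");
static_assert(extractStepOffset(32, 4, 32) == 160, "");
// ----------------------------------------------------------------------------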
15145
15146// Check for whether we can use INSERTPS to perform the shuffle. We only use
15147// INSERTPS when the V1 elements are already in the correct locations
15148// because otherwise we can just always use two SHUFPS instructions which
15149// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
15150// perform INSERTPS if a single V1 element is out of place and all V2
15151// elements are zeroable.
15152static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2,
15153 unsigned &InsertPSMask,
15154 const APInt &Zeroable,
15155 ArrayRef<int> Mask, SelectionDAG &DAG) {
15156 assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
15157 assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
15158 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
15159
15160 // Attempt to match INSERTPS with one element from VA or VB being
15161 // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
15162 // are updated.
15163 auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
15164 ArrayRef<int> CandidateMask) {
15165 unsigned ZMask = 0;
15166 int VADstIndex = -1;
15167 int VBDstIndex = -1;
15168 bool VAUsedInPlace = false;
15169
15170 for (int i = 0; i < 4; ++i) {
15171 // Synthesize a zero mask from the zeroable elements (includes undefs).
15172 if (Zeroable[i]) {
15173 ZMask |= 1 << i;
15174 continue;
15175 }
15176
15177 // Flag if we use any VA inputs in place.
15178 if (i == CandidateMask[i]) {
15179 VAUsedInPlace = true;
15180 continue;
15181 }
15182
15183 // We can only insert a single non-zeroable element.
15184 if (VADstIndex >= 0 || VBDstIndex >= 0)
15185 return false;
15186
15187 if (CandidateMask[i] < 4) {
15188 // VA input out of place for insertion.
15189 VADstIndex = i;
15190 } else {
15191 // VB input for insertion.
15192 VBDstIndex = i;
15193 }
15194 }
15195
15196 // Don't bother if we have no (non-zeroable) element for insertion.
15197 if (VADstIndex < 0 && VBDstIndex < 0)
15198 return false;
15199
15200 // Determine element insertion src/dst indices. The src index is from the
15201 // start of the inserted vector, not the start of the concatenated vector.
15202 unsigned VBSrcIndex = 0;
15203 if (VADstIndex >= 0) {
15204 // If we have a VA input out of place, we use VA as the V2 element
15205 // insertion and don't use the original V2 at all.
15206 VBSrcIndex = CandidateMask[VADstIndex];
15207 VBDstIndex = VADstIndex;
15208 VB = VA;
15209 } else {
15210 VBSrcIndex = CandidateMask[VBDstIndex] - 4;
15211 }
15212
15213 // If no V1 inputs are used in place, then the result is created only from
15214 // the zero mask and the V2 insertion - so remove V1 dependency.
15215 if (!VAUsedInPlace)
15216 VA = DAG.getUNDEF(MVT::v4f32);
15217
15218 // Update V1, V2 and InsertPSMask accordingly.
15219 V1 = VA;
15220 V2 = VB;
15221
15222 // Insert the V2 element into the desired position.
15223 InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
15224 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
15225 return true;
15226 };
15227
15228 if (matchAsInsertPS(V1, V2, Mask))
15229 return true;
15230
15231 // Commute and try again.
15232 SmallVector<int, 4> CommutedMask(Mask);
15233 ShuffleVectorSDNode::commuteMask(CommutedMask);
15234 if (matchAsInsertPS(V2, V1, CommutedMask))
15235 return true;
15236
15237 return false;
15238}
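// ---- Illustrative sketch (editor's addition, not part of the LLVM source) --
// The immediate assembled in matchShuffleAsInsertPS packs three fields into
// one byte, matching the INSERTPS encoding: bits [7:6] select the source
// element of the second operand, bits [5:4] select the destination lane, and
// bits [3:0] zero individual result lanes. Helper name invented.
constexpr unsigned insertPSImm(unsigned SrcIdx, unsigned DstIdx,
                               unsigned ZeroMask) {
  return (SrcIdx << 6) | (DstIdx << 4) | (ZeroMask & 0xF);
}

// Insert element 2 of V2 into lane 1 of V1 and zero lane 3: imm = 0x98.
static_assert(insertPSImm(2, 1, 0x8) == 0x98, "");
// ----------------------------------------------------------------------------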
15239
15240static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2,
15241 ArrayRef<int> Mask, const APInt &Zeroable,
15242 SelectionDAG &DAG) {
15243 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
15244 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
15245
15246 // Attempt to match the insertps pattern.
15247 unsigned InsertPSMask = 0;
15248 if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
15249 return SDValue();
15250
15251 // Insert the V2 element into the desired position.
15252 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
15253 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
15254}
15255
15256/// Handle lowering of 2-lane 64-bit floating point shuffles.
15257///
15258/// This is the basis function for the 2-lane 64-bit shuffles as we have full
15259/// support for floating point shuffles but not integer shuffles. These
15260/// instructions will incur a domain crossing penalty on some chips though so
15261/// it is better to avoid lowering through this for integer vectors where
15262/// possible.
15263static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15264 const APInt &Zeroable, SDValue V1, SDValue V2,
15265 const X86Subtarget &Subtarget,
15266 SelectionDAG &DAG) {
15267 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
15268 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
15269 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
15270
15271 if (V2.isUndef()) {
15272 // Check for being able to broadcast a single element.
15273 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2,
15274 Mask, Subtarget, DAG))
15275 return Broadcast;
15276
15277 // Straight shuffle of a single input vector. Simulate this by using the
15278 // single input as both of the "inputs" to this instruction.
15279 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
15280
15281 if (Subtarget.hasAVX()) {
15282 // If we have AVX, we can use VPERMILPS which will allow folding a load
15283 // into the shuffle.
15284 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
15285 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
15286 }
15287
15288 return DAG.getNode(
15289 X86ISD::SHUFP, DL, MVT::v2f64,
15290 Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
15291 Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
15292 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
15293 }
15294 assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
15295 assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
15296 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
15297 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
15298
15299 if (Subtarget.hasAVX2())
15300 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
15301 return Extract;
15302
15303 // When loading a scalar and then shuffling it into a vector we can often do
15304 // the insertion cheaply.
15305 if (SDValue Insertion = lowerShuffleAsElementInsertion(
15306 DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
15307 return Insertion;
15308 // Try inverting the insertion since for v2 masks it is easy to do and we
15309 // can't reliably sort the mask one way or the other.
15310 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
15311 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
15312 if (SDValue Insertion = lowerShuffleAsElementInsertion(
15313 DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
15314 return Insertion;
15315
15316 // Try to use one of the special instruction patterns to handle two common
15317 // blend patterns if a zero-blend above didn't work.
15318 if (isShuffleEquivalent(Mask, {0, 3}, V1, V2) ||
15319 isShuffleEquivalent(Mask, {1, 3}, V1, V2))
15320 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
15321 // We can either use a special instruction to load over the low double or
15322 // to move just the low double.
15323 return DAG.getNode(
15324 X86ISD::MOVSD, DL, MVT::v2f64, V2,
15325 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
15326
15327 if (Subtarget.hasSSE41())
15328 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
15329 Zeroable, Subtarget, DAG))
15330 return Blend;
15331
15332 // Use dedicated unpack instructions for masks that match their pattern.
15333 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
15334 return V;
15335
15336 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
15337 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
15338 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
15339}
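// ---- Illustrative sketch (editor's addition, not part of the LLVM source) --
// The two-input SHUFPD immediate computed at the end of lowerV2F64Shuffle uses
// one selector bit per result lane: bit 0 picks V1's element for lane 0 and
// bit 1 picks V2's element for lane 1. Mask values 0..1 address V1 and 2..3
// address V2. Helper name invented.
constexpr unsigned shufpdImm(int Mask0, int Mask1) {
  return static_cast<unsigned>(Mask0 == 1) |
         (static_cast<unsigned>((Mask1 - 2) == 1) << 1);
}

// Mask {1, 3}: high element of V1 and high element of V2 -> imm 0b11.
static_assert(shufpdImm(1, 3) == 0x3, "");
// Mask {0, 2}: low element of each source -> imm 0b00.
static_assert(shufpdImm(0, 2) == 0x0, "");
// ----------------------------------------------------------------------------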
15340
15341/// Handle lowering of 2-lane 64-bit integer shuffles.
15342///
15343/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
15344/// the integer unit to minimize domain crossing penalties. However, for blends
15345/// it falls back to the floating point shuffle operation with appropriate bit
15346/// casting.
15347static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15348 const APInt &Zeroable, SDValue V1, SDValue V2,
15349 const X86Subtarget &Subtarget,
15350 SelectionDAG &DAG) {
15351 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
15352 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
15353 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
15354
15355 if (V2.isUndef()) {
15356 // Check for being able to broadcast a single element.
15357 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2,
15358 Mask, Subtarget, DAG))
15359 return Broadcast;
15360
15361 // Straight shuffle of a single input vector. For everything from SSE2
15362 // onward this has a single fast instruction with no scary immediates.
15363 // We have to map the mask as it is actually a v4i32 shuffle instruction.
15364 V1 = DAG.getBitcast(MVT::v4i32, V1);
15365 int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2),
15366 Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1),
15367 Mask[1] < 0 ? -1 : (Mask[1] * 2),
15368 Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)};
15369 return DAG.getBitcast(
15370 MVT::v2i64,
15371 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
15372 getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
15373 }
15374 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
15375 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
15376 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
15377 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
15378
15379 if (Subtarget.hasAVX2())
15380 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
15381 return Extract;
15382
15383 // Try to use shift instructions.
15384 if (SDValue Shift =
15385 lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget,
15386 DAG, /*BitwiseOnly*/ false))
15387 return Shift;
15388
15389 // When loading a scalar and then shuffling it into a vector we can often do
15390 // the insertion cheaply.
15391 if (SDValue Insertion = lowerShuffleAsElementInsertion(
15392 DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
15393 return Insertion;
15394 // Try inverting the insertion since for v2 masks it is easy to do and we
15395 // can't reliably sort the mask one way or the other.
15396 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
15397 if (SDValue Insertion = lowerShuffleAsElementInsertion(
15398 DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
15399 return Insertion;
15400
15401 // We have different paths for blend lowering, but they all must use the
15402 // *exact* same predicate.
15403 bool IsBlendSupported = Subtarget.hasSSE41();
15404 if (IsBlendSupported)
15405 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
15406 Zeroable, Subtarget, DAG))
15407 return Blend;
15408
15409 // Use dedicated unpack instructions for masks that match their pattern.
15410 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
15411 return V;
15412
15413 // Try to use byte rotation instructions.
15414 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
15415 if (Subtarget.hasSSSE3()) {
15416 if (Subtarget.hasVLX())
15417 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v2i64, V1, V2, Mask,
15418 Subtarget, DAG))
15419 return Rotate;
15420
15421 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask,
15422 Subtarget, DAG))
15423 return Rotate;
15424 }
15425
15426 // If we have direct support for blends, we should lower by decomposing into
15427 // a permute. That will be faster than the domain cross.
15428 if (IsBlendSupported)
15429 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v2i64, V1, V2, Mask,
15430 Subtarget, DAG);
15431
15432 // We implement this with SHUFPD which is pretty lame because it will likely
15433 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
15434 // However, all the alternatives are still more cycles and newer chips don't
15435 // have this problem. It would be really nice if x86 had better shuffles here.
15436 V1 = DAG.getBitcast(MVT::v2f64, V1);
15437 V2 = DAG.getBitcast(MVT::v2f64, V2);
15438 return DAG.getBitcast(MVT::v2i64,
15439 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
15440}
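// ---- Illustrative sketch (editor's addition, not part of the LLVM source) --
// The single-input path of lowerV2I64Shuffle re-expresses a v2i64 mask as a
// v4i32 PSHUFD mask: each 64-bit lane index m expands to the dword pair
// {2m, 2m+1}, and an undef lane (-1) stays undef in both halves. Helper name
// invented.
#include <array>

constexpr std::array<int, 4> widenV2ToV4Mask(int M0, int M1) {
  return {M0 < 0 ? -1 : 2 * M0, M0 < 0 ? -1 : 2 * M0 + 1,
          M1 < 0 ? -1 : 2 * M1, M1 < 0 ? -1 : 2 * M1 + 1};
}

// v2i64 mask {1, 0} (swap the quadwords) becomes v4i32 mask {2, 3, 0, 1}.
static_assert(widenV2ToV4Mask(1, 0)[0] == 2 && widenV2ToV4Mask(1, 0)[3] == 1, "");
// ----------------------------------------------------------------------------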
15441
15442/// Lower a vector shuffle using the SHUFPS instruction.
15443///
15444/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
15445/// It makes no assumptions about whether this is the *best* lowering, it simply
15446/// uses it.
15447static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
15448 ArrayRef<int> Mask, SDValue V1,
15449 SDValue V2, SelectionDAG &DAG) {
15450 SDValue LowV = V1, HighV = V2;
15451 SmallVector<int, 4> NewMask(Mask);
15452 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
15453
15454 if (NumV2Elements == 1) {
15455 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
15456
15457 // Compute the index adjacent to V2Index and in the same half by toggling
15458 // the low bit.
15459 int V2AdjIndex = V2Index ^ 1;
15460
15461 if (Mask[V2AdjIndex] < 0) {
15462 // Handles all the cases where we have a single V2 element and an undef.
15463 // This will only ever happen in the high lanes because we commute the
15464 // vector otherwise.
15465 if (V2Index < 2)
15466 std::swap(LowV, HighV);
15467 NewMask[V2Index] -= 4;
15468 } else {
15469 // Handle the case where the V2 element ends up adjacent to a V1 element.
15470 // To make this work, blend them together as the first step.
15471 int V1Index = V2AdjIndex;
15472 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
15473 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
15474 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
15475
15476 // Now proceed to reconstruct the final blend as we have the necessary
15477 // high or low half formed.
15478 if (V2Index < 2) {
15479 LowV = V2;
15480 HighV = V1;
15481 } else {
15482 HighV = V2;
15483 }
15484 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
15485 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
15486 }
15487 } else if (NumV2Elements == 2) {
15488 if (Mask[0] < 4 && Mask[1] < 4) {
15489 // Handle the easy case where we have V1 in the low lanes and V2 in the
15490 // high lanes.
15491 NewMask[2] -= 4;
15492 NewMask[3] -= 4;
15493 } else if (Mask[2] < 4 && Mask[3] < 4) {
15494 // We also handle the reversed case because this utility may get called
15495 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
15496 // arrange things in the right direction.
15497 NewMask[0] -= 4;
15498 NewMask[1] -= 4;
15499 HighV = V1;
15500 LowV = V2;
15501 } else {
15502 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
15503 // trying to place elements directly, just blend them and set up the final
15504 // shuffle to place them.
15505
15506 // The first two blend mask elements are for V1, the second two are for
15507 // V2.
15508 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
15509 Mask[2] < 4 ? Mask[2] : Mask[3],
15510 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
15511 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
15512 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
15513 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
15514
15515 // Now we do a normal shuffle of V1 by giving V1 as both operands to
15516 // a blend.
15517 LowV = HighV = V1;
15518 NewMask[0] = Mask[0] < 4 ? 0 : 2;
15519 NewMask[1] = Mask[0] < 4 ? 2 : 0;
15520 NewMask[2] = Mask[2] < 4 ? 1 : 3;
15521 NewMask[3] = Mask[2] < 4 ? 3 : 1;
15522 }
15523 } else if (NumV2Elements == 3) {
15524 // Ideally canonicalizeShuffleMaskWithCommute should have caught this, but
15525 // we can get here due to other paths (e.g repeated mask matching) that we
15526 // don't want to do another round of lowerVECTOR_SHUFFLE.
15527 ShuffleVectorSDNode::commuteMask(NewMask);
15528 return lowerShuffleWithSHUFPS(DL, VT, NewMask, V2, V1, DAG);
15529 }
15530 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
15531 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
15532}
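// ---- Illustrative sketch (editor's addition, not part of the LLVM source) --
// Judging from its call sites in these lowerings, getV4X86ShuffleImm8ForMask
// packs a 4-element mask into the 8-bit immediate consumed by
// SHUFPS/PSHUFD/PSHUFLW/PSHUFHW: two bits per result lane, lane 0 in the low
// bits. This standalone helper mirrors that packing for in-range masks; undef
// handling is omitted. Helper name invented.
constexpr unsigned v4ShuffleImm8(int M0, int M1, int M2, int M3) {
  return static_cast<unsigned>(M0 & 0x3) |
         (static_cast<unsigned>(M1 & 0x3) << 2) |
         (static_cast<unsigned>(M2 & 0x3) << 4) |
         (static_cast<unsigned>(M3 & 0x3) << 6);
}

// The identity mask {0, 1, 2, 3} encodes as the classic no-op immediate 0xE4.
static_assert(v4ShuffleImm8(0, 1, 2, 3) == 0xE4, "");
// ----------------------------------------------------------------------------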
15533
15534/// Lower 4-lane 32-bit floating point shuffles.
15535///
15536/// Uses instructions exclusively from the floating point unit to minimize
15537/// domain crossing penalties, as these are sufficient to implement all v4f32
15538/// shuffles.
15539static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15540 const APInt &Zeroable, SDValue V1, SDValue V2,
15541 const X86Subtarget &Subtarget,
15542 SelectionDAG &DAG) {
15543 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
15544 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
15545 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
15546
15547 if (Subtarget.hasSSE41())
15548 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
15549 Zeroable, Subtarget, DAG))
15550 return Blend;
15551
15552 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
15553
15554 if (NumV2Elements == 0) {
15555 // Check for being able to broadcast a single element.
15556 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2,
15557 Mask, Subtarget, DAG))
15558 return Broadcast;
15559
15560 // Use even/odd duplicate instructions for masks that match their pattern.
15561 if (Subtarget.hasSSE3()) {
15562 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
15563 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
15564 if (isShuffleEquivalent(Mask, {1, 1, 3, 3}, V1, V2))
15565 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
15566 }
15567
15568 if (Subtarget.hasAVX()) {
15569 // If we have AVX, we can use VPERMILPS which will allow folding a load
15570 // into the shuffle.
15571 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
15572 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
15573 }
15574
15575 // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
15576 // in SSE1 because otherwise they are widened to v2f64 and never get here.
15577 if (!Subtarget.hasSSE2()) {
15578 if (isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2))
15579 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
15580 if (isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1, V2))
15581 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
15582 }
15583
15584 // Otherwise, use a straight shuffle of a single input vector. We pass the
15585 // input vector to both operands to simulate this with a SHUFPS.
15586 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
15587 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
15588 }
15589
15590 if (Subtarget.hasSSE2())
15591 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
15592 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) {
15593 ZExt = DAG.getBitcast(MVT::v4f32, ZExt);
15594 return ZExt;
15595 }
15596
15597 if (Subtarget.hasAVX2())
15598 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
15599 return Extract;
15600
15601 // There are special ways we can lower some single-element blends. However, we
15602 // have custom ways we can lower more complex single-element blends below that
15603 // we defer to if both this and BLENDPS fail to match, so restrict this to
15604 // when the V2 input is targeting element 0 of the mask -- that is the fast
15605 // case here.
15606 if (NumV2Elements == 1 && Mask[0] >= 4)
15607 if (SDValue V = lowerShuffleAsElementInsertion(
15608 DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
15609 return V;
15610
15611 if (Subtarget.hasSSE41()) {
15612 // Use INSERTPS if we can complete the shuffle efficiently.
15613 if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
15614 return V;
15615
15616 if (!isSingleSHUFPSMask(Mask))
15617 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1,
15618 V2, Mask, DAG))
15619 return BlendPerm;
15620 }
15621
15622 // Use low/high mov instructions. These are only valid in SSE1 because
15623 // otherwise they are widened to v2f64 and never get here.
15624 if (!Subtarget.hasSSE2()) {
15625 if (isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2))
15626 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
15627 if (isShuffleEquivalent(Mask, {2, 3, 6, 7}, V1, V2))
15628 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
15629 }
15630
15631 // Use dedicated unpack instructions for masks that match their pattern.
15632 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
15633 return V;
15634
15635 // Otherwise fall back to a SHUFPS lowering strategy.
15636 return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
15637}
15638
15639/// Lower 4-lane i32 vector shuffles.
15640///
15641/// We try to handle these with integer-domain shuffles where we can, but for
15642/// blends we use the floating point domain blend instructions.
15643static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15644 const APInt &Zeroable, SDValue V1, SDValue V2,
15645 const X86Subtarget &Subtarget,
15646 SelectionDAG &DAG) {
15647 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
15648 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
15649 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
15650
15651 // Whenever we can lower this as a zext, that instruction is strictly faster
15652 // than any alternative. It also allows us to fold memory operands into the
15653 // shuffle in many cases.
15654 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask,
15655 Zeroable, Subtarget, DAG))
15656 return ZExt;
15657
15658 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
15659
15660 // Try to use shift instructions if fast.
15661 if (Subtarget.preferLowerShuffleAsShift()) {
15662 if (SDValue Shift =
15663 lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable,
15664 Subtarget, DAG, /*BitwiseOnly*/ true))
15665 return Shift;
15666 if (NumV2Elements == 0)
15667 if (SDValue Rotate =
15668 lowerShuffleAsBitRotate(DL, MVT::v4i32, V1, Mask, Subtarget, DAG))
15669 return Rotate;
15670 }
15671
15672 if (NumV2Elements == 0) {
15673 // Try to use broadcast unless the mask only has one non-undef element.
15674 if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) {
15675 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
15676 Mask, Subtarget, DAG))
15677 return Broadcast;
15678 }
15679
15680 // Straight shuffle of a single input vector. For everything from SSE2
15681 // onward this has a single fast instruction with no scary immediates.
15682 // We coerce the shuffle pattern to be compatible with UNPCK instructions
15683 // but we aren't actually going to use the UNPCK instruction because doing
15684 // so prevents folding a load into this instruction or making a copy.
15685 const int UnpackLoMask[] = {0, 0, 1, 1};
15686 const int UnpackHiMask[] = {2, 2, 3, 3};
15687 if (isShuffleEquivalent(Mask, {0, 0, 1, 1}, V1, V2))
15688 Mask = UnpackLoMask;
15689 else if (isShuffleEquivalent(Mask, {2, 2, 3, 3}, V1, V2))
15690 Mask = UnpackHiMask;
15691
15692 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
15693 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
15694 }
15695
15696 if (Subtarget.hasAVX2())
15697 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
15698 return Extract;
15699
15700 // Try to use shift instructions.
15701 if (SDValue Shift =
15702 lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget,
15703 DAG, /*BitwiseOnly*/ false))
15704 return Shift;
15705
15706 // There are special ways we can lower some single-element blends.
15707 if (NumV2Elements == 1)
15708 if (SDValue V = lowerShuffleAsElementInsertion(
15709 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
15710 return V;
15711
15712 // We have different paths for blend lowering, but they all must use the
15713 // *exact* same predicate.
15714 bool IsBlendSupported = Subtarget.hasSSE41();
15715 if (IsBlendSupported)
15716 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
15717 Zeroable, Subtarget, DAG))
15718 return Blend;
15719
15720 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
15721 Zeroable, Subtarget, DAG))
15722 return Masked;
15723
15724 // Use dedicated unpack instructions for masks that match their pattern.
15725 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
15726 return V;
15727
15728 // Try to use byte rotation instructions.
15729 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
15730 if (Subtarget.hasSSSE3()) {
15731 if (Subtarget.hasVLX())
15732 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask,
15733 Subtarget, DAG))
15734 return Rotate;
15735
15736 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
15737 Subtarget, DAG))
15738 return Rotate;
15739 }
15740
15741 // Assume that a single SHUFPS is faster than an alternative sequence of
15742 // multiple instructions (even if the CPU has a domain penalty).
15743 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
15744 if (!isSingleSHUFPSMask(Mask)) {
15745 // If we have direct support for blends, we should lower by decomposing into
15746 // a permute. That will be faster than the domain cross.
15747 if (IsBlendSupported)
15748 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i32, V1, V2, Mask,
15749 Subtarget, DAG);
15750
15751 // Try to lower by permuting the inputs into an unpack instruction.
15752 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2,
15753 Mask, Subtarget, DAG))
15754 return Unpack;
15755 }
15756
15757 // We implement this with SHUFPS because it can blend from two vectors.
15758 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
15759 // up the inputs, bypassing domain shift penalties that we would incur if we
15760 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
15761 // relevant.
15762 SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
15763 SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
15764 SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
15765 return DAG.getBitcast(MVT::v4i32, ShufPS);
15766}
15767
15768/// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
15769/// shuffle lowering, and the most complex part.
15770///
15771/// The lowering strategy is to try to form pairs of input lanes which are
15772/// targeted at the same half of the final vector, and then use a dword shuffle
15773/// to place them onto the right half, and finally unpack the paired lanes into
15774/// their final position.
15775///
15776/// The exact breakdown of how to form these dword pairs and align them on the
15777/// correct sides is really tricky. See the comments within the function for
15778/// more of the details.
15779///
15780/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
15781/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
15782/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
15783/// vector, form the analogous 128-bit 8-element Mask.
15784static SDValue lowerV8I16GeneralSingleInputShuffle(
15785 const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
15786 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
15787 assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
15788 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
15789
15790 assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
15791 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
15792 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
15793
15794 // Attempt to directly match PSHUFLW or PSHUFHW.
15795 if (isUndefOrInRange(LoMask, 0, 4) &&
15796 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
15797 return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
15798 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
15799 }
15800 if (isUndefOrInRange(HiMask, 4, 8) &&
15801 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
15802 for (int i = 0; i != 4; ++i)
15803 HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
15804 return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
15805 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
15806 }
15807
15808 SmallVector<int, 4> LoInputs;
15809 copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
15810 array_pod_sort(LoInputs.begin(), LoInputs.end());
15811 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
15812 SmallVector<int, 4> HiInputs;
15813 copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
15814 array_pod_sort(HiInputs.begin(), HiInputs.end());
15815 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
15816 int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();
15817 int NumHToL = LoInputs.size() - NumLToL;
15818 int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();
15819 int NumHToH = HiInputs.size() - NumLToH;
15820 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
15821 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
15822 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
15823 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
15824
15825 // If we are shuffling values from one half - check how many different DWORD
15826 // pairs we need to create. If only 1 or 2 then we can perform this as a
15827 // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
15828 auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
15829 ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
15830 V = DAG.getNode(ShufWOp, DL, VT, V,
15831 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
15832 V = DAG.getBitcast(PSHUFDVT, V);
15833 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
15834 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
15835 return DAG.getBitcast(VT, V);
15836 };
15837
15838 if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
15839 int PSHUFDMask[4] = { -1, -1, -1, -1 };
15840 SmallVector<std::pair<int, int>, 4> DWordPairs;
15841 int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
15842
15843 // Collect the different DWORD pairs.
15844 for (int DWord = 0; DWord != 4; ++DWord) {
15845 int M0 = Mask[2 * DWord + 0];
15846 int M1 = Mask[2 * DWord + 1];
15847 M0 = (M0 >= 0 ? M0 % 4 : M0);
15848 M1 = (M1 >= 0 ? M1 % 4 : M1);
15849 if (M0 < 0 && M1 < 0)
15850 continue;
15851
15852 bool Match = false;
15853 for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
15854 auto &DWordPair = DWordPairs[j];
15855 if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
15856 (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
15857 DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
15858 DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
15859 PSHUFDMask[DWord] = DOffset + j;
15860 Match = true;
15861 break;
15862 }
15863 }
15864 if (!Match) {
15865 PSHUFDMask[DWord] = DOffset + DWordPairs.size();
15866 DWordPairs.push_back(std::make_pair(M0, M1));
15867 }
15868 }
15869
15870 if (DWordPairs.size() <= 2) {
15871 DWordPairs.resize(2, std::make_pair(-1, -1));
15872 int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
15873 DWordPairs[1].first, DWordPairs[1].second};
15874 if ((NumHToL + NumHToH) == 0)
15875 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
15876 if ((NumLToL + NumLToH) == 0)
15877 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
15878 }
15879 }
15880
15881 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
15882 // such inputs we can swap two of the dwords across the half mark and end up
15883 // with <=2 inputs to each half in each half. Once there, we can fall through
15884 // to the generic code below. For example:
15885 //
15886 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
15887 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
15888 //
15889 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
15890 // and an existing 2-into-2 on the other half. In this case we may have to
15891 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
15892 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
15893 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
15894 // because any other situation (including a 3-into-1 or 1-into-3 in the other
15895 // half than the one we target for fixing) will be fixed when we re-enter this
15896 // path. We will also combine away any sequence of PSHUFD instructions that
15897 // result into a single instruction. Here is an example of the tricky case:
15898 //
15899 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
15900 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
15901 //
15902 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
15903 //
15904 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
15905 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
15906 //
15907 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
15908 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
15909 //
15910 // The result is fine to be handled by the generic logic.
15911 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
15912 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
15913 int AOffset, int BOffset) {
15914 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
15915        "Must call this with A having 3 or 1 inputs from the A half.");
15916 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
15917        "Must call this with B having 1 or 3 inputs from the B half.");
15918 assert(AToAInputs.size() + BToAInputs.size() == 4 &&
15919        "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
15920
15921 bool ThreeAInputs = AToAInputs.size() == 3;
15922
15923 // Compute the index of dword with only one word among the three inputs in
15924 // a half by taking the sum of the half with three inputs and subtracting
15925 // the sum of the actual three inputs. The difference is the remaining
15926 // slot.
15927 int ADWord = 0, BDWord = 0;
15928 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
15929 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
15930 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
15931 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
15932 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
15933 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
15934 int TripleNonInputIdx =
15935 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
15936 TripleDWord = TripleNonInputIdx / 2;
15937
15938 // We use xor with one to compute the adjacent DWord to whichever one the
15939 // OneInput is in.
15940 OneInputDWord = (OneInput / 2) ^ 1;
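// For example, if OneInput is word 5 it lives in dword 2, so OneInputDWord
// becomes the adjacent dword 3.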
15941
15942 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
15943 // and BToA inputs. If there is also such a problem with the BToB and AToB
15944 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
15945 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
15946 // is essential that we don't *create* a 3<-1 as then we might oscillate.
15947 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
15948 // Compute how many inputs will be flipped by swapping these DWords. We
15949 // need
15950 // to balance this to ensure we don't form a 3-1 shuffle in the other
15951 // half.
15952 int NumFlippedAToBInputs = llvm::count(AToBInputs, 2 * ADWord) +
15953 llvm::count(AToBInputs, 2 * ADWord + 1);
15954 int NumFlippedBToBInputs = llvm::count(BToBInputs, 2 * BDWord) +
15955 llvm::count(BToBInputs, 2 * BDWord + 1);
15956 if ((NumFlippedAToBInputs == 1 &&
15957 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
15958 (NumFlippedBToBInputs == 1 &&
15959 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
15960 // We choose whether to fix the A half or B half based on whether that
15961 // half has zero flipped inputs. At zero, we may not be able to fix it
15962 // with that half. We also bias towards fixing the B half because that
15963 // will more commonly be the high half, and we have to bias one way.
15964 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
15965 ArrayRef<int> Inputs) {
15966 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
15967 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
15968 // Determine whether the free index is in the flipped dword or the
15969 // unflipped dword based on where the pinned index is. We use this bit
15970 // in an xor to conditionally select the adjacent dword.
15971 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
15972 bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
15973 if (IsFixIdxInput == IsFixFreeIdxInput)
15974 FixFreeIdx += 1;
15975 IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
15976 assert(IsFixIdxInput != IsFixFreeIdxInput &&
15977        "We need to be changing the number of flipped inputs!");
15978 int PSHUFHalfMask[] = {0, 1, 2, 3};
15979 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
15980 V = DAG.getNode(
15981 FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
15982 MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
15983 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
15984
15985 for (int &M : Mask)
15986 if (M >= 0 && M == FixIdx)
15987 M = FixFreeIdx;
15988 else if (M >= 0 && M == FixFreeIdx)
15989 M = FixIdx;
15990 };
15991 if (NumFlippedBToBInputs != 0) {
15992 int BPinnedIdx =
15993 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
15994 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
15995 } else {
15996 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
15997 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
15998 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
15999 }
16000 }
16001 }
16002
16003 int PSHUFDMask[] = {0, 1, 2, 3};
16004 PSHUFDMask[ADWord] = BDWord;
16005 PSHUFDMask[BDWord] = ADWord;
16006 V = DAG.getBitcast(
16007 VT,
16008 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
16009 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
16010
16011 // Adjust the mask to match the new locations of A and B.
16012 for (int &M : Mask)
16013 if (M >= 0 && M/2 == ADWord)
16014 M = 2 * BDWord + M % 2;
16015 else if (M >= 0 && M/2 == BDWord)
16016 M = 2 * ADWord + M % 2;
16017
16018 // Recurse back into this routine to re-compute state now that this isn't
16019 // a 3 and 1 problem.
16020 return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG);
16021 };
16022 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
16023 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
16024 if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
16025 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
16026
16027 // At this point there are at most two inputs to the low and high halves from
16028 // each half. That means the inputs can always be grouped into dwords and
16029 // those dwords can then be moved to the correct half with a dword shuffle.
16030 // We use at most one low and one high word shuffle to collect these paired
16031 // inputs into dwords, and finally a dword shuffle to place them.
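// For example, for the interleaving mask [0,4,1,5,2,6,3,7] the low half takes
// {0,1} and {4,5} while the high half takes {2,3} and {6,7}; those pairs are
// already dword-aligned, so a single PSHUFD [0,2,1,3] gathers them and the
// final PSHUFLW/PSHUFHW below put each word in its place.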
16032 int PSHUFLMask[4] = {-1, -1, -1, -1};
16033 int PSHUFHMask[4] = {-1, -1, -1, -1};
16034 int PSHUFDMask[4] = {-1, -1, -1, -1};
16035
16036 // First fix the masks for all the inputs that are staying in their
16037 // original halves. This will then dictate the targets of the cross-half
16038 // shuffles.
16039 auto fixInPlaceInputs =
16040 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
16041 MutableArrayRef<int> SourceHalfMask,
16042 MutableArrayRef<int> HalfMask, int HalfOffset) {
16043 if (InPlaceInputs.empty())
16044 return;
16045 if (InPlaceInputs.size() == 1) {
16046 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
16047 InPlaceInputs[0] - HalfOffset;
16048 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
16049 return;
16050 }
16051 if (IncomingInputs.empty()) {
16052 // Just fix all of the in place inputs.
16053 for (int Input : InPlaceInputs) {
16054 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
16055 PSHUFDMask[Input / 2] = Input / 2;
16056 }
16057 return;
16058 }
16059
16060 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
16061 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
16062 InPlaceInputs[0] - HalfOffset;
16063 // Put the second input next to the first so that they are packed into
16064 // a dword. We find the adjacent index by toggling the low bit.
16065 int AdjIndex = InPlaceInputs[0] ^ 1;
16066 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
16067 std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
16068 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
16069 };
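// For instance, with in-place inputs {1,3} and HalfOffset 0: word 1 keeps its
// slot, word 3 is redirected to the adjacent slot 0 (1 ^ 1), and PSHUFDMask[0]
// is pinned so the resulting dword stays in place.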
16070 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
16071 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
16072
16073 // Now gather the cross-half inputs and place them into a free dword of
16074 // their target half.
16075 // FIXME: This operation could almost certainly be simplified dramatically to
16076 // look more like the 3-1 fixing operation.
16077 auto moveInputsToRightHalf = [&PSHUFDMask](
16078 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
16079 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
16080 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
16081 int DestOffset) {
16082 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
16083 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
16084 };
16085 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
16086 int Word) {
16087 int LowWord = Word & ~1;
16088 int HighWord = Word | 1;
16089 return isWordClobbered(SourceHalfMask, LowWord) ||
16090 isWordClobbered(SourceHalfMask, HighWord);
16091 };
16092
16093 if (IncomingInputs.empty())
16094 return;
16095
16096 if (ExistingInputs.empty()) {
16097 // Map any dwords with inputs from them into the right half.
16098 for (int Input : IncomingInputs) {
16099 // If the source half mask maps over the inputs, turn those into
16100 // swaps and use the swapped lane.
16101 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
16102 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
16103 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
16104 Input - SourceOffset;
16105 // We have to swap the uses in our half mask in one sweep.
16106 for (int &M : HalfMask)
16107 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
16108 M = Input;
16109 else if (M == Input)
16110 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
16111 } else {
16112 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
16113            Input - SourceOffset &&
16114        "Previous placement doesn't match!");
16115 }
16116 // Note that this correctly re-maps both when we do a swap and when
16117 // we observe the other side of the swap above. We rely on that to
16118 // avoid swapping the members of the input list directly.
16119 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
16120 }
16121
16122 // Map the input's dword into the correct half.
16123 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
16124 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
16125 else
16126 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
16127            Input / 2 &&
16128        "Previous placement doesn't match!");
16129 }
16130
16131 // And just directly shift any other-half mask elements to be same-half
16132 // as we will have mirrored the dword containing the element into the
16133 // same position within that half.
16134 for (int &M : HalfMask)
16135 if (M >= SourceOffset && M < SourceOffset + 4) {
16136 M = M - SourceOffset + DestOffset;
16137 assert(M >= 0 && "This should never wrap below zero!");
16138 }
16139 return;
16140 }
16141
16142 // Ensure we have the input in a viable dword of its current half. This
16143 // is particularly tricky because the original position may be clobbered
16144 // by inputs being moved and *staying* in that half.
16145 if (IncomingInputs.size() == 1) {
16146 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
16147 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
16148 SourceOffset;
16149 SourceHalfMask[InputFixed - SourceOffset] =
16150 IncomingInputs[0] - SourceOffset;
16151 std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
16152 InputFixed);
16153 IncomingInputs[0] = InputFixed;
16154 }
16155 } else if (IncomingInputs.size() == 2) {
16156 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
16157 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
16158 // We have two non-adjacent or clobbered inputs we need to extract from
16159 // the source half. To do this, we need to map them into some adjacent
16160 // dword slot in the source mask.
16161 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
16162 IncomingInputs[1] - SourceOffset};
16163
16164 // If there is a free slot in the source half mask adjacent to one of
16165 // the inputs, place the other input in it. We use (Index XOR 1) to
16166 // compute an adjacent index.
16167 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
16168 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
16169 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
16170 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
16171 InputsFixed[1] = InputsFixed[0] ^ 1;
16172 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
16173 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
16174 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
16175 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
16176 InputsFixed[0] = InputsFixed[1] ^ 1;
16177 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
16178 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
16179 // The two inputs are in the same DWord but it is clobbered and the
16180 // adjacent DWord isn't used at all. Move both inputs to the free
16181 // slot.
16182 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
16183 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
16184 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
16185 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
16186 } else {
16187 // The only way we hit this point is if there is no clobbering
16188 // (because there are no off-half inputs to this half) and there is no
16189 // free slot adjacent to one of the inputs. In this case, we have to
16190 // swap an input with a non-input.
16191 for (int i = 0; i < 4; ++i)
16192 assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
16193        "We can't handle any clobbers here!");
16194 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
16195        "Cannot have adjacent inputs here!");
16196
16197 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
16198 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
16199
16200 // We also have to update the final source mask in this case because
16201 // it may need to undo the above swap.
16202 for (int &M : FinalSourceHalfMask)
16203 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
16204 M = InputsFixed[1] + SourceOffset;
16205 else if (M == InputsFixed[1] + SourceOffset)
16206 M = (InputsFixed[0] ^ 1) + SourceOffset;
16207
16208 InputsFixed[1] = InputsFixed[0] ^ 1;
16209 }
16210
16211 // Point everything at the fixed inputs.
16212 for (int &M : HalfMask)
16213 if (M == IncomingInputs[0])
16214 M = InputsFixed[0] + SourceOffset;
16215 else if (M == IncomingInputs[1])
16216 M = InputsFixed[1] + SourceOffset;
16217
16218 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
16219 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
16220 }
16221 } else {
16222 llvm_unreachable("Unhandled input size!");
16223 }
16224
16225 // Now hoist the DWord down to the right half.
16226 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
16227 assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
16228 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
16229 for (int &M : HalfMask)
16230 for (int Input : IncomingInputs)
16231 if (M == Input)
16232 M = FreeDWord * 2 + Input % 2;
16233 };
16234 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
16235 /*SourceOffset*/ 4, /*DestOffset*/ 0);
16236 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
16237 /*SourceOffset*/ 0, /*DestOffset*/ 4);
16238
16239 // Now enact all the shuffles we've computed to move the inputs into their
16240 // target half.
16241 if (!isNoopShuffleMask(PSHUFLMask))
16242 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
16243 getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
16244 if (!isNoopShuffleMask(PSHUFHMask))
16245 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
16246 getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
16247 if (!isNoopShuffleMask(PSHUFDMask))
16248 V = DAG.getBitcast(
16249 VT,
16250 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
16251 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
16252
16253 // At this point, each half should contain all its inputs, and we can then
16254 // just shuffle them into their final position.
16255 assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
16256        "Failed to lift all the high half inputs to the low mask!");
16257 assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
16258        "Failed to lift all the low half inputs to the high mask!");
16259
16260 // Do a half shuffle for the low mask.
16261 if (!isNoopShuffleMask(LoMask))
16262 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
16263 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
16264
16265 // Do a half shuffle with the high mask after shifting its values down.
16266 for (int &M : HiMask)
16267 if (M >= 0)
16268 M -= 4;
16269 if (!isNoopShuffleMask(HiMask))
16270 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
16271 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
16272
16273 return V;
16274}
16275
16276/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
16277/// blend if only one input is used.
16278static SDValue lowerShuffleAsBlendOfPSHUFBs(
16279 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
16280 const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
16281 assert(!is128BitLaneCrossingShuffleMask(VT, Mask) &&
16282        "Lane crossing shuffle masks not supported");
16283
16284 int NumBytes = VT.getSizeInBits() / 8;
16285 int Size = Mask.size();
16286 int Scale = NumBytes / Size;
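// For v8i16 this gives NumBytes = 16, Size = 8 and Scale = 2, i.e. each mask
// element expands into two consecutive byte indices below.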
16287
16288 SmallVector<SDValue, 64> V1Mask(NumBytes, DAG.getUNDEF(MVT::i8));
16289 SmallVector<SDValue, 64> V2Mask(NumBytes, DAG.getUNDEF(MVT::i8));
16290 V1InUse = false;
16291 V2InUse = false;
16292
16293 for (int i = 0; i < NumBytes; ++i) {
16294 int M = Mask[i / Scale];
16295 if (M < 0)
16296 continue;
16297
16298 const int ZeroMask = 0x80;
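// PSHUFB zeroes a destination byte whenever bit 7 of its control byte is set,
// so 0x80 marks bytes that must come from the other input or stay zero.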
16299 int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask;
16300 int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
16301 if (Zeroable[i / Scale])
16302 V1Idx = V2Idx = ZeroMask;
16303
16304 V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
16305 V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
16306 V1InUse |= (ZeroMask != V1Idx);
16307 V2InUse |= (ZeroMask != V2Idx);
16308 }
16309
16310 MVT ShufVT = MVT::getVectorVT(MVT::i8, NumBytes);
16311 if (V1InUse)
16312 V1 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V1),
16313 DAG.getBuildVector(ShufVT, DL, V1Mask));
16314 if (V2InUse)
16315 V2 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V2),
16316 DAG.getBuildVector(ShufVT, DL, V2Mask));
16317
16318 // If we need shuffled inputs from both, blend the two.
16319 SDValue V;
16320 if (V1InUse && V2InUse)
16321 V = DAG.getNode(ISD::OR, DL, ShufVT, V1, V2);
16322 else
16323 V = V1InUse ? V1 : V2;
16324
16325 // Cast the result back to the correct type.
16326 return DAG.getBitcast(VT, V);
16327}
16328
16329/// Generic lowering of 8-lane i16 shuffles.
16330///
16331/// This handles both single-input shuffles and combined shuffle/blends with
16332/// two inputs. The single input shuffles are immediately delegated to
16333/// a dedicated lowering routine.
16334///
16335/// The blends are lowered in one of three fundamental ways. If there are few
16336/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
16337/// of the input is significantly cheaper when lowered as an interleaving of
16338/// the two inputs, try to interleave them. Otherwise, blend the low and high
16339/// halves of the inputs separately (making them have relatively few inputs)
16340/// and then concatenate them.
16341static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16342 const APInt &Zeroable, SDValue V1, SDValue V2,
16343 const X86Subtarget &Subtarget,
16344 SelectionDAG &DAG) {
16345 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
16346 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
16347 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16348
16349 // Whenever we can lower this as a zext, that instruction is strictly faster
16350 // than any alternative.
16351 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask,
16352 Zeroable, Subtarget, DAG))
16353 return ZExt;
16354
16355 // Try to lower using a truncation.
16356 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
16357 Subtarget, DAG))
16358 return V;
16359
16360 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
16361
16362 if (NumV2Inputs == 0) {
16363 // Try to use shift instructions.
16364 if (SDValue Shift =
16365 lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, Zeroable,
16366 Subtarget, DAG, /*BitwiseOnly*/ false))
16367 return Shift;
16368
16369 // Check for being able to broadcast a single element.
16370 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
16371 Mask, Subtarget, DAG))
16372 return Broadcast;
16373
16374 // Try to use bit rotation instructions.
16375 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask,
16376 Subtarget, DAG))
16377 return Rotate;
16378
16379 // Use dedicated unpack instructions for masks that match their pattern.
16380 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
16381 return V;
16382
16383 // Use dedicated pack instructions for masks that match their pattern.
16384 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
16385 Subtarget))
16386 return V;
16387
16388 // Try to use byte rotation instructions.
16389 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask,
16390 Subtarget, DAG))
16391 return Rotate;
16392
16393 // Make a copy of the mask so it can be modified.
16394 SmallVector<int, 8> MutableMask(Mask);
16395 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask,
16396 Subtarget, DAG);
16397 }
16398
16399 assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
16400        "All single-input shuffles should be canonicalized to be V1-input "
16401        "shuffles.");
16402
16403 // Try to use shift instructions.
16404 if (SDValue Shift =
16405 lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget,
16406 DAG, /*BitwiseOnly*/ false))
16407 return Shift;
16408
16409 // See if we can use SSE4A Extraction / Insertion.
16410 if (Subtarget.hasSSE4A())
16411 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
16412 Zeroable, DAG))
16413 return V;
16414
16415 // There are special ways we can lower some single-element blends.
16416 if (NumV2Inputs == 1)
16417 if (SDValue V = lowerShuffleAsElementInsertion(
16418 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
16419 return V;
16420
16421 // We have different paths for blend lowering, but they all must use the
16422 // *exact* same predicate.
16423 bool IsBlendSupported = Subtarget.hasSSE41();
16424 if (IsBlendSupported)
16425 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
16426 Zeroable, Subtarget, DAG))
16427 return Blend;
16428
16429 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
16430 Zeroable, Subtarget, DAG))
16431 return Masked;
16432
16433 // Use dedicated unpack instructions for masks that match their pattern.
16434 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
16435 return V;
16436
16437 // Use dedicated pack instructions for masks that match their pattern.
16438 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
16439 Subtarget))
16440 return V;
16441
16442 // Try to lower using a truncation.
16443 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
16444 Subtarget, DAG))
16445 return V;
16446
16447 // Try to use byte rotation instructions.
16448 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
16449 Subtarget, DAG))
16450 return Rotate;
16451
16452 if (SDValue BitBlend =
16453 lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
16454 return BitBlend;
16455
16456 // Try to use byte shift instructions to mask.
16457 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v8i16, V1, V2, Mask,
16458 Zeroable, Subtarget, DAG))
16459 return V;
16460
16461 // Attempt to lower using compaction; SSE41 is necessary for PACKUSDW.
16462 // We could use SIGN_EXTEND_INREG+PACKSSDW for older targets but this seems to
16463 // be slower than a PSHUFLW+PSHUFHW+PSHUFD chain.
16464 int NumEvenDrops = canLowerByDroppingElements(Mask, true, false);
16465 if ((NumEvenDrops == 1 || NumEvenDrops == 2) && Subtarget.hasSSE41() &&
16466 !Subtarget.hasVLX()) {
16467 // Check if this is part of a 256-bit vector truncation.
16468 if (NumEvenDrops == 2 && Subtarget.hasAVX2() &&
16469 peekThroughBitcasts(V1).getOpcode() == ISD::EXTRACT_SUBVECTOR &&
16470 peekThroughBitcasts(V2).getOpcode() == ISD::EXTRACT_SUBVECTOR) {
16471 SDValue V1V2 = concatSubVectors(V1, V2, DAG, DL);
16472 V1V2 = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1V2,
16473 getZeroVector(MVT::v16i16, Subtarget, DAG, DL),
16474 DAG.getTargetConstant(0xEE, DL, MVT::i8));
16475 V1V2 = DAG.getBitcast(MVT::v8i32, V1V2);
16476 V1 = extract128BitVector(V1V2, 0, DAG, DL);
16477 V2 = extract128BitVector(V1V2, 4, DAG, DL);
16478 } else {
16479 SmallVector<SDValue, 4> DWordClearOps(4,
16480 DAG.getConstant(0, DL, MVT::i32));
16481 for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
16482 DWordClearOps[i] = DAG.getConstant(0xFFFF, DL, MVT::i32);
16483 SDValue DWordClearMask =
16484 DAG.getBuildVector(MVT::v4i32, DL, DWordClearOps);
16485 V1 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V1),
16486 DWordClearMask);
16487 V2 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V2),
16488 DWordClearMask);
16489 }
16490 // Now pack things back together.
16491 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v8i16, V1, V2);
16492 if (NumEvenDrops == 2) {
16493 Result = DAG.getBitcast(MVT::v4i32, Result);
16494 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v8i16, Result, Result);
16495 }
16496 return Result;
16497 }
16498
16499 // When compacting odd (upper) elements, use PACKSS pre-SSE41.
16500 int NumOddDrops = canLowerByDroppingElements(Mask, false, false);
16501 if (NumOddDrops == 1) {
16502 bool HasSSE41 = Subtarget.hasSSE41();
16503 V1 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
16504 DAG.getBitcast(MVT::v4i32, V1),
16505 DAG.getTargetConstant(16, DL, MVT::i8));
16506 V2 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
16507 DAG.getBitcast(MVT::v4i32, V2),
16508 DAG.getTargetConstant(16, DL, MVT::i8));
16509 return DAG.getNode(HasSSE41 ? X86ISD::PACKUS : X86ISD::PACKSS, DL,
16510 MVT::v8i16, V1, V2);
16511 }
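// Note that without SSE41 there is no PACKUSDW, so the path above uses an
// arithmetic shift to keep each dword sign-extended into 16 bits and packs
// with PACKSSDW, which then cannot saturate.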
16512
16513 // Try to lower by permuting the inputs into an unpack instruction.
16514 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
16515 Mask, Subtarget, DAG))
16516 return Unpack;
16517
16518 // If we can't directly blend but can use PSHUFB, that will be better as it
16519 // can both shuffle and set up the inefficient blend.
16520 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
16521 bool V1InUse, V2InUse;
16522 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
16523 Zeroable, DAG, V1InUse, V2InUse);
16524 }
16525
16526 // We can always bit-blend if we have to, so the fallback strategy is to
16527 // decompose into single-input permutes and blends/unpacks.
16528 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i16, V1, V2,
16529 Mask, Subtarget, DAG);
16530}
16531
16532/// Lower 8-lane 16-bit floating point shuffles.
16533static SDValue lowerV8F16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16534 const APInt &Zeroable, SDValue V1, SDValue V2,
16535 const X86Subtarget &Subtarget,
16536 SelectionDAG &DAG) {
16537 assert(V1.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
16538 assert(V2.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
16539 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16540 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
16541
16542 if (Subtarget.hasFP16()) {
16543 if (NumV2Elements == 0) {
16544 // Check for being able to broadcast a single element.
16545 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f16, V1, V2,
16546 Mask, Subtarget, DAG))
16547 return Broadcast;
16548 }
16549 if (NumV2Elements == 1 && Mask[0] >= 8)
16550 if (SDValue V = lowerShuffleAsElementInsertion(
16551 DL, MVT::v8f16, V1, V2, Mask, Zeroable, Subtarget, DAG))
16552 return V;
16553 }
16554
16555 V1 = DAG.getBitcast(MVT::v8i16, V1);
16556 V2 = DAG.getBitcast(MVT::v8i16, V2);
16557 return DAG.getBitcast(MVT::v8f16,
16558 DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
16559}
16560
16561 // Lowers a unary/binary shuffle as VPERMV/VPERMV3. For non-VLX targets,
16562 // sub-512-bit shuffles are padded to 512 bits for the shuffle and then
16563 // the active subvector is extracted.
16564static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
16565 ArrayRef<int> Mask, SDValue V1, SDValue V2,
16566 const X86Subtarget &Subtarget,
16567 SelectionDAG &DAG) {
16568 MVT MaskVT = VT.changeTypeToInteger();
16569 SDValue MaskNode;
16570 MVT ShuffleVT = VT;
16571 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
16572 V1 = widenSubVector(V1, false, Subtarget, DAG, DL, 512);
16573 V2 = widenSubVector(V2, false, Subtarget, DAG, DL, 512);
16574 ShuffleVT = V1.getSimpleValueType();
16575
16576 // Adjust mask to correct indices for the second input.
16577 int NumElts = VT.getVectorNumElements();
16578 unsigned Scale = 512 / VT.getSizeInBits();
16579 SmallVector<int, 32> AdjustedMask(Mask);
16580 for (int &M : AdjustedMask)
16581 if (NumElts <= M)
16582 M += (Scale - 1) * NumElts;
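// E.g. a v16i8 shuffle widened to v64i8 has Scale = 4, so indices referring
// to V2 (>= 16) are bumped by 48 to address the widened second operand.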
16583 MaskNode = getConstVector(AdjustedMask, MaskVT, DAG, DL, true);
16584 MaskNode = widenSubVector(MaskNode, false, Subtarget, DAG, DL, 512);
16585 } else {
16586 MaskNode = getConstVector(Mask, MaskVT, DAG, DL, true);
16587 }
16588
16589 SDValue Result;
16590 if (V2.isUndef())
16591 Result = DAG.getNode(X86ISD::VPERMV, DL, ShuffleVT, MaskNode, V1);
16592 else
16593 Result = DAG.getNode(X86ISD::VPERMV3, DL, ShuffleVT, V1, MaskNode, V2);
16594
16595 if (VT != ShuffleVT)
16596 Result = extractSubVector(Result, 0, DAG, DL, VT.getSizeInBits());
16597
16598 return Result;
16599}
16600
16601/// Generic lowering of v16i8 shuffles.
16602///
16603/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
16604/// detect any complexity reducing interleaving. If that doesn't help, it uses
16605/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
16606/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
16607/// back together.
16608static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16609 const APInt &Zeroable, SDValue V1, SDValue V2,
16610 const X86Subtarget &Subtarget,
16611 SelectionDAG &DAG) {
16612 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
16613 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
16614 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
16615
16616 // Try to use shift instructions.
16617 if (SDValue Shift =
16618 lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget,
16619 DAG, /*BitwiseOnly*/ false))
16620 return Shift;
16621
16622 // Try to use byte rotation instructions.
16623 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask,
16624 Subtarget, DAG))
16625 return Rotate;
16626
16627 // Use dedicated pack instructions for masks that match their pattern.
16628 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG,
16629 Subtarget))
16630 return V;
16631
16632 // Try to use a zext lowering.
16633 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask,
16634 Zeroable, Subtarget, DAG))
16635 return ZExt;
16636
16637 // Try to lower using a truncation.
16638 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
16639 Subtarget, DAG))
16640 return V;
16641
16642 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
16643 Subtarget, DAG))
16644 return V;
16645
16646 // See if we can use SSE4A Extraction / Insertion.
16647 if (Subtarget.hasSSE4A())
16648 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
16649 Zeroable, DAG))
16650 return V;
16651
16652 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
16653
16654 // For single-input shuffles, there are some nicer lowering tricks we can use.
16655 if (NumV2Elements == 0) {
16656 // Check for being able to broadcast a single element.
16657 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2,
16658 Mask, Subtarget, DAG))
16659 return Broadcast;
16660
16661 // Try to use bit rotation instructions.
16662 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask,
16663 Subtarget, DAG))
16664 return Rotate;
16665
16666 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
16667 return V;
16668
16669 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
16670 // Notably, this handles splat and partial-splat shuffles more efficiently.
16671 // However, it only makes sense if the pre-duplication shuffle simplifies
16672 // things significantly. Currently, this means we need to be able to
16673 // express the pre-duplication shuffle as an i16 shuffle.
16674 //
16675 // FIXME: We should check for other patterns which can be widened into an
16676 // i16 shuffle as well.
16677 auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
16678 for (int i = 0; i < 16; i += 2)
16679 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
16680 return false;
16681
16682 return true;
16683 };
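// E.g. [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] passes since every byte pair
// duplicates a single source byte, while any pair referencing two different
// source bytes (say [0,1,...]) fails immediately.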
16684 auto tryToWidenViaDuplication = [&]() -> SDValue {
16685 if (!canWidenViaDuplication(Mask))
16686 return SDValue();
16687 SmallVector<int, 4> LoInputs;
16688 copy_if(Mask, std::back_inserter(LoInputs),
16689 [](int M) { return M >= 0 && M < 8; });
16690 array_pod_sort(LoInputs.begin(), LoInputs.end());
16691 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
16692 LoInputs.end());
16693 SmallVector<int, 4> HiInputs;
16694 copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
16695 array_pod_sort(HiInputs.begin(), HiInputs.end());
16696 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
16697 HiInputs.end());
16698
16699 bool TargetLo = LoInputs.size() >= HiInputs.size();
16700 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
16701 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
16702
16703 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
16704 SmallDenseMap<int, int, 8> LaneMap;
16705 for (int I : InPlaceInputs) {
16706 PreDupI16Shuffle[I/2] = I/2;
16707 LaneMap[I] = I;
16708 }
16709 int j = TargetLo ? 0 : 4, je = j + 4;
16710 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
16711 // Check if j is already a shuffle of this input. This happens when
16712 // there are two adjacent bytes after we move the low one.
16713 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
16714 // If we haven't yet mapped the input, search for a slot into which
16715 // we can map it.
16716 while (j < je && PreDupI16Shuffle[j] >= 0)
16717 ++j;
16718
16719 if (j == je)
16720 // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
16721 return SDValue();
16722
16723 // Map this input with the i16 shuffle.
16724 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
16725 }
16726
16727 // Update the lane map based on the mapping we ended up with.
16728 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
16729 }
16730 V1 = DAG.getBitcast(
16731 MVT::v16i8,
16732 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
16733 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
16734
16735 // Unpack the bytes to form the i16s that will be shuffled into place.
16736 bool EvenInUse = false, OddInUse = false;
16737 for (int i = 0; i < 16; i += 2) {
16738 EvenInUse |= (Mask[i + 0] >= 0);
16739 OddInUse |= (Mask[i + 1] >= 0);
16740 if (EvenInUse && OddInUse)
16741 break;
16742 }
16743 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
16744 MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8),
16745 OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8));
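// Unpacking the chosen half of V1 with itself duplicates each byte into both
// bytes of a 16-bit lane (or pairs it with undef when only the even or odd
// bytes are needed), which is what makes the following i16 shuffle sufficient.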
16746
16747 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
16748 for (int i = 0; i < 16; ++i)
16749 if (Mask[i] >= 0) {
16750 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
16751 assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
16752 if (PostDupI16Shuffle[i / 2] < 0)
16753 PostDupI16Shuffle[i / 2] = MappedMask;
16754 else
16755 assert(PostDupI16Shuffle[i / 2] == MappedMask &&
16756        "Conflicting entries in the original shuffle!");
16757 }
16758 return DAG.getBitcast(
16759 MVT::v16i8,
16760 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
16761 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
16762 };
16763 if (SDValue V = tryToWidenViaDuplication())
16764 return V;
16765 }
16766
16767 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
16768 Zeroable, Subtarget, DAG))
16769 return Masked;
16770
16771 // Use dedicated unpack instructions for masks that match their pattern.
16772 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
16773 return V;
16774
16775 // Try to use byte shift instructions to mask.
16776 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v16i8, V1, V2, Mask,
16777 Zeroable, Subtarget, DAG))
16778 return V;
16779
16780 // Check for compaction patterns.
16781 bool IsSingleInput = V2.isUndef();
16782 int NumEvenDrops = canLowerByDroppingElements(Mask, true, IsSingleInput);
16783
16784 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
16785 // with PSHUFB. It is important to do this before we attempt to generate any
16786 // blends but after all of the single-input lowerings. If the single input
16787 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
16788 // want to preserve that and we can DAG combine any longer sequences into
16789 // a PSHUFB in the end. But once we start blending from multiple inputs,
16790 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
16791 // and there are *very* few patterns that would actually be faster than the
16792 // PSHUFB approach because of its ability to zero lanes.
16793 //
16794 // If the mask is a binary compaction, we can more efficiently perform this
16795 // as a PACKUS(AND(),AND()) - which is quicker than UNPACK(PSHUFB(),PSHUFB()).
16796 //
16797 // FIXME: The only exceptions to the above are blends which are exact
16798 // interleavings with direct instructions supporting them. We currently don't
16799 // handle those well here.
16800 if (Subtarget.hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) {
16801 bool V1InUse = false;
16802 bool V2InUse = false;
16803
16804 SDValue PSHUFB = lowerShuffleAsBlendOfPSHUFBs(
16805 DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
16806
16807 // If both V1 and V2 are in use and we can use a direct blend or an unpack,
16808 // do so. This avoids using them to handle blends-with-zero which is
16809 // important as a single pshufb is significantly faster for that.
16810 if (V1InUse && V2InUse) {
16811 if (Subtarget.hasSSE41())
16812 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,
16813 Zeroable, Subtarget, DAG))
16814 return Blend;
16815
16816 // We can use an unpack to do the blending rather than an or in some
16817 // cases. Even though the OR may be (very slightly) more efficient, we
16818 // prefer this lowering because there are common cases where part of
16819 // the complexity of the shuffles goes away when we do the final blend as
16820 // an unpack.
16821 // FIXME: It might be worth trying to detect if the unpack-feeding
16822 // shuffles will both be pshufb, in which case we shouldn't bother with
16823 // this.
16824 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(
16825 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
16826 return Unpack;
16827
16828 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
16829 if (Subtarget.hasVBMI())
16830 return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, Subtarget,
16831 DAG);
16832
16833 // If we have XOP we can use one VPPERM instead of multiple PSHUFBs.
16834 if (Subtarget.hasXOP()) {
16835 SDValue MaskNode = getConstVector(Mask, MVT::v16i8, DAG, DL, true);
16836 return DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, V1, V2, MaskNode);
16837 }
16838
16839 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
16840 // PALIGNR will be cheaper than the second PSHUFB+OR.
16841 if (SDValue V = lowerShuffleAsByteRotateAndPermute(
16842 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
16843 return V;
16844 }
16845
16846 return PSHUFB;
16847 }
16848
16849 // There are special ways we can lower some single-element blends.
16850 if (NumV2Elements == 1)
16851 if (SDValue V = lowerShuffleAsElementInsertion(
16852 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
16853 return V;
16854
16855 if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
16856 return Blend;
16857
16858 // Check whether a compaction lowering can be done. This handles shuffles
16859 // which take every Nth element for some even N. See the helper function for
16860 // details.
16861 //
16862 // We special case these as they can be particularly efficiently handled with
16863 // the PACKUSWB instruction on x86, and they show up in common patterns of
16864 // rearranging bytes to truncate wide elements.
16865 if (NumEvenDrops) {
16866 // NumEvenDrops is the log2 of the stride between the kept elements.
16867 // Another way of thinking about it is that we need to drop the even
16868 // elements this many times to get the original input.
16869
16870 // First we need to zero all the dropped bytes.
16871 assert(NumEvenDrops <= 3 &&
16872        "No support for dropping even elements more than 3 times.");
16873 SmallVector<SDValue, 8> WordClearOps(8, DAG.getConstant(0, DL, MVT::i16));
16874 for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
16875 WordClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i16);
16876 SDValue WordClearMask = DAG.getBuildVector(MVT::v8i16, DL, WordClearOps);
16877 V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V1),
16878 WordClearMask);
16879 if (!IsSingleInput)
16880 V2 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V2),
16881 WordClearMask);
16882
16883 // Now pack things back together.
16884 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
16885 IsSingleInput ? V1 : V2);
16886 for (int i = 1; i < NumEvenDrops; ++i) {
16887 Result = DAG.getBitcast(MVT::v8i16, Result);
16888 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
16889 }
16890 return Result;
16891 }
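// Illustrative example (hypothetical mask values, not from the surrounding
// code): for a two-input v16i8 shuffle with
// Mask = <0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30>, NumEvenDrops == 1.
// WordClearMask is then all-0x00FF, so each v8i16 word of V1 and V2 is ANDed
// to clear the dropped (odd) bytes, and a single PACKUS(AND(V1), AND(V2))
// produces the compacted result with no saturation.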
16892
16893 int NumOddDrops = canLowerByDroppingElements(Mask, false, IsSingleInput);
16894 if (NumOddDrops == 1) {
16895 V1 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
16896 DAG.getBitcast(MVT::v8i16, V1),
16897 DAG.getTargetConstant(8, DL, MVT::i8));
16898 if (!IsSingleInput)
16899 V2 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
16900 DAG.getBitcast(MVT::v8i16, V2),
16901 DAG.getTargetConstant(8, DL, MVT::i8));
16902 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
16903 IsSingleInput ? V1 : V2);
16904 }
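// Illustrative example (hypothetical mask values, not from the surrounding
// code): for Mask = <1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31>,
// NumOddDrops == 1. Each v8i16 word is shifted right by 8 (VSRLI), moving the
// odd byte of every word into the low byte position, and PACKUS(V1, V2) then
// packs the shifted words into the final v16i8 result.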
16905
16906 // Handle multi-input cases by blending/unpacking single-input shuffles.
16907 if (NumV2Elements > 0)
16908 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v16i8, V1, V2, Mask,
16909 Subtarget, DAG);
16910
16911 // The fallback path for single-input shuffles widens this into two v8i16
16912 // vectors with unpacks, shuffles those, and then pulls them back together
16913 // with a pack.
16914 SDValue V = V1;
16915
16916 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
16917 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
16918 for (int i = 0; i < 16; ++i)
16919 if (Mask[i] >= 0)
16920 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
16921
16922 SDValue VLoHalf, VHiHalf;
16923 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
16924 // them out and avoid using UNPCK{L,H} to extract the elements of V as
16925 // i16s.
16926 if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
16927 none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
16928 // Use a mask to drop the high bytes.
16929 VLoHalf = DAG.getBitcast(MVT::v8i16, V);
16930 VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
16931 DAG.getConstant(0x00FF, DL, MVT::v8i16));
16932
16933 // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
16934 VHiHalf = DAG.getUNDEF(MVT::v8i16);
16935
16936 // Squash the masks to point directly into VLoHalf.
16937 for (int &M : LoBlendMask)
16938 if (M >= 0)
16939 M /= 2;
16940 for (int &M : HiBlendMask)
16941 if (M >= 0)
16942 M /= 2;
16943 } else {
16944 // Otherwise just unpack the low half of V into VLoHalf and the high half into
16945 // VHiHalf so that we can blend them as i16s.
16946 SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
16947
16948 VLoHalf = DAG.getBitcast(
16949 MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
16950 VHiHalf = DAG.getBitcast(
16951 MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
16952 }
16953
16954 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
16955 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
16956
16957 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
16958}
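// Illustrative example (hypothetical mask values, not from the surrounding
// code): for a single-input Mask = <7,6,5,4,3,2,1,0,u,u,u,u,u,u,u,u>, odd
// source bytes are referenced, so VLoHalf/VHiHalf are built with
// UNPCKL/UNPCKH against zero (zero-extending bytes 0-7 and 8-15 to words).
// LoBlendMask becomes <7,6,5,4,3,2,1,0>, HiBlendMask stays undef, and the
// final PACKUS(LoV, HiV) narrows the shuffled words back to bytes.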
16959
16960/// Dispatching routine to lower various 128-bit x86 vector shuffles.
16961///
16962/// This routine breaks down the specific type of 128-bit shuffle and
16963/// dispatches to the lowering routines accordingly.
16964static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
16965 MVT VT, SDValue V1, SDValue V2,
16966 const APInt &Zeroable,
16967 const X86Subtarget &Subtarget,
16968 SelectionDAG &DAG) {
16969 switch (VT.SimpleTy) {
16970 case MVT::v2i64:
16971 return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16972 case MVT::v2f64:
16973 return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16974 case MVT::v4i32:
16975 return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16976 case MVT::v4f32:
16977 return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16978 case MVT::v8i16:
16979 return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16980 case MVT::v8f16:
16981 return lowerV8F16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16982 case MVT::v16i8:
16983 return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16984
16985 default:
16986 llvm_unreachable("Unimplemented!");
16987 }
16988}
16989
16990/// Generic routine to split vector shuffle into half-sized shuffles.
16991///
16992/// This routine just extracts two subvectors, shuffles them independently, and
16993/// then concatenates them back together. This should work effectively with all
16994/// AVX vector shuffle types.
16995static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
16996 SDValue V2, ArrayRef<int> Mask,
16997 SelectionDAG &DAG, bool SimpleOnly) {
16998 assert(VT.getSizeInBits() >= 256 &&
16999 "Only for 256-bit or wider vector shuffles!");
17000 assert(V1.getSimpleValueType() == VT && "Bad operand type!");
17001 assert(V2.getSimpleValueType() == VT && "Bad operand type!");
17002
17003 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
17004 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
17005
17006 int NumElements = VT.getVectorNumElements();
17007 int SplitNumElements = NumElements / 2;
17008 MVT ScalarVT = VT.getVectorElementType();
17009 MVT SplitVT = MVT::getVectorVT(ScalarVT, SplitNumElements);
17010
17011 // Use splitVector/extractSubVector so that split build-vectors just build two
17012 // narrower build vectors. This helps shuffling with splats and zeros.
17013 auto SplitVector = [&](SDValue V) {
17014 SDValue LoV, HiV;
17015 std::tie(LoV, HiV) = splitVector(peekThroughBitcasts(V), DAG, DL);
17016 return std::make_pair(DAG.getBitcast(SplitVT, LoV),
17017 DAG.getBitcast(SplitVT, HiV));
17018 };
17019
17020 SDValue LoV1, HiV1, LoV2, HiV2;
17021 std::tie(LoV1, HiV1) = SplitVector(V1);
17022 std::tie(LoV2, HiV2) = SplitVector(V2);
17023
17024 // Now create two 4-way blends of these half-width vectors.
17025 auto GetHalfBlendPiecesReq = [&](const ArrayRef<int> &HalfMask, bool &UseLoV1,
17026 bool &UseHiV1, bool &UseLoV2,
17027 bool &UseHiV2) {
17028 UseLoV1 = UseHiV1 = UseLoV2 = UseHiV2 = false;
17029 for (int i = 0; i < SplitNumElements; ++i) {
17030 int M = HalfMask[i];
17031 if (M >= NumElements) {
17032 if (M >= NumElements + SplitNumElements)
17033 UseHiV2 = true;
17034 else
17035 UseLoV2 = true;
17036 } else if (M >= 0) {
17037 if (M >= SplitNumElements)
17038 UseHiV1 = true;
17039 else
17040 UseLoV1 = true;
17041 }
17042 }
17043 };
17044
17045 auto CheckHalfBlendUsable = [&](const ArrayRef<int> &HalfMask) -> bool {
17046 if (!SimpleOnly)
17047 return true;
17048
17049 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
17050 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
17051
17052 return !(UseHiV1 || UseHiV2);
17053 };
17054
17055 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
17056 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
17057 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
17058 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
17059 for (int i = 0; i < SplitNumElements; ++i) {
17060 int M = HalfMask[i];
17061 if (M >= NumElements) {
17062 V2BlendMask[i] = M - NumElements;
17063 BlendMask[i] = SplitNumElements + i;
17064 } else if (M >= 0) {
17065 V1BlendMask[i] = M;
17066 BlendMask[i] = i;
17067 }
17068 }
17069
17070 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
17071 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
17072
17073 // Because the lowering happens after all combining takes place, we need to
17074 // manually combine these blend masks as much as possible so that we create
17075 // a minimal number of high-level vector shuffle nodes.
17076 assert((!SimpleOnly || (!UseHiV1 && !UseHiV2)) && "Shuffle isn't simple");
17077
17078 // First try just blending the halves of V1 or V2.
17079 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
17080 return DAG.getUNDEF(SplitVT);
17081 if (!UseLoV2 && !UseHiV2)
17082 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
17083 if (!UseLoV1 && !UseHiV1)
17084 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
17085
17086 SDValue V1Blend, V2Blend;
17087 if (UseLoV1 && UseHiV1) {
17088 V1Blend = DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
17089 } else {
17090 // We only use half of V1 so map the usage down into the final blend mask.
17091 V1Blend = UseLoV1 ? LoV1 : HiV1;
17092 for (int i = 0; i < SplitNumElements; ++i)
17093 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
17094 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
17095 }
17096 if (UseLoV2 && UseHiV2) {
17097 V2Blend = DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
17098 } else {
17099 // We only use half of V2 so map the usage down into the final blend mask.
17100 V2Blend = UseLoV2 ? LoV2 : HiV2;
17101 for (int i = 0; i < SplitNumElements; ++i)
17102 if (BlendMask[i] >= SplitNumElements)
17103 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
17104 }
17105 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
17106 };
17107
17108 if (!CheckHalfBlendUsable(LoMask) || !CheckHalfBlendUsable(HiMask))
17109 return SDValue();
17110
17111 SDValue Lo = HalfBlend(LoMask);
17112 SDValue Hi = HalfBlend(HiMask);
17113 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
17114}
17115
17116/// Either split a vector in halves or decompose the shuffles and the
17117/// blend/unpack.
17118///
17119/// This is provided as a good fallback for many lowerings of non-single-input
17120/// shuffles with more than one 128-bit lane. In those cases, we want to select
17121/// between splitting the shuffle into 128-bit components and stitching those
17122/// back together vs. extracting the single-input shuffles and blending those
17123/// results.
17124static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
17125 SDValue V2, ArrayRef<int> Mask,
17126 const X86Subtarget &Subtarget,
17127 SelectionDAG &DAG) {
17128 assert(!V2.isUndef() && "This routine must not be used to lower single-input "
17129 "shuffles as it could then recurse on itself.");
17130 int Size = Mask.size();
17131
17132 // If this can be modeled as a broadcast of two elements followed by a blend,
17133 // prefer that lowering. This is especially important because broadcasts can
17134 // often fold with memory operands.
17135 auto DoBothBroadcast = [&] {
17136 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
17137 for (int M : Mask)
17138 if (M >= Size) {
17139 if (V2BroadcastIdx < 0)
17140 V2BroadcastIdx = M - Size;
17141 else if (M - Size != V2BroadcastIdx)
17142 return false;
17143 } else if (M >= 0) {
17144 if (V1BroadcastIdx < 0)
17145 V1BroadcastIdx = M;
17146 else if (M != V1BroadcastIdx)
17147 return false;
17148 }
17149 return true;
17150 };
17151 if (DoBothBroadcast())
17152 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
17153 DAG);
17154
17155 // If the inputs all stem from a single 128-bit lane of each input, then we
17156 // split them rather than blending because the split will decompose to
17157 // unusually few instructions.
17158 int LaneCount = VT.getSizeInBits() / 128;
17159 int LaneSize = Size / LaneCount;
17160 SmallBitVector LaneInputs[2];
17161 LaneInputs[0].resize(LaneCount, false);
17162 LaneInputs[1].resize(LaneCount, false);
17163 for (int i = 0; i < Size; ++i)
17164 if (Mask[i] >= 0)
17165 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
17166 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
17167 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
17168 /*SimpleOnly*/ false);
17169
17170 // Otherwise, just fall back to decomposed shuffles and a blend/unpack. This
17171 // requires that the decomposed single-input shuffles don't end up here.
17172 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
17173 DAG);
17174}
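// Illustrative example (hypothetical v8f32 mask, not from the surrounding
// code): Mask = <2,10,2,10,2,10,2,10> reads a single element from each input
// (element 2 of V1 and element 2 of V2), so DoBothBroadcast() returns true and
// the shuffle is lowered as two broadcasts followed by a blend rather than
// being split.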
17175
17176// Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
17177// TODO: Extend to support v8f32 (+ 512-bit shuffles).
17178static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT,
17179 SDValue V1, SDValue V2,
17180 ArrayRef<int> Mask,
17181 SelectionDAG &DAG) {
17182 assert(VT == MVT::v4f64 && "Only for v4f64 shuffles");
17183
17184 int LHSMask[4] = {-1, -1, -1, -1};
17185 int RHSMask[4] = {-1, -1, -1, -1};
17186 unsigned SHUFPMask = 0;
17187
17188 // As SHUFPD uses a single LHS/RHS element per lane, we can always
17189 // perform the shuffle once the lanes have been shuffled in place.
17190 for (int i = 0; i != 4; ++i) {
17191 int M = Mask[i];
17192 if (M < 0)
17193 continue;
17194 int LaneBase = i & ~1;
17195 auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
17196 LaneMask[LaneBase + (M & 1)] = M;
17197 SHUFPMask |= (M & 1) << i;
17198 }
17199
17200 SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask);
17201 SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask);
17202 return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS,
17203 DAG.getTargetConstant(SHUFPMask, DL, MVT::i8));
17204}
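// Illustrative example (hypothetical v4f64 mask, not from the surrounding
// code): for Mask = <2,4,1,7>, the loop builds LHSMask = <2,u,u,1>,
// RHSMask = <4,u,u,7> and SHUFPMask = 0b1100. The two vector_shuffles move the
// required elements into the correct 128-bit lanes, and SHUFPD with immediate
// 0xC then picks <LHS[0], RHS[0], LHS[3], RHS[3]>, i.e. the original
// <2,4,1,7>.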
17205
17206/// Lower a vector shuffle crossing multiple 128-bit lanes as
17207/// a lane permutation followed by a per-lane permutation.
17208///
17209/// This is mainly for cases where we can have non-repeating permutes
17210/// in each lane.
17211///
17212/// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask,
17213/// we should investigate merging them.
17214static SDValue lowerShuffleAsLanePermuteAndPermute(
17215 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
17216 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
17217 int NumElts = VT.getVectorNumElements();
17218 int NumLanes = VT.getSizeInBits() / 128;
17219 int NumEltsPerLane = NumElts / NumLanes;
17220 bool CanUseSublanes = Subtarget.hasAVX2() && V2.isUndef();
17221
17222 /// Attempts to find a sublane permute with the given size
17223 /// that gets all elements into their target lanes.
17224 ///
17225 /// If successful, fills CrossLaneMask and InLaneMask and returns the lowered
17226 /// shuffle; if unsuccessful, returns an empty SDValue and may overwrite InLaneMask.
17227 auto getSublanePermute = [&](int NumSublanes) -> SDValue {
17228 int NumSublanesPerLane = NumSublanes / NumLanes;
17229 int NumEltsPerSublane = NumElts / NumSublanes;
17230
17231 SmallVector<int, 16> CrossLaneMask;
17232 SmallVector<int, 16> InLaneMask(NumElts, SM_SentinelUndef);
17233 // CrossLaneMask but one entry == one sublane.
17234 SmallVector<int, 16> CrossLaneMaskLarge(NumSublanes, SM_SentinelUndef);
17235
17236 for (int i = 0; i != NumElts; ++i) {
17237 int M = Mask[i];
17238 if (M < 0)
17239 continue;
17240
17241 int SrcSublane = M / NumEltsPerSublane;
17242 int DstLane = i / NumEltsPerLane;
17243
17244 // We only need to get the elements into the right lane, not sublane.
17245 // So search all sublanes that make up the destination lane.
17246 bool Found = false;
17247 int DstSubStart = DstLane * NumSublanesPerLane;
17248 int DstSubEnd = DstSubStart + NumSublanesPerLane;
17249 for (int DstSublane = DstSubStart; DstSublane < DstSubEnd; ++DstSublane) {
17250 if (!isUndefOrEqual(CrossLaneMaskLarge[DstSublane], SrcSublane))
17251 continue;
17252
17253 Found = true;
17254 CrossLaneMaskLarge[DstSublane] = SrcSublane;
17255 int DstSublaneOffset = DstSublane * NumEltsPerSublane;
17256 InLaneMask[i] = DstSublaneOffset + M % NumEltsPerSublane;
17257 break;
17258 }
17259 if (!Found)
17260 return SDValue();
17261 }
17262
17263 // Fill CrossLaneMask using CrossLaneMaskLarge.
17264 narrowShuffleMaskElts(NumEltsPerSublane, CrossLaneMaskLarge, CrossLaneMask);
17265
17266 if (!CanUseSublanes) {
17267 // If we're only shuffling a single lowest lane and the rest are identity
17268 // then don't bother.
17269 // TODO - isShuffleMaskInputInPlace could be extended to something like
17270 // this.
17271 int NumIdentityLanes = 0;
17272 bool OnlyShuffleLowestLane = true;
17273 for (int i = 0; i != NumLanes; ++i) {
17274 int LaneOffset = i * NumEltsPerLane;
17275 if (isSequentialOrUndefInRange(InLaneMask, LaneOffset, NumEltsPerLane,
17276 i * NumEltsPerLane))
17277 NumIdentityLanes++;
17278 else if (CrossLaneMask[LaneOffset] != 0)
17279 OnlyShuffleLowestLane = false;
17280 }
17281 if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
17282 return SDValue();
17283 }
17284
17285 // Avoid returning the same shuffle operation. For example,
17286 // t7: v16i16 = vector_shuffle<8,9,10,11,4,5,6,7,0,1,2,3,12,13,14,15> t5,
17287 // undef:v16i16
17288 if (CrossLaneMask == Mask || InLaneMask == Mask)
17289 return SDValue();
17290
17291 SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask);
17292 return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT),
17293 InLaneMask);
17294 };
17295
17296 // First attempt a solution with full lanes.
17297 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes))
17298 return V;
17299
17300 // The rest of the solutions use sublanes.
17301 if (!CanUseSublanes)
17302 return SDValue();
17303
17304 // Then attempt a solution with 64-bit sublanes (vpermq).
17305 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes * 2))
17306 return V;
17307
17308 // If that doesn't work and we have fast variable cross-lane shuffle,
17309 // attempt 32-bit sublanes (vpermd).
17310 if (!Subtarget.hasFastVariableCrossLaneShuffle())
17311 return SDValue();
17312
17313 return getSublanePermute(/*NumSublanes=*/NumLanes * 4);
17314}
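// Illustrative example (hypothetical v8f32 mask on AVX2, not from the
// surrounding code): for a unary Mask = <7,6,5,4,3,2,1,0>, the full-lane
// attempt yields CrossLaneMask = <4,5,6,7,0,1,2,3> (swap the 128-bit lanes)
// and InLaneMask = <3,2,1,0,7,6,5,4> (reverse within each lane), so the
// shuffle is lowered as a lane permute followed by an in-lane permute.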
17315
17316/// Helper to compute an in-lane shuffle mask for a complete shuffle mask.
17317static void computeInLaneShuffleMask(const ArrayRef<int> &Mask, int LaneSize,
17318 SmallVector<int> &InLaneMask) {
17319 int Size = Mask.size();
17320 InLaneMask.assign(Mask.begin(), Mask.end());
17321 for (int i = 0; i < Size; ++i) {
17322 int &M = InLaneMask[i];
17323 if (M < 0)
17324 continue;
17325 if (((M % Size) / LaneSize) != (i / LaneSize))
17326 M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
17327 }
17328}
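// Illustrative example (hypothetical v4f64 mask, not from the surrounding
// code): with Mask = <3,2,1,0> and LaneSize = 2, every element crosses a
// 128-bit lane, so each entry is rewritten to reference the second shuffle
// operand at its in-lane position: InLaneMask = <5,4,7,6>. The caller feeds a
// lane-flipped copy of V1 as that second operand, making the final shuffle
// purely in-lane.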
17329
17330/// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
17331/// source with a lane permutation.
17332///
17333/// This lowering strategy results in four instructions in the worst case for a
17334/// single-input cross-lane shuffle, which is fewer than any other fully general
17335/// cross-lane shuffle strategy I'm aware of. Special cases for each particular
17336/// shuffle pattern should be handled prior to trying this lowering.
17337static SDValue lowerShuffleAsLanePermuteAndShuffle(
17338 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
17339 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
17340 // FIXME: This should probably be generalized for 512-bit vectors as well.
17341 assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
17342 int Size = Mask.size();
17343 int LaneSize = Size / 2;
17344
17345 // Fold to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
17346 // Only do this if the elements aren't all from the lower lane,
17347 // otherwise we're (probably) better off doing a split.
17348 if (VT == MVT::v4f64 &&
17349 !all_of(Mask, [LaneSize](int M) { return M < LaneSize; }))
17350 return lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG);
17351
17352 // If there are only inputs from one 128-bit lane, splitting will in fact be
17353 // less expensive. The flags track whether the given lane contains an element
17354 // that crosses to another lane.
17355 bool AllLanes;
17356 if (!Subtarget.hasAVX2()) {
17357 bool LaneCrossing[2] = {false, false};
17358 for (int i = 0; i < Size; ++i)
17359 if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
17360 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
17361 AllLanes = LaneCrossing[0] && LaneCrossing[1];
17362 } else {
17363 bool LaneUsed[2] = {false, false};
17364 for (int i = 0; i < Size; ++i)
17365 if (Mask[i] >= 0)
17366 LaneUsed[(Mask[i] % Size) / LaneSize] = true;
17367 AllLanes = LaneUsed[0] && LaneUsed[1];
17368 }
17369
17370 // TODO - we could support shuffling V2 in the Flipped input.
17371 assert(V2.isUndef() &&
17372 "This last part of this routine only works on single input shuffles");
17373
17374 SmallVector<int> InLaneMask;
17375 computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
17376
17377 assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&
17378 "In-lane shuffle mask expected");
17379
17380 // If we're not using both lanes in each lane and the inlane mask is not
17381 // repeating, then we're better off splitting.
17382 if (!AllLanes && !is128BitLaneRepeatedShuffleMask(VT, InLaneMask))
17383 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
17384 /*SimpleOnly*/ false);
17385
17386 // Flip the lanes, and shuffle the results which should now be in-lane.
17387 MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
17388 SDValue Flipped = DAG.getBitcast(PVT, V1);
17389 Flipped =
17390 DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), {2, 3, 0, 1});
17391 Flipped = DAG.getBitcast(VT, Flipped);
17392 return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask);
17393}
17394
17395/// Handle lowering 2-lane 128-bit shuffles.
17396static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
17397 SDValue V2, ArrayRef<int> Mask,
17398 const APInt &Zeroable,
17399 const X86Subtarget &Subtarget,
17400 SelectionDAG &DAG) {
17401 if (V2.isUndef()) {
17402 // Attempt to match VBROADCAST*128 subvector broadcast load.
17403 bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1);
17404 bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1);
17405 if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() && V1.hasOneUse() &&
17406 X86::mayFoldLoad(peekThroughOneUseBitcasts(V1), Subtarget)) {
17407 MVT MemVT = VT.getHalfNumVectorElementsVT();
17408 unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
17409 auto *Ld = cast<LoadSDNode>(peekThroughOneUseBitcasts(V1));
17410 if (SDValue BcstLd = getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, DL,
17411 VT, MemVT, Ld, Ofs, DAG))
17412 return BcstLd;
17413 }
17414
17415 // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
17416 if (Subtarget.hasAVX2())
17417 return SDValue();
17418 }
17419
17420 bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());
17421
17422 SmallVector<int, 4> WidenedMask;
17423 if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask))
17424 return SDValue();
17425
17426 bool IsLowZero = (Zeroable & 0x3) == 0x3;
17427 bool IsHighZero = (Zeroable & 0xc) == 0xc;
17428
17429 // Try to use an insert into a zero vector.
17430 if (WidenedMask[0] == 0 && IsHighZero) {
17431 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
17432 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
17433 DAG.getIntPtrConstant(0, DL));
17434 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
17435 getZeroVector(VT, Subtarget, DAG, DL), LoV,
17436 DAG.getIntPtrConstant(0, DL));
17437 }
17438
17439 // TODO: If minimizing size and one of the inputs is a zero vector and the
17440 // zero vector has only one use, we could use a VPERM2X128 to save the
17441 // instruction bytes needed to explicitly generate the zero vector.
17442
17443 // Blends are faster and handle all the non-lane-crossing cases.
17444 if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
17445 Subtarget, DAG))
17446 return Blend;
17447
17448 // If either input operand is a zero vector, use VPERM2X128 because its mask
17449 // allows us to replace the zero input with an implicit zero.
17450 if (!IsLowZero && !IsHighZero) {
17451 // Check for patterns which can be matched with a single insert of a 128-bit
17452 // subvector.
17453 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2);
17454 if (OnlyUsesV1 || isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2)) {
17455
17456 // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
17457 // this will likely become vinsertf128 which can't fold a 256-bit memop.
17458 if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
17459 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
17460 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
17461 OnlyUsesV1 ? V1 : V2,
17462 DAG.getIntPtrConstant(0, DL));
17463 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
17464 DAG.getIntPtrConstant(2, DL));
17465 }
17466 }
17467
17468 // Try to use SHUF128 if possible.
17469 if (Subtarget.hasVLX()) {
17470 if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
17471 unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
17472 ((WidenedMask[1] % 2) << 1);
17473 return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
17474 DAG.getTargetConstant(PermMask, DL, MVT::i8));
17475 }
17476 }
17477 }
17478
17479 // Otherwise form a 128-bit permutation. After accounting for undefs,
17480 // convert the 64-bit shuffle mask selection values into 128-bit
17481 // selection bits by dividing the indexes by 2 and shifting into positions
17482 // defined by a vperm2*128 instruction's immediate control byte.
17483
17484 // The immediate permute control byte looks like this:
17485 // [1:0] - select 128 bits from sources for low half of destination
17486 // [2] - ignore
17487 // [3] - zero low half of destination
17488 // [5:4] - select 128 bits from sources for high half of destination
17489 // [6] - ignore
17490 // [7] - zero high half of destination
17491
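// Illustrative example (hypothetical widened mask, not from the surrounding
// code): for WidenedMask = <1,2> with neither half zeroable, PermMask becomes
// (1 << 0) | (2 << 4) = 0x21, i.e. the low 128 bits come from the high half
// of V1 and the high 128 bits from the low half of V2. If the high half were
// zeroable, the code would set bit 7 (0x80) for that half instead of a source
// selection.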
17492 assert((WidenedMask[0] >= 0 || IsLowZero) &&
17493 (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");
17494
17495 unsigned PermMask = 0;
17496 PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
17497 PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
17498
17499 // Check the immediate mask and replace unused sources with undef.
17500 if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
17501 V1 = DAG.getUNDEF(VT);
17502 if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
17503 V2 = DAG.getUNDEF(VT);
17504
17505 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
17506 DAG.getTargetConstant(PermMask, DL, MVT::i8));
17507}
17508
17509/// Lower a vector shuffle by first fixing the 128-bit lanes and then
17510/// shuffling each lane.
17511///
17512/// This attempts to create a repeated lane shuffle where each lane uses one
17513/// or two of the lanes of the inputs. The lanes of the input vectors are
17514/// shuffled in one or two independent shuffles to get the lanes into the
17515/// position needed by the final shuffle.
17516static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
17517 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
17518 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
17519 assert(!V2.isUndef() && "This is only useful with multiple inputs.");
17520
17521 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
17522 return SDValue();
17523
17524 int NumElts = Mask.size();
17525 int NumLanes = VT.getSizeInBits() / 128;
17526 int NumLaneElts = 128 / VT.getScalarSizeInBits();
17527 SmallVector<int, 16> RepeatMask(NumLaneElts, -1);
17528 SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});
17529
17530 // First pass will try to fill in the RepeatMask from lanes that need two
17531 // sources.
17532 for (int Lane = 0; Lane != NumLanes; ++Lane) {
17533 int Srcs[2] = {-1, -1};
17534 SmallVector<int, 16> InLaneMask(NumLaneElts, -1);
17535 for (int i = 0; i != NumLaneElts; ++i) {
17536 int M = Mask[(Lane * NumLaneElts) + i];
17537 if (M < 0)
17538 continue;
17539 // Determine which of the possible input lanes (NumLanes from each source)
17540 // this element comes from. Assign that as one of the sources for this
17541 // lane. We can assign up to 2 sources for this lane. If we run out of
17542 // sources we can't do anything.
17543 int LaneSrc = M / NumLaneElts;
17544 int Src;
17545 if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
17546 Src = 0;
17547 else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
17548 Src = 1;
17549 else
17550 return SDValue();
17551
17552 Srcs[Src] = LaneSrc;
17553 InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;
17554 }
17555
17556 // If this lane has two sources, see if it fits with the repeat mask so far.
17557 if (Srcs[1] < 0)
17558 continue;
17559
17560 LaneSrcs[Lane][0] = Srcs[0];
17561 LaneSrcs[Lane][1] = Srcs[1];
17562
17563 auto MatchMasks = [](ArrayRef<int> M1, ArrayRef<int> M2) {
17564 assert(M1.size() == M2.size() && "Unexpected mask size");
17565 for (int i = 0, e = M1.size(); i != e; ++i)
17566 if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
17567 return false;
17568 return true;
17569 };
17570
17571 auto MergeMasks = [](ArrayRef<int> Mask, MutableArrayRef<int> MergedMask) {
17572 assert(Mask.size() == MergedMask.size() && "Unexpected mask size");
17573 for (int i = 0, e = MergedMask.size(); i != e; ++i) {
17574 int M = Mask[i];
17575 if (M < 0)
17576 continue;
17577 assert((MergedMask[i] < 0 || MergedMask[i] == M) &&
17578 "Unexpected mask element");
17579 MergedMask[i] = M;
17580 }
17581 };
17582
17583 if (MatchMasks(InLaneMask, RepeatMask)) {
17584 // Merge this lane mask into the final repeat mask.
17585 MergeMasks(InLaneMask, RepeatMask);
17586 continue;
17587 }
17588
17589 // Didn't find a match. Swap the operands and try again.
17590 std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);
17591 ShuffleVectorSDNode::commuteMask(InLaneMask);
17592
17593 if (MatchMasks(InLaneMask, RepeatMask)) {
17594 // Merge this lane mask into the final repeat mask.
17595 MergeMasks(InLaneMask, RepeatMask);
17596 continue;
17597 }
17598
17599 // Couldn't find a match with the operands in either order.
17600 return SDValue();
17601 }
17602
17603 // Now handle any lanes with only one source.
17604 for (int Lane = 0; Lane != NumLanes; ++Lane) {
17605 // If this lane has already been processed, skip it.
17606 if (LaneSrcs[Lane][0] >= 0)
17607 continue;
17608
17609 for (int i = 0; i != NumLaneElts; ++i) {
17610 int M = Mask[(Lane * NumLaneElts) + i];
17611 if (M < 0)
17612 continue;
17613
17614 // If RepeatMask isn't defined yet we can define it ourself.
17615 if (RepeatMask[i] < 0)
17616 RepeatMask[i] = M % NumLaneElts;
17617
17618 if (RepeatMask[i] < NumElts) {
17619 if (RepeatMask[i] != M % NumLaneElts)
17620 return SDValue();
17621 LaneSrcs[Lane][0] = M / NumLaneElts;
17622 } else {
17623 if (RepeatMask[i] != ((M % NumLaneElts) + NumElts))
17624 return SDValue();
17625 LaneSrcs[Lane][1] = M / NumLaneElts;
17626 }
17627 }
17628
17629 if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
17630 return SDValue();
17631 }
17632
17633 SmallVector<int, 16> NewMask(NumElts, -1);
17634 for (int Lane = 0; Lane != NumLanes; ++Lane) {
17635 int Src = LaneSrcs[Lane][0];
17636 for (int i = 0; i != NumLaneElts; ++i) {
17637 int M = -1;
17638 if (Src >= 0)
17639 M = Src * NumLaneElts + i;
17640 NewMask[Lane * NumLaneElts + i] = M;
17641 }
17642 }
17643 SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
17644 // Ensure we didn't get back the shuffle we started with.
17645 // FIXME: This is a hack to make up for some splat handling code in
17646 // getVectorShuffle.
17647 if (isa<ShuffleVectorSDNode>(NewV1) &&
17648 cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)
17649 return SDValue();
17650
17651 for (int Lane = 0; Lane != NumLanes; ++Lane) {
17652 int Src = LaneSrcs[Lane][1];
17653 for (int i = 0; i != NumLaneElts; ++i) {
17654 int M = -1;
17655 if (Src >= 0)
17656 M = Src * NumLaneElts + i;
17657 NewMask[Lane * NumLaneElts + i] = M;
17658 }
17659 }
17660 SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
17661 // Ensure we didn't get back the shuffle we started with.
17662 // FIXME: This is a hack to make up for some splat handling code in
17663 // getVectorShuffle.
17664 if (isa<ShuffleVectorSDNode>(NewV2) &&
17665 cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
17666 return SDValue();
17667
17668 for (int i = 0; i != NumElts; ++i) {
17669 if (Mask[i] < 0) {
17670 NewMask[i] = -1;
17671 continue;
17672 }
17673 NewMask[i] = RepeatMask[i % NumLaneElts];
17674 if (NewMask[i] < 0)
17675 continue;
17676
17677 NewMask[i] += (i / NumLaneElts) * NumLaneElts;
17678 }
17679 return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
17680}
17681
17682/// If the input shuffle mask results in a vector that is undefined in all upper
17683/// or lower half elements and that mask accesses only 2 halves of the
17684/// shuffle's operands, return true. A mask of half the width with mask indexes
17685/// adjusted to access the extracted halves of the original shuffle operands is
17686/// returned in HalfMask. HalfIdx1 and HalfIdx2 return whether the upper or
17687/// lower half of each input operand is accessed.
17688static bool
17689getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask,
17690 int &HalfIdx1, int &HalfIdx2) {
17691 assert((Mask.size() == HalfMask.size() * 2) &&
17692 "Expected input mask to be twice as long as output");
17693
17694 // Exactly one half of the result must be undef to allow narrowing.
17695 bool UndefLower = isUndefLowerHalf(Mask);
17696 bool UndefUpper = isUndefUpperHalf(Mask);
17697 if (UndefLower == UndefUpper)
17698 return false;
17699
17700 unsigned HalfNumElts = HalfMask.size();
17701 unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0;
17702 HalfIdx1 = -1;
17703 HalfIdx2 = -1;
17704 for (unsigned i = 0; i != HalfNumElts; ++i) {
17705 int M = Mask[i + MaskIndexOffset];
17706 if (M < 0) {
17707 HalfMask[i] = M;
17708 continue;
17709 }
17710
17711 // Determine which of the 4 half vectors this element is from.
17712 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
17713 int HalfIdx = M / HalfNumElts;
17714
17715 // Determine the element index into its half vector source.
17716 int HalfElt = M % HalfNumElts;
17717
17718 // We can shuffle with up to 2 half vectors, set the new 'half'
17719 // shuffle mask accordingly.
17720 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
17721 HalfMask[i] = HalfElt;
17722 HalfIdx1 = HalfIdx;
17723 continue;
17724 }
17725 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
17726 HalfMask[i] = HalfElt + HalfNumElts;
17727 HalfIdx2 = HalfIdx;
17728 continue;
17729 }
17730
17731 // Too many half vectors referenced.
17732 return false;
17733 }
17734
17735 return true;
17736}
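// Illustrative example (hypothetical v8f32 mask, not from the surrounding
// code): for Mask = <u,u,u,u,0,1,12,13> the lower half of the result is
// undef, so only Mask[4..7] is inspected. Elements 0 and 1 come from half 0
// (lower V1) and 12 and 13 from half 3 (upper V2), giving HalfIdx1 = 0,
// HalfIdx2 = 3 and HalfMask = <0,1,4,5>, which getShuffleHalfVectors turns
// into an extract + v4f32 shuffle + insert into the upper half.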
17737
17738/// Given the output values from getHalfShuffleMask(), create a half width
17739/// shuffle of extracted vectors followed by an insert back to full width.
17740static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2,
17741 ArrayRef<int> HalfMask, int HalfIdx1,
17742 int HalfIdx2, bool UndefLower,
17743 SelectionDAG &DAG, bool UseConcat = false) {
17744 assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?");
17745 assert(V1.getValueType().isSimple() && "Expecting only simple types");
17746
17747 MVT VT = V1.getSimpleValueType();
17748 MVT HalfVT = VT.getHalfNumVectorElementsVT();
17749 unsigned HalfNumElts = HalfVT.getVectorNumElements();
17750
17751 auto getHalfVector = [&](int HalfIdx) {
17752 if (HalfIdx < 0)
17753 return DAG.getUNDEF(HalfVT);
17754 SDValue V = (HalfIdx < 2 ? V1 : V2);
17755 HalfIdx = (HalfIdx % 2) * HalfNumElts;
17756 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
17757 DAG.getIntPtrConstant(HalfIdx, DL));
17758 };
17759
17760 // ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset
17761 SDValue Half1 = getHalfVector(HalfIdx1);
17762 SDValue Half2 = getHalfVector(HalfIdx2);
17763 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
17764 if (UseConcat) {
17765 SDValue Op0 = V;
17766 SDValue Op1 = DAG.getUNDEF(HalfVT);
17767 if (UndefLower)
17768 std::swap(Op0, Op1);
17769 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Op0, Op1);
17770 }
17771
17772 unsigned Offset = UndefLower ? HalfNumElts : 0;
17773 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
17774 DAG.getIntPtrConstant(Offset, DL));
17775}
17776
17777/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
17778/// This allows for fast cases such as subvector extraction/insertion
17779/// or shuffling smaller vector types which can lower more efficiently.
17780static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
17781 SDValue V2, ArrayRef<int> Mask,
17782 const X86Subtarget &Subtarget,
17783 SelectionDAG &DAG) {
17784 assert((VT.is256BitVector() || VT.is512BitVector()) &&
17785 "Expected 256-bit or 512-bit vector");
17786
17787 bool UndefLower = isUndefLowerHalf(Mask);
17788 if (!UndefLower && !isUndefUpperHalf(Mask))
17789 return SDValue();
17790
17791 assert((!UndefLower || !isUndefUpperHalf(Mask)) &&
17792 "Completely undef shuffle mask should have been simplified already");
17793
17794 // Upper half is undef and lower half is whole upper subvector.
17795 // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
17796 MVT HalfVT = VT.getHalfNumVectorElementsVT();
17797 unsigned HalfNumElts = HalfVT.getVectorNumElements();
17798 if (!UndefLower &&
17799 isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
17800 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
17801 DAG.getIntPtrConstant(HalfNumElts, DL));
17802 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
17803 DAG.getIntPtrConstant(0, DL));
17804 }
17805
17806 // Lower half is undef and upper half is whole lower subvector.
17807 // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
17808 if (UndefLower &&
17809 isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
17810 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
17811 DAG.getIntPtrConstant(0, DL));
17812 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
17813 DAG.getIntPtrConstant(HalfNumElts, DL));
17814 }
17815
17816 int HalfIdx1, HalfIdx2;
17817 SmallVector<int, 8> HalfMask(HalfNumElts);
17818 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2))
17819 return SDValue();
17820
17821 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
17822
17823 // Only shuffle the halves of the inputs when useful.
17824 unsigned NumLowerHalves =
17825 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
17826 unsigned NumUpperHalves =
17827 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
17828 assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed");
17829
17830 // Determine the larger pattern of undef/halves, then decide if it's worth
17831 // splitting the shuffle based on subtarget capabilities and types.
17832 unsigned EltWidth = VT.getVectorElementType().getSizeInBits();
17833 if (!UndefLower) {
17834 // XXXXuuuu: no insert is needed.
17835 // Always extract lowers when setting lower - these are all free subreg ops.
17836 if (NumUpperHalves == 0)
17837 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
17838 UndefLower, DAG);
17839
17840 if (NumUpperHalves == 1) {
17841 // AVX2 has efficient 32/64-bit element cross-lane shuffles.
17842 if (Subtarget.hasAVX2()) {
17843 // extract128 + vunpckhps/vshufps, is better than vblend + vpermps.
17844 if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
17845 !is128BitUnpackShuffleMask(HalfMask, DAG) &&
17846 (!isSingleSHUFPSMask(HalfMask) ||
17847 Subtarget.hasFastVariableCrossLaneShuffle()))
17848 return SDValue();
17849 // If this is a unary shuffle (assume that the 2nd operand is
17850 // canonicalized to undef), then we can use vpermpd. Otherwise, we
17851 // are better off extracting the upper half of 1 operand and using a
17852 // narrow shuffle.
17853 if (EltWidth == 64 && V2.isUndef())
17854 return SDValue();
17855 }
17856 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
17857 if (Subtarget.hasAVX512() && VT.is512BitVector())
17858 return SDValue();
17859 // Extract + narrow shuffle is better than the wide alternative.
17860 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
17861 UndefLower, DAG);
17862 }
17863
17864 // Don't extract both uppers, instead shuffle and then extract.
17865 assert(NumUpperHalves == 2 && "Half vector count went wrong");
17866 return SDValue();
17867 }
17868
17869 // UndefLower - uuuuXXXX: an insert to high half is required if we split this.
17870 if (NumUpperHalves == 0) {
17871 // AVX2 has efficient 64-bit element cross-lane shuffles.
17872 // TODO: Refine to account for unary shuffle, splat, and other masks?
17873 if (Subtarget.hasAVX2() && EltWidth == 64)
17874 return SDValue();
17875 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
17876 if (Subtarget.hasAVX512() && VT.is512BitVector())
17877 return SDValue();
17878 // Narrow shuffle + insert is better than the wide alternative.
17879 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
17880 UndefLower, DAG);
17881 }
17882
17883 // NumUpperHalves != 0: don't bother with extract, shuffle, and then insert.
17884 return SDValue();
17885}
17886
17887/// Handle case where shuffle sources are coming from the same 128-bit lane and
17888/// every lane can be represented as the same repeating mask - allowing us to
17889/// shuffle the sources with the repeating shuffle and then permute the result
17890/// to the destination lanes.
17891static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
17892 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
17893 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
17894 int NumElts = VT.getVectorNumElements();
17895 int NumLanes = VT.getSizeInBits() / 128;
17896 int NumLaneElts = NumElts / NumLanes;
17897
17898 // On AVX2 we may be able to just shuffle the lowest elements and then
17899 // broadcast the result.
17900 if (Subtarget.hasAVX2()) {
17901 for (unsigned BroadcastSize : {16, 32, 64}) {
17902 if (BroadcastSize <= VT.getScalarSizeInBits())
17903 continue;
17904 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
17905
17906 // Attempt to match a repeating pattern every NumBroadcastElts,
17907 // accounting for UNDEFs but only references the lowest 128-bit
17908 // lane of the inputs.
17909 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
17910 for (int i = 0; i != NumElts; i += NumBroadcastElts)
17911 for (int j = 0; j != NumBroadcastElts; ++j) {
17912 int M = Mask[i + j];
17913 if (M < 0)
17914 continue;
17915 int &R = RepeatMask[j];
17916 if (0 != ((M % NumElts) / NumLaneElts))
17917 return false;
17918 if (0 <= R && R != M)
17919 return false;
17920 R = M;
17921 }
17922 return true;
17923 };
17924
17925 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
17926 if (!FindRepeatingBroadcastMask(RepeatMask))
17927 continue;
17928
17929 // Shuffle the (lowest) repeated elements in place for broadcast.
17930 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
17931
17932 // Shuffle the actual broadcast.
17933 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
17934 for (int i = 0; i != NumElts; i += NumBroadcastElts)
17935 for (int j = 0; j != NumBroadcastElts; ++j)
17936 BroadcastMask[i + j] = j;
17937 return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
17938 BroadcastMask);
17939 }
17940 }
17941
17942 // Bail if the shuffle mask doesn't cross 128-bit lanes.
17943 if (!is128BitLaneCrossingShuffleMask(VT, Mask))
17944 return SDValue();
17945
17946 // Bail if we already have a repeated lane shuffle mask.
17947 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
17948 return SDValue();
17949
17950 // Helper to look for repeated mask in each split sublane, and that those
17951 // sublanes can then be permuted into place.
17952 auto ShuffleSubLanes = [&](int SubLaneScale) {
17953 int NumSubLanes = NumLanes * SubLaneScale;
17954 int NumSubLaneElts = NumLaneElts / SubLaneScale;
17955
17956 // Check that all the sources are coming from the same lane and see if we
17957 // can form a repeating shuffle mask (local to each sub-lane). At the same
17958 // time, determine the source sub-lane for each destination sub-lane.
17959 int TopSrcSubLane = -1;
17960 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
17961 SmallVector<SmallVector<int, 8>> RepeatedSubLaneMasks(
17962 SubLaneScale,
17963 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef));
17964
17965 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
17966 // Extract the sub-lane mask, check that it all comes from the same lane
17967 // and normalize the mask entries to come from the first lane.
17968 int SrcLane = -1;
17969 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
17970 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
17971 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
17972 if (M < 0)
17973 continue;
17974 int Lane = (M % NumElts) / NumLaneElts;
17975 if ((0 <= SrcLane) && (SrcLane != Lane))
17976 return SDValue();
17977 SrcLane = Lane;
17978 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
17979 SubLaneMask[Elt] = LocalM;
17980 }
17981
17982 // Whole sub-lane is UNDEF.
17983 if (SrcLane < 0)
17984 continue;
17985
17986 // Attempt to match against the candidate repeated sub-lane masks.
17987 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
17988 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
17989 for (int i = 0; i != NumSubLaneElts; ++i) {
17990 if (M1[i] < 0 || M2[i] < 0)
17991 continue;
17992 if (M1[i] != M2[i])
17993 return false;
17994 }
17995 return true;
17996 };
17997
17998 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
17999 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
18000 continue;
18001
18002 // Merge the sub-lane mask into the matching repeated sub-lane mask.
18003 for (int i = 0; i != NumSubLaneElts; ++i) {
18004 int M = SubLaneMask[i];
18005 if (M < 0)
18006 continue;
18007 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
18008 "Unexpected mask element");
18009 RepeatedSubLaneMask[i] = M;
18010 }
18011
18012         // Track the topmost source sub-lane - by setting the remaining ones
18013         // to UNDEF we can greatly simplify shuffle matching.
18014 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
18015 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
18016 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
18017 break;
18018 }
18019
18020 // Bail if we failed to find a matching repeated sub-lane mask.
18021 if (Dst2SrcSubLanes[DstSubLane] < 0)
18022 return SDValue();
18023 }
18024     assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
18025            "Unexpected source lane");
18026
18027 // Create a repeating shuffle mask for the entire vector.
18028 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
18029 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
18030 int Lane = SubLane / SubLaneScale;
18031 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
18032 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
18033 int M = RepeatedSubLaneMask[Elt];
18034 if (M < 0)
18035 continue;
18036 int Idx = (SubLane * NumSubLaneElts) + Elt;
18037 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
18038 }
18039 }
18040
18041 // Shuffle each source sub-lane to its destination.
18042 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
18043 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
18044 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
18045 if (SrcSubLane < 0)
18046 continue;
18047 for (int j = 0; j != NumSubLaneElts; ++j)
18048 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
18049 }
18050
18051 // Avoid returning the same shuffle operation.
18052 // v8i32 = vector_shuffle<0,1,4,5,2,3,6,7> t5, undef:v8i32
18053 if (RepeatedMask == Mask || SubLaneMask == Mask)
18054 return SDValue();
18055
18056 SDValue RepeatedShuffle =
18057 DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
18058
18059 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
18060 SubLaneMask);
18061 };
18062
18063 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
18064 // (with PERMQ/PERMPD). On AVX2/AVX512BW targets, permuting 32-bit sub-lanes,
18065 // even with a variable shuffle, can be worth it for v32i8/v64i8 vectors.
18066 // Otherwise we can only permute whole 128-bit lanes.
18067 int MinSubLaneScale = 1, MaxSubLaneScale = 1;
18068 if (Subtarget.hasAVX2() && VT.is256BitVector()) {
18069 bool OnlyLowestElts = isUndefOrInRange(Mask, 0, NumLaneElts);
18070 MinSubLaneScale = 2;
18071 MaxSubLaneScale =
18072 (!OnlyLowestElts && V2.isUndef() && VT == MVT::v32i8) ? 4 : 2;
18073 }
18074 if (Subtarget.hasBWI() && VT == MVT::v64i8)
18075 MinSubLaneScale = MaxSubLaneScale = 4;
18076
18077 for (int Scale = MinSubLaneScale; Scale <= MaxSubLaneScale; Scale *= 2)
18078 if (SDValue Shuffle = ShuffleSubLanes(Scale))
18079 return Shuffle;
18080
18081 return SDValue();
18082}
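
To make the decomposition above concrete: the routine splits a lane-crossing shuffle into a mask that repeats the same pattern inside every sub-lane, followed by a mask that only moves whole sub-lanes. The following standalone sketch (plain C++, not part of X86ISelLowering.cpp; the v8f32-style mask values are a hand-picked example) composes the two stages and checks that the composition reproduces the original mask.

// Standalone illustration: a lane-crossing v8f32-style mask split into an
// in-lane "repeated" mask followed by a whole-sub-lane permute. Composing the
// two reproduces the original mask.
#include <cstdio>

int main() {
  const int Mask[8]         = {5, 4, 1, 0, 7, 6, 3, 2}; // crosses 128-bit lanes
  const int RepeatedMask[8] = {1, 0, 3, 2, 5, 4, 7, 6}; // same pattern per lane
  const int SubLaneMask[8]  = {4, 5, 0, 1, 6, 7, 2, 3}; // moves 64-bit sub-lanes

  for (int i = 0; i < 8; ++i) {
    int Composed = RepeatedMask[SubLaneMask[i]];
    std::printf("%d%s", Composed, Composed == Mask[i] ? " " : "! ");
  }
  std::printf("\n"); // prints the original mask: 5 4 1 0 7 6 3 2
  return 0;
}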
18083
18084static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
18085 bool &ForceV1Zero, bool &ForceV2Zero,
18086 unsigned &ShuffleImm, ArrayRef<int> Mask,
18087 const APInt &Zeroable) {
18088 int NumElts = VT.getVectorNumElements();
18089   assert(VT.getScalarSizeInBits() == 64 &&
18090          (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
18091          "Unexpected data type for VSHUFPD");
18092   assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) &&
18093          "Illegal shuffle mask");
18094
18095 bool ZeroLane[2] = { true, true };
18096 for (int i = 0; i < NumElts; ++i)
18097 ZeroLane[i & 1] &= Zeroable[i];
18098
18099 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
18100   // Mask for V4F64: 0/1, 4/5, 2/3, 6/7, ..
18101 ShuffleImm = 0;
18102 bool ShufpdMask = true;
18103 bool CommutableMask = true;
18104 for (int i = 0; i < NumElts; ++i) {
18105 if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1])
18106 continue;
18107 if (Mask[i] < 0)
18108 return false;
18109 int Val = (i & 6) + NumElts * (i & 1);
18110 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
18111 if (Mask[i] < Val || Mask[i] > Val + 1)
18112 ShufpdMask = false;
18113 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
18114 CommutableMask = false;
18115 ShuffleImm |= (Mask[i] % 2) << i;
18116 }
18117
18118 if (!ShufpdMask && !CommutableMask)
18119 return false;
18120
18121 if (!ShufpdMask && CommutableMask)
18122 std::swap(V1, V2);
18123
18124 ForceV1Zero = ZeroLane[0];
18125 ForceV2Zero = ZeroLane[1];
18126 return true;
18127}
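
As a worked example of the immediate construction in matchShuffleWithSHUFPD, here is a standalone sketch (plain C++, not LLVM code; the mask is an arbitrary example and the Zeroable/commutation handling is omitted): destination slot i may only pick from the element pair {Val, Val+1}, and bit i of the immediate selects the odd element of that pair.

// Standalone illustration: build the SHUFPD immediate for a v4f64-style mask
// the same way the per-element loop above does.
#include <cstdio>
#include <vector>

int main() {
  const int NumElts = 4;
  std::vector<int> Mask = {0, 5, 2, 7}; // V1[0], V2[1], V1[2], V2[3]
  unsigned ShuffleImm = 0;
  bool Valid = true;
  for (int i = 0; i < NumElts; ++i) {
    // SHUFPD can only pick from the pair {Val, Val+1} for destination slot i.
    int Val = (i & 6) + NumElts * (i & 1);
    if (Mask[i] < Val || Mask[i] > Val + 1)
      Valid = false;
    ShuffleImm |= (Mask[i] % 2) << i; // bit i selects the odd element of the pair
  }
  std::printf("valid=%d imm=0x%x\n", Valid, ShuffleImm); // valid=1 imm=0xa
  return 0;
}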
18128
18129static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1,
18130 SDValue V2, ArrayRef<int> Mask,
18131 const APInt &Zeroable,
18132 const X86Subtarget &Subtarget,
18133 SelectionDAG &DAG) {
18134   assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
18135          "Unexpected data type for VSHUFPD");
18136
18137 unsigned Immediate = 0;
18138 bool ForceV1Zero = false, ForceV2Zero = false;
18139 if (!matchShuffleWithSHUFPD(VT, V1, V2, ForceV1Zero, ForceV2Zero, Immediate,
18140 Mask, Zeroable))
18141 return SDValue();
18142
18143 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
18144 if (ForceV1Zero)
18145 V1 = getZeroVector(VT, Subtarget, DAG, DL);
18146 if (ForceV2Zero)
18147 V2 = getZeroVector(VT, Subtarget, DAG, DL);
18148
18149 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
18150 DAG.getTargetConstant(Immediate, DL, MVT::i8));
18151}
18152
18153// Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
18154// by zeroable elements in the remaining 24 elements. Turn this into two
18155// vmovqb instructions shuffled together.
18156static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT,
18157 SDValue V1, SDValue V2,
18158 ArrayRef<int> Mask,
18159 const APInt &Zeroable,
18160 SelectionDAG &DAG) {
18161   assert(VT == MVT::v32i8 && "Unexpected type!");
18162
18163 // The first 8 indices should be every 8th element.
18164 if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8))
18165 return SDValue();
18166
18167 // Remaining elements need to be zeroable.
18168 if (Zeroable.countl_one() < (Mask.size() - 8))
18169 return SDValue();
18170
18171 V1 = DAG.getBitcast(MVT::v4i64, V1);
18172 V2 = DAG.getBitcast(MVT::v4i64, V2);
18173
18174 V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1);
18175 V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2);
18176
18177 // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in
18178 // the upper bits of the result using an unpckldq.
18179 SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2,
18180 { 0, 1, 2, 3, 16, 17, 18, 19,
18181 4, 5, 6, 7, 20, 21, 22, 23 });
18182 // Insert the unpckldq into a zero vector to widen to v32i8.
18183 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8,
18184 DAG.getConstant(0, DL, MVT::v32i8), Unpack,
18185 DAG.getIntPtrConstant(0, DL));
18186}
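
To see why the fixed byte shuffle above recovers the {0, 8, 16, 24, 32, 40, 48, 56} pattern, the following standalone sketch (plain C++, not LLVM code) emulates the two truncations and the unpckldq-style interleave on numbered byte arrays; modelling VTRUNC as "keep the low byte of each 64-bit element, zero-fill the rest" is an assumption made for this illustration.

// Standalone illustration: emulate VTRUNC + unpckldq on plain bytes and check
// that the first 8 result bytes are the original indices {0,8,16,24,32,40,48,56}.
#include <array>
#include <cstdio>

int main() {
  // Two 32-byte inputs whose bytes are numbered 0..31 and 32..63, i.e. the two
  // halves of the conceptual 64-element concatenation of V1 and V2.
  std::array<unsigned char, 32> V1, V2;
  for (int i = 0; i < 32; ++i) { V1[i] = i; V2[i] = 32 + i; }

  // Assumed VTRUNC model: keep byte 0 of every 64-bit element (every 8th byte)
  // and zero-fill the remainder of the 16-byte result.
  std::array<unsigned char, 16> T1{}, T2{};
  for (int i = 0; i < 4; ++i) { T1[i] = V1[8 * i]; T2[i] = V2[8 * i]; }

  // The unpckldq-style shuffle mask used above.
  const int Mask[16] = {0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23};
  std::array<unsigned char, 16> Out{};
  for (int i = 0; i < 16; ++i)
    Out[i] = Mask[i] < 16 ? T1[Mask[i]] : T2[Mask[i] - 16];

  for (int i = 0; i < 8; ++i)
    std::printf("%d ", Out[i]); // prints: 0 8 16 24 32 40 48 56
  std::printf("\n");
  return 0;
}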
18187
18188// a = shuffle v1, v2, mask1 ; interleaving lower lanes of v1 and v2
18189// b = shuffle v1, v2, mask2 ; interleaving higher lanes of v1 and v2
18190// =>
18191// ul = unpckl v1, v2
18192// uh = unpckh v1, v2
18193// a = vperm ul, uh
18194// b = vperm ul, uh
18195//
18196// Pattern-match interleave(256b v1, 256b v2) -> 512b v3 and lower it into unpck
18197// and permute. We cannot directly match v3 because it is split into two
18198// 256-bit vectors in earlier isel stages. Therefore, this function matches a
18199// pair of 256-bit shuffles and makes sure the masks are consecutive.
18200//
18201// Once unpck and permute nodes are created, the permute corresponding to this
18202// shuffle is returned, while the other permute replaces the other half of the
18203// shuffle in the selection dag.
18204static SDValue lowerShufflePairAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
18205 SDValue V1, SDValue V2,
18206 ArrayRef<int> Mask,
18207 SelectionDAG &DAG) {
18208 if (VT != MVT::v8f32 && VT != MVT::v8i32 && VT != MVT::v16i16 &&
18209 VT != MVT::v32i8)
18210 return SDValue();
18211 // <B0, B1, B0+1, B1+1, ..., >
18212 auto IsInterleavingPattern = [&](ArrayRef<int> Mask, unsigned Begin0,
18213 unsigned Begin1) {
18214 size_t Size = Mask.size();
18215     assert(Size % 2 == 0 && "Expected even mask size");
18216 for (unsigned I = 0; I < Size; I += 2) {
18217 if (Mask[I] != (int)(Begin0 + I / 2) ||
18218 Mask[I + 1] != (int)(Begin1 + I / 2))
18219 return false;
18220 }
18221 return true;
18222 };
18223   // Check which half of the interleave this shuffle node is.
18224 int NumElts = VT.getVectorNumElements();
18225 size_t FirstQtr = NumElts / 2;
18226 size_t ThirdQtr = NumElts + NumElts / 2;
18227 bool IsFirstHalf = IsInterleavingPattern(Mask, 0, NumElts);
18228 bool IsSecondHalf = IsInterleavingPattern(Mask, FirstQtr, ThirdQtr);
18229 if (!IsFirstHalf && !IsSecondHalf)
18230 return SDValue();
18231
18232 // Find the intersection between shuffle users of V1 and V2.
18233 SmallVector<SDNode *, 2> Shuffles;
18234 for (SDNode *User : V1->uses())
18235 if (User->getOpcode() == ISD::VECTOR_SHUFFLE && User->getOperand(0) == V1 &&
18236 User->getOperand(1) == V2)
18237 Shuffles.push_back(User);
18238 // Limit user size to two for now.
18239 if (Shuffles.size() != 2)
18240 return SDValue();
18241   // Find out which half of the 512-bit shuffle each smaller shuffle covers.
18242 auto *SVN1 = cast<ShuffleVectorSDNode>(Shuffles[0]);
18243 auto *SVN2 = cast<ShuffleVectorSDNode>(Shuffles[1]);
18244 SDNode *FirstHalf;
18245 SDNode *SecondHalf;
18246 if (IsInterleavingPattern(SVN1->getMask(), 0, NumElts) &&
18247 IsInterleavingPattern(SVN2->getMask(), FirstQtr, ThirdQtr)) {
18248 FirstHalf = Shuffles[0];
18249 SecondHalf = Shuffles[1];
18250 } else if (IsInterleavingPattern(SVN1->getMask(), FirstQtr, ThirdQtr) &&
18251 IsInterleavingPattern(SVN2->getMask(), 0, NumElts)) {
18252 FirstHalf = Shuffles[1];
18253 SecondHalf = Shuffles[0];
18254 } else {
18255 return SDValue();
18256 }
18257 // Lower into unpck and perm. Return the perm of this shuffle and replace
18258 // the other.
18259 SDValue Unpckl = DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
18260 SDValue Unpckh = DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
18261 SDValue Perm1 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
18262 DAG.getTargetConstant(0x20, DL, MVT::i8));
18263 SDValue Perm2 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
18264 DAG.getTargetConstant(0x31, DL, MVT::i8));
18265 if (IsFirstHalf) {
18266 DAG.ReplaceAllUsesWith(SecondHalf, &Perm2);
18267 return Perm1;
18268 }
18269 DAG.ReplaceAllUsesWith(FirstHalf, &Perm1);
18270 return Perm2;
18271}
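
The interleaving-pattern check is the gatekeeper for the combine above. The sketch below (plain C++, not LLVM code; v8i32-style masks chosen for illustration) shows the two mask shapes it accepts: the "first half" interleaves elements starting at (0, NumElts), the "second half" starting at (NumElts/2, NumElts + NumElts/2).

// Standalone illustration of the <B0, B1, B0+1, B1+1, ...> pattern check.
#include <cstdio>
#include <vector>

static bool isInterleavingPattern(const std::vector<int> &Mask,
                                  unsigned Begin0, unsigned Begin1) {
  for (unsigned I = 0; I + 1 < Mask.size(); I += 2)
    if (Mask[I] != (int)(Begin0 + I / 2) || Mask[I + 1] != (int)(Begin1 + I / 2))
      return false;
  return true;
}

int main() {
  const int NumElts = 8;
  std::vector<int> FirstHalf = {0, 8, 1, 9, 2, 10, 3, 11};
  std::vector<int> SecondHalf = {4, 12, 5, 13, 6, 14, 7, 15};
  std::printf("%d %d\n",
              isInterleavingPattern(FirstHalf, 0, NumElts),          // 1
              isInterleavingPattern(SecondHalf, NumElts / 2,
                                    NumElts + NumElts / 2));         // 1
  return 0;
}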
18272
18273/// Handle lowering of 4-lane 64-bit floating point shuffles.
18274///
18275/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
18276/// isn't available.
18277static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18278 const APInt &Zeroable, SDValue V1, SDValue V2,
18279 const X86Subtarget &Subtarget,
18280 SelectionDAG &DAG) {
18281   assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
18282   assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
18283   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
18284
18285 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
18286 Subtarget, DAG))
18287 return V;
18288
18289 if (V2.isUndef()) {
18290 // Check for being able to broadcast a single element.
18291 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2,
18292 Mask, Subtarget, DAG))
18293 return Broadcast;
18294
18295 // Use low duplicate instructions for masks that match their pattern.
18296 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
18297 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
18298
18299 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
18300 // Non-half-crossing single input shuffles can be lowered with an
18301 // interleaved permutation.
18302 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
18303 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
18304 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
18305 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
18306 }
18307
18308 // With AVX2 we have direct support for this permutation.
18309 if (Subtarget.hasAVX2())
18310 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
18311 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
18312
18313 // Try to create an in-lane repeating shuffle mask and then shuffle the
18314 // results into the target lanes.
18315 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18316 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
18317 return V;
18318
18319 // Try to permute the lanes and then use a per-lane permute.
18320 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
18321 Mask, DAG, Subtarget))
18322 return V;
18323
18324 // Otherwise, fall back.
18325 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask,
18326 DAG, Subtarget);
18327 }
18328
18329 // Use dedicated unpack instructions for masks that match their pattern.
18330 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
18331 return V;
18332
18333 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
18334 Zeroable, Subtarget, DAG))
18335 return Blend;
18336
18337 // Check if the blend happens to exactly fit that of SHUFPD.
18338 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask,
18339 Zeroable, Subtarget, DAG))
18340 return Op;
18341
18342 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
18343 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
18344
18345 // If we have lane crossing shuffles AND they don't all come from the lower
18346 // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
18347 // TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently
18348   // canonicalizes to a blend of splat, which isn't necessary for this combine.
18349 if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
18350 !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
18351 (V1.getOpcode() != ISD::BUILD_VECTOR) &&
18352 (V2.getOpcode() != ISD::BUILD_VECTOR))
18353 return lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2, Mask, DAG);
18354
18355 // If we have one input in place, then we can permute the other input and
18356 // blend the result.
18357 if (V1IsInPlace || V2IsInPlace)
18358 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
18359 Subtarget, DAG);
18360
18361 // Try to create an in-lane repeating shuffle mask and then shuffle the
18362 // results into the target lanes.
18363 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18364 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
18365 return V;
18366
18367 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18368   // shuffle. However, if we have AVX2 and either input is already in place,
18369   // we will be able to shuffle the other input even across lanes in a single
18370   // instruction, so skip this pattern.
18371 if (!(Subtarget.hasAVX2() && (V1IsInPlace || V2IsInPlace)))
18372 if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask(
18373 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
18374 return V;
18375
18376 // If we have VLX support, we can use VEXPAND.
18377 if (Subtarget.hasVLX())
18378 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask, V1, V2,
18379 DAG, Subtarget))
18380 return V;
18381
18382   // If we have AVX2 then we always want to lower with a blend because at v4 we
18383 // can fully permute the elements.
18384 if (Subtarget.hasAVX2())
18385 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
18386 Subtarget, DAG);
18387
18388 // Otherwise fall back on generic lowering.
18389 return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask,
18390 Subtarget, DAG);
18391}
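
For the non-lane-crossing single-input path in lowerV4F64Shuffle, the VPERMILPD immediate is built one bit per element, each bit choosing the high element of its 128-bit lane. A minimal standalone sketch (plain C++, not LLVM code; the mask {1, 0, 3, 2} is just an example) follows.

// Standalone illustration of the VPERMILPD immediate construction above.
#include <cstdio>

int main() {
  int Mask[4] = {1, 0, 3, 2}; // swap the elements within each 128-bit lane
  unsigned Imm = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
                 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
  std::printf("0x%x\n", Imm); // prints 0x5
  return 0;
}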
18392
18393/// Handle lowering of 4-lane 64-bit integer shuffles.
18394///
18395/// This routine is only called when we have AVX2 and thus a reasonable
18396/// instruction set for v4i64 shuffling.
18397static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18398 const APInt &Zeroable, SDValue V1, SDValue V2,
18399 const X86Subtarget &Subtarget,
18400 SelectionDAG &DAG) {
18401   assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
18402   assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
18403   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
18404   assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
18405
18406 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
18407 Subtarget, DAG))
18408 return V;
18409
18410 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
18411 Zeroable, Subtarget, DAG))
18412 return Blend;
18413
18414 // Check for being able to broadcast a single element.
18415 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,
18416 Subtarget, DAG))
18417 return Broadcast;
18418
18419 // Try to use shift instructions if fast.
18420 if (Subtarget.preferLowerShuffleAsShift())
18421 if (SDValue Shift =
18422 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
18423 Subtarget, DAG, /*BitwiseOnly*/ true))
18424 return Shift;
18425
18426 if (V2.isUndef()) {
18427 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
18428 // can use lower latency instructions that will operate on both lanes.
18429 SmallVector<int, 2> RepeatedMask;
18430 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
18431 SmallVector<int, 4> PSHUFDMask;
18432 narrowShuffleMaskElts(2, RepeatedMask, PSHUFDMask);
18433 return DAG.getBitcast(
18434 MVT::v4i64,
18435 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
18436 DAG.getBitcast(MVT::v8i32, V1),
18437 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
18438 }
18439
18440 // AVX2 provides a direct instruction for permuting a single input across
18441 // lanes.
18442 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
18443 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
18444 }
18445
18446 // Try to use shift instructions.
18447 if (SDValue Shift =
18448 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable, Subtarget,
18449 DAG, /*BitwiseOnly*/ false))
18450 return Shift;
18451
18452 // If we have VLX support, we can use VALIGN or VEXPAND.
18453 if (Subtarget.hasVLX()) {
18454 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i64, V1, V2, Mask,
18455 Subtarget, DAG))
18456 return Rotate;
18457
18458 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask, V1, V2,
18459 DAG, Subtarget))
18460 return V;
18461 }
18462
18463 // Try to use PALIGNR.
18464 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask,
18465 Subtarget, DAG))
18466 return Rotate;
18467
18468 // Use dedicated unpack instructions for masks that match their pattern.
18469 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
18470 return V;
18471
18472 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
18473 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
18474
18475 // If we have one input in place, then we can permute the other input and
18476 // blend the result.
18477 if (V1IsInPlace || V2IsInPlace)
18478 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
18479 Subtarget, DAG);
18480
18481 // Try to create an in-lane repeating shuffle mask and then shuffle the
18482 // results into the target lanes.
18483 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18484 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
18485 return V;
18486
18487 // Try to lower to PERMQ(BLENDD(V1,V2)).
18488 if (SDValue V =
18489 lowerShuffleAsBlendAndPermute(DL, MVT::v4i64, V1, V2, Mask, DAG))
18490 return V;
18491
18492 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18493   // shuffle. However, if we have AVX2 and either input is already in place,
18494   // we will be able to shuffle the other input even across lanes in a single
18495   // instruction, so skip this pattern.
18496 if (!V1IsInPlace && !V2IsInPlace)
18497 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
18498 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
18499 return Result;
18500
18501 // Otherwise fall back on generic blend lowering.
18502 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
18503 Subtarget, DAG);
18504}
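
The PSHUFD path above relies on splitting each 64-bit mask element into two consecutive 32-bit ones and then encoding the result as the usual two-bits-per-element x86 shuffle immediate. The following standalone sketch (plain C++, not LLVM code; the per-lane mask {1, 0} and the splitting step are illustrative assumptions about what narrowShuffleMaskElts and getV4X86ShuffleImm8ForMask produce) works through one case.

// Standalone illustration: widen a per-lane v2i64 repeated mask to a v4i32
// mask and encode it as a two-bits-per-element immediate.
#include <cstdio>

int main() {
  // A v4i64 shuffle that swaps the two elements inside each 128-bit lane has
  // the per-lane repeated mask {1, 0}.
  const int RepeatedMask[2] = {1, 0};

  // Each 64-bit element covers two 32-bit elements, so split it into two.
  int PSHUFDMask[4];
  for (int i = 0; i < 2; ++i) {
    PSHUFDMask[2 * i] = 2 * RepeatedMask[i];
    PSHUFDMask[2 * i + 1] = 2 * RepeatedMask[i] + 1;
  }

  // Standard x86 shuffle immediate: two bits per destination element.
  unsigned Imm = 0;
  for (int i = 0; i < 4; ++i)
    Imm |= (unsigned)PSHUFDMask[i] << (2 * i);
  std::printf("mask = %d %d %d %d, imm = 0x%x\n", PSHUFDMask[0], PSHUFDMask[1],
              PSHUFDMask[2], PSHUFDMask[3], Imm); // mask = 2 3 0 1, imm = 0x4e
  return 0;
}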
18505
18506/// Handle lowering of 8-lane 32-bit floating point shuffles.
18507///
18508/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
18509/// isn't available.
18510static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18511 const APInt &Zeroable, SDValue V1, SDValue V2,
18512 const X86Subtarget &Subtarget,
18513 SelectionDAG &DAG) {
18514   assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
18515   assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
18516   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
18517
18518 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
18519 Zeroable, Subtarget, DAG))
18520 return Blend;
18521
18522 // Check for being able to broadcast a single element.
18523 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,
18524 Subtarget, DAG))
18525 return Broadcast;
18526
18527 if (!Subtarget.hasAVX2()) {
18528 SmallVector<int> InLaneMask;
18529 computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
18530
18531 if (!is128BitLaneRepeatedShuffleMask(MVT::v8f32, InLaneMask))
18532 if (SDValue R = splitAndLowerShuffle(DL, MVT::v8f32, V1, V2, Mask, DAG,
18533 /*SimpleOnly*/ true))
18534 return R;
18535 }
18536 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
18537 Zeroable, Subtarget, DAG))
18538 return DAG.getBitcast(MVT::v8f32, ZExt);
18539
18540 // If the shuffle mask is repeated in each 128-bit lane, we have many more
18541 // options to efficiently lower the shuffle.
18542 SmallVector<int, 4> RepeatedMask;
18543 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
18544     assert(RepeatedMask.size() == 4 &&
18545            "Repeated masks must be half the mask width!");
18546
18547 // Use even/odd duplicate instructions for masks that match their pattern.
18548 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
18549 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
18550 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
18551 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
18552
18553 if (V2.isUndef())
18554 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
18555 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
18556
18557 // Use dedicated unpack instructions for masks that match their pattern.
18558 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
18559 return V;
18560
18561 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
18562 // have already handled any direct blends.
18563 return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
18564 }
18565
18566 // Try to create an in-lane repeating shuffle mask and then shuffle the
18567 // results into the target lanes.
18568 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18569 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
18570 return V;
18571
18572   // If we have a single-input shuffle with different shuffle patterns in the
18573   // two 128-bit lanes, use a variable-mask VPERMILPS.
18574 if (V2.isUndef()) {
18575 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) {
18576 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
18577 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
18578 }
18579 if (Subtarget.hasAVX2()) {
18580 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
18581 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
18582 }
18583 // Otherwise, fall back.
18584 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask,
18585 DAG, Subtarget);
18586 }
18587
18588 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18589 // shuffle.
18590 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
18591 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
18592 return Result;
18593
18594 // If we have VLX support, we can use VEXPAND.
18595 if (Subtarget.hasVLX())
18596 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask, V1, V2,
18597 DAG, Subtarget))
18598 return V;
18599
18600 // Try to match an interleave of two v8f32s and lower them as unpck and
18601 // permutes using ymms. This needs to go before we try to split the vectors.
18602 //
18603 // TODO: Expand this to AVX1. Currently v8i32 is casted to v8f32 and hits
18604 // this path inadvertently.
18605 if (Subtarget.hasAVX2() && !Subtarget.hasAVX512())
18606 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8f32, V1, V2,
18607 Mask, DAG))
18608 return V;
18609
18610   // For non-AVX512, if the mask consists of 16-bit elements within each lane,
18611   // try to split, since after splitting we get more efficient code using
18612   // vpunpcklwd and vpunpckhwd instructions than with vblend.
18613 if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32, DAG))
18614 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Subtarget,
18615 DAG);
18616
18617 // If we have AVX2 then we always want to lower with a blend because at v8 we
18618 // can fully permute the elements.
18619 if (Subtarget.hasAVX2())
18620 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8f32, V1, V2, Mask,
18621 Subtarget, DAG);
18622
18623 // Otherwise fall back on generic lowering.
18624 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask,
18625 Subtarget, DAG);
18626}
18627
18628/// Handle lowering of 8-lane 32-bit integer shuffles.
18629///
18630/// This routine is only called when we have AVX2 and thus a reasonable
18631/// instruction set for v8i32 shuffling.
18632static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18633 const APInt &Zeroable, SDValue V1, SDValue V2,
18634 const X86Subtarget &Subtarget,
18635 SelectionDAG &DAG) {
18636   assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
18637   assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
18638   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
18639   assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
18640
18641 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
18642
18643 // Whenever we can lower this as a zext, that instruction is strictly faster
18644 // than any alternative. It also allows us to fold memory operands into the
18645 // shuffle in many cases.
18646 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
18647 Zeroable, Subtarget, DAG))
18648 return ZExt;
18649
18650 // Try to match an interleave of two v8i32s and lower them as unpck and
18651 // permutes using ymms. This needs to go before we try to split the vectors.
18652 if (!Subtarget.hasAVX512())
18653 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8i32, V1, V2,
18654 Mask, DAG))
18655 return V;
18656
18657   // For non-AVX512, if the mask consists of 16-bit elements within each lane,
18658   // try to split, since after splitting we get more efficient code than with
18659   // vblend by using vpunpcklwd and vpunpckhwd instructions.
18660 if (isUnpackWdShuffleMask(Mask, MVT::v8i32, DAG) && !V2.isUndef() &&
18661 !Subtarget.hasAVX512())
18662 return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Subtarget,
18663 DAG);
18664
18665 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
18666 Zeroable, Subtarget, DAG))
18667 return Blend;
18668
18669 // Check for being able to broadcast a single element.
18670 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask,
18671 Subtarget, DAG))
18672 return Broadcast;
18673
18674 // Try to use shift instructions if fast.
18675 if (Subtarget.preferLowerShuffleAsShift()) {
18676 if (SDValue Shift =
18677 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable,
18678 Subtarget, DAG, /*BitwiseOnly*/ true))
18679 return Shift;
18680 if (NumV2Elements == 0)
18681 if (SDValue Rotate =
18682 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
18683 return Rotate;
18684 }
18685
18686 // If the shuffle mask is repeated in each 128-bit lane we can use more
18687 // efficient instructions that mirror the shuffles across the two 128-bit
18688 // lanes.
18689 SmallVector<int, 4> RepeatedMask;
18690 bool Is128BitLaneRepeatedShuffle =
18691 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
18692 if (Is128BitLaneRepeatedShuffle) {
18693     assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
18694 if (V2.isUndef())
18695 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
18696 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
18697
18698 // Use dedicated unpack instructions for masks that match their pattern.
18699 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
18700 return V;
18701 }
18702
18703 // Try to use shift instructions.
18704 if (SDValue Shift =
18705 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget,
18706 DAG, /*BitwiseOnly*/ false))
18707 return Shift;
18708
18709 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements == 0)
18710 if (SDValue Rotate =
18711 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
18712 return Rotate;
18713
18714 // If we have VLX support, we can use VALIGN or EXPAND.
18715 if (Subtarget.hasVLX()) {
18716 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i32, V1, V2, Mask,
18717 Subtarget, DAG))
18718 return Rotate;
18719
18720 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask, V1, V2,
18721 DAG, Subtarget))
18722 return V;
18723 }
18724
18725 // Try to use byte rotation instructions.
18726 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask,
18727 Subtarget, DAG))
18728 return Rotate;
18729
18730 // Try to create an in-lane repeating shuffle mask and then shuffle the
18731 // results into the target lanes.
18732 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18733 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
18734 return V;
18735
18736 if (V2.isUndef()) {
18737 // Try to produce a fixed cross-128-bit lane permute followed by unpack
18738 // because that should be faster than the variable permute alternatives.
18739 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v8i32, Mask, V1, V2, DAG))
18740 return V;
18741
18742 // If the shuffle patterns aren't repeated but it's a single input, directly
18743 // generate a cross-lane VPERMD instruction.
18744 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
18745 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
18746 }
18747
18748 // Assume that a single SHUFPS is faster than an alternative sequence of
18749 // multiple instructions (even if the CPU has a domain penalty).
18750 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
18751 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
18752 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
18753 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
18754 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
18755 CastV1, CastV2, DAG);
18756 return DAG.getBitcast(MVT::v8i32, ShufPS);
18757 }
18758
18759 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18760 // shuffle.
18761 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
18762 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
18763 return Result;
18764
18765 // Otherwise fall back on generic blend lowering.
18766 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i32, V1, V2, Mask,
18767 Subtarget, DAG);
18768}
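
Several of the routines above key off whether the mask repeats in each 128-bit lane. The following is a simplified standalone sketch of that idea (plain C++, not LLVM code; the helper name isLaneRepeated is hypothetical, and the real is128BitLaneRepeatedShuffleMask may impose additional constraints such as keeping elements within their own lane).

// Standalone illustration: a mask is "lane repeated" if every element refers
// to the same within-lane slot as the corresponding element of the first lane.
#include <cstdio>
#include <vector>

static bool isLaneRepeated(const std::vector<int> &Mask, int NumLaneElts,
                           std::vector<int> &Repeated) {
  Repeated.assign(NumLaneElts, -1);
  int NumElts = (int)Mask.size();
  for (int i = 0; i < NumElts; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue; // undef matches anything
    // Reduce to a first-lane index, remembering which input it came from.
    int Local = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
    int &Slot = Repeated[i % NumLaneElts];
    if (Slot >= 0 && Slot != Local)
      return false;
    Slot = Local;
  }
  return true;
}

int main() {
  std::vector<int> Repeated;
  std::vector<int> A = {2, 3, 0, 1, 6, 7, 4, 5}; // same swap in both lanes
  std::vector<int> B = {2, 3, 0, 1, 4, 5, 6, 7}; // lanes disagree
  std::printf("%d %d\n", isLaneRepeated(A, 4, Repeated),
              isLaneRepeated(B, 4, Repeated)); // prints: 1 0
  return 0;
}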
18769
18770/// Handle lowering of 16-lane 16-bit integer shuffles.
18771///
18772/// This routine is only called when we have AVX2 and thus a reasonable
18773/// instruction set for v16i16 shuffling.
18774static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18775 const APInt &Zeroable, SDValue V1, SDValue V2,
18776 const X86Subtarget &Subtarget,
18777 SelectionDAG &DAG) {
18778   assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
18779   assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
18780   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
18781   assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
18782
18783 // Whenever we can lower this as a zext, that instruction is strictly faster
18784 // than any alternative. It also allows us to fold memory operands into the
18785 // shuffle in many cases.
18786 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
18787 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
18788 return ZExt;
18789
18790 // Check for being able to broadcast a single element.
18791 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask,
18792 Subtarget, DAG))
18793 return Broadcast;
18794
18795 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
18796 Zeroable, Subtarget, DAG))
18797 return Blend;
18798
18799 // Use dedicated unpack instructions for masks that match their pattern.
18800 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
18801 return V;
18802
18803 // Use dedicated pack instructions for masks that match their pattern.
18804 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG,
18805 Subtarget))
18806 return V;
18807
18808   // Try to lower using a truncation.
18809 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
18810 Subtarget, DAG))
18811 return V;
18812
18813 // Try to use shift instructions.
18814 if (SDValue Shift =
18815 lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
18816 Subtarget, DAG, /*BitwiseOnly*/ false))
18817 return Shift;
18818
18819 // Try to use byte rotation instructions.
18820 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask,
18821 Subtarget, DAG))
18822 return Rotate;
18823
18824 // Try to create an in-lane repeating shuffle mask and then shuffle the
18825 // results into the target lanes.
18826 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18827 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
18828 return V;
18829
18830 if (V2.isUndef()) {
18831 // Try to use bit rotation instructions.
18832 if (SDValue Rotate =
18833 lowerShuffleAsBitRotate(DL, MVT::v16i16, V1, Mask, Subtarget, DAG))
18834 return Rotate;
18835
18836 // Try to produce a fixed cross-128-bit lane permute followed by unpack
18837 // because that should be faster than the variable permute alternatives.
18838 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, Mask, V1, V2, DAG))
18839 return V;
18840
18841 // There are no generalized cross-lane shuffle operations available on i16
18842 // element types.
18843 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
18844 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
18845 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
18846 return V;
18847
18848 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v16i16, V1, V2, Mask,
18849 DAG, Subtarget);
18850 }
18851
18852 SmallVector<int, 8> RepeatedMask;
18853 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
18854 // As this is a single-input shuffle, the repeated mask should be
18855 // a strictly valid v8i16 mask that we can pass through to the v8i16
18856 // lowering to handle even the v16 case.
18857 return lowerV8I16GeneralSingleInputShuffle(
18858 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
18859 }
18860 }
18861
18862 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2,
18863 Zeroable, Subtarget, DAG))
18864 return PSHUFB;
18865
18866 // AVX512BW can lower to VPERMW (non-VLX will pad to v32i16).
18867 if (Subtarget.hasBWI())
18868 return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, Subtarget, DAG);
18869
18870 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18871 // shuffle.
18872 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
18873 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
18874 return Result;
18875
18876 // Try to permute the lanes and then use a per-lane permute.
18877 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
18878 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
18879 return V;
18880
18881 // Try to match an interleave of two v16i16s and lower them as unpck and
18882 // permutes using ymms.
18883 if (!Subtarget.hasAVX512())
18884 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v16i16, V1, V2,
18885 Mask, DAG))
18886 return V;
18887
18888 // Otherwise fall back on generic lowering.
18889 return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask,
18890 Subtarget, DAG);
18891}
18892
18893/// Handle lowering of 32-lane 8-bit integer shuffles.
18894///
18895/// This routine is only called when we have AVX2 and thus a reasonable
18896/// instruction set for v32i8 shuffling.
18897static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18898 const APInt &Zeroable, SDValue V1, SDValue V2,
18899 const X86Subtarget &Subtarget,
18900 SelectionDAG &DAG) {
18901   assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
18902   assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
18903   assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
18904   assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
18905
18906 // Whenever we can lower this as a zext, that instruction is strictly faster
18907 // than any alternative. It also allows us to fold memory operands into the
18908 // shuffle in many cases.
18909 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask,
18910 Zeroable, Subtarget, DAG))
18911 return ZExt;
18912
18913 // Check for being able to broadcast a single element.
18914 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask,
18915 Subtarget, DAG))
18916 return Broadcast;
18917
18918 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
18919 Zeroable, Subtarget, DAG))
18920 return Blend;
18921
18922 // Use dedicated unpack instructions for masks that match their pattern.
18923 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
18924 return V;
18925
18926 // Use dedicated pack instructions for masks that match their pattern.
18927 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG,
18928 Subtarget))
18929 return V;
18930
18931   // Try to lower using a truncation.
18932 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
18933 Subtarget, DAG))
18934 return V;
18935
18936 // Try to use shift instructions.
18937 if (SDValue Shift =
18938 lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget,
18939 DAG, /*BitwiseOnly*/ false))
18940 return Shift;
18941
18942 // Try to use byte rotation instructions.
18943 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask,
18944 Subtarget, DAG))
18945 return Rotate;
18946
18947 // Try to use bit rotation instructions.
18948 if (V2.isUndef())
18949 if (SDValue Rotate =
18950 lowerShuffleAsBitRotate(DL, MVT::v32i8, V1, Mask, Subtarget, DAG))
18951 return Rotate;
18952
18953 // Try to create an in-lane repeating shuffle mask and then shuffle the
18954 // results into the target lanes.
18955 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18956 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
18957 return V;
18958
18959 // There are no generalized cross-lane shuffle operations available on i8
18960 // element types.
18961 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
18962 // Try to produce a fixed cross-128-bit lane permute followed by unpack
18963 // because that should be faster than the variable permute alternatives.
18964 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, Mask, V1, V2, DAG))
18965 return V;
18966
18967 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
18968 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
18969 return V;
18970
18971 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask,
18972 DAG, Subtarget);
18973 }
18974
18975 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2,
18976 Zeroable, Subtarget, DAG))
18977 return PSHUFB;
18978
18979 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
18980 if (Subtarget.hasVBMI())
18981 return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, Subtarget, DAG);
18982
18983 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18984 // shuffle.
18985 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
18986 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
18987 return Result;
18988
18989 // Try to permute the lanes and then use a per-lane permute.
18990 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
18991 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
18992 return V;
18993
18994   // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
18995   // by zeroable elements in the remaining 24 elements. Turn this into two
18996   // vmovqb instructions shuffled together.
18997 if (Subtarget.hasVLX())
18998 if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2,
18999 Mask, Zeroable, DAG))
19000 return V;
19001
19002 // Try to match an interleave of two v32i8s and lower them as unpck and
19003 // permutes using ymms.
19004 if (!Subtarget.hasAVX512())
19005 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v32i8, V1, V2,
19006 Mask, DAG))
19007 return V;
19008
19009 // Otherwise fall back on generic lowering.
19010 return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask,
19011 Subtarget, DAG);
19012}
19013
19014/// High-level routine to lower various 256-bit x86 vector shuffles.
19015///
19016/// This routine either breaks down the specific type of a 256-bit x86 vector
19017/// shuffle or splits it into two 128-bit shuffles and fuses the results back
19018/// together based on the available instructions.
19019static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
19020 SDValue V1, SDValue V2, const APInt &Zeroable,
19021 const X86Subtarget &Subtarget,
19022 SelectionDAG &DAG) {
19023 // If we have a single input to the zero element, insert that into V1 if we
19024 // can do so cheaply.
19025 int NumElts = VT.getVectorNumElements();
19026 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
19027
19028 if (NumV2Elements == 1 && Mask[0] >= NumElts)
19029 if (SDValue Insertion = lowerShuffleAsElementInsertion(
19030 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
19031 return Insertion;
19032
19033 // Handle special cases where the lower or upper half is UNDEF.
19034 if (SDValue V =
19035 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
19036 return V;
19037
19038 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
19039 // can check for those subtargets here and avoid much of the subtarget
19040 // querying in the per-vector-type lowering routines. With AVX1 we have
19041 // essentially *zero* ability to manipulate a 256-bit vector with integer
19042 // types. Since we'll use floating point types there eventually, just
19043 // immediately cast everything to a float and operate entirely in that domain.
19044 if (VT.isInteger() && !Subtarget.hasAVX2()) {
19045 int ElementBits = VT.getScalarSizeInBits();
19046 if (ElementBits < 32) {
19047 // No floating point type available, if we can't use the bit operations
19048 // for masking/blending then decompose into 128-bit vectors.
19049 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
19050 Subtarget, DAG))
19051 return V;
19052 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
19053 return V;
19054 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
19055 }
19056
19057 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
19058 VT.getVectorNumElements());
19059 V1 = DAG.getBitcast(FpVT, V1);
19060 V2 = DAG.getBitcast(FpVT, V2);
19061 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
19062 }
19063
19064 if (VT == MVT::v16f16) {
19065 V1 = DAG.getBitcast(MVT::v16i16, V1);
19066 V2 = DAG.getBitcast(MVT::v16i16, V2);
19067 return DAG.getBitcast(MVT::v16f16,
19068 DAG.getVectorShuffle(MVT::v16i16, DL, V1, V2, Mask));
19069 }
19070
19071 switch (VT.SimpleTy) {
19072 case MVT::v4f64:
19073 return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19074 case MVT::v4i64:
19075 return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19076 case MVT::v8f32:
19077 return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19078 case MVT::v8i32:
19079 return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19080 case MVT::v16i16:
19081 return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19082 case MVT::v32i8:
19083 return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19084
19085 default:
19086     llvm_unreachable("Not a valid 256-bit x86 vector type!");
19087 }
19088}
19089
19090/// Try to lower a vector shuffle as a 128-bit shuffles.
19091static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
19092 const APInt &Zeroable, SDValue V1, SDValue V2,
19093 const X86Subtarget &Subtarget,
19094 SelectionDAG &DAG) {
19095   assert(VT.getScalarSizeInBits() == 64 &&
19096          "Unexpected element type size for 128bit shuffle.");
19097
19098   // Handling a 256-bit vector would require VLX, and in that case the
19099   // function lowerV2X128VectorShuffle() is most probably the better solution.
19100  assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
19101
19102 // TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
19103 SmallVector<int, 4> Widened128Mask;
19104 if (!canWidenShuffleElements(Mask, Widened128Mask))
19105 return SDValue();
19106  assert(Widened128Mask.size() == 4 && "Shuffle widening mismatch");
19107
19108 // Try to use an insert into a zero vector.
19109 if (Widened128Mask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
19110 (Widened128Mask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
19111 unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
19112 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
19113 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
19114 DAG.getIntPtrConstant(0, DL));
19115 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
19116 getZeroVector(VT, Subtarget, DAG, DL), LoV,
19117 DAG.getIntPtrConstant(0, DL));
19118 }
19119
19120 // Check for patterns which can be matched with a single insert of a 256-bit
19121 // subvector.
19122 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}, V1, V2);
19123 if (OnlyUsesV1 ||
19124 isShuffleEquivalent(Mask, {0, 1, 2, 3, 8, 9, 10, 11}, V1, V2)) {
19125 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
19126 SDValue SubVec =
19127 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
19128 DAG.getIntPtrConstant(0, DL));
19129 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
19130 DAG.getIntPtrConstant(4, DL));
19131 }
19132
19133 // See if this is an insertion of the lower 128-bits of V2 into V1.
19134 bool IsInsert = true;
19135 int V2Index = -1;
19136 for (int i = 0; i < 4; ++i) {
19137    assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
19138 if (Widened128Mask[i] < 0)
19139 continue;
19140
19141 // Make sure all V1 subvectors are in place.
19142 if (Widened128Mask[i] < 4) {
19143 if (Widened128Mask[i] != i) {
19144 IsInsert = false;
19145 break;
19146 }
19147 } else {
19148      // Make sure we only have a single V2 index and it's the lowest 128 bits.
19149 if (V2Index >= 0 || Widened128Mask[i] != 4) {
19150 IsInsert = false;
19151 break;
19152 }
19153 V2Index = i;
19154 }
19155 }
19156 if (IsInsert && V2Index >= 0) {
19157 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
19158 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
19159 DAG.getIntPtrConstant(0, DL));
19160 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
19161 }
19162
19163  // See if we can widen to a 256-bit lane shuffle; we're going to lose 128-lane
19164  // UNDEF info by lowering to X86ISD::SHUF128 anyway, so by widening where
19165  // possible we at least ensure the lanes stay sequential to help later
19166  // combines.
19167 SmallVector<int, 2> Widened256Mask;
19168 if (canWidenShuffleElements(Widened128Mask, Widened256Mask)) {
19169 Widened128Mask.clear();
19170 narrowShuffleMaskElts(2, Widened256Mask, Widened128Mask);
19171 }
19172
19173 // Try to lower to vshuf64x2/vshuf32x4.
19174 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
19175 unsigned PermMask = 0;
19176  // Ensure elements came from the same Op.
19177 for (int i = 0; i < 4; ++i) {
19178    assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
19179 if (Widened128Mask[i] < 0)
19180 continue;
19181
19182 SDValue Op = Widened128Mask[i] >= 4 ? V2 : V1;
19183 unsigned OpIndex = i / 2;
19184 if (Ops[OpIndex].isUndef())
19185 Ops[OpIndex] = Op;
19186 else if (Ops[OpIndex] != Op)
19187 return SDValue();
19188
19189 // Convert the 128-bit shuffle mask selection values into 128-bit selection
19190 // bits defined by a vshuf64x2 instruction's immediate control byte.
19191 PermMask |= (Widened128Mask[i] % 4) << (i * 2);
19192 }
19193
19194 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
19195 DAG.getTargetConstant(PermMask, DL, MVT::i8));
19196}
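The loop above packs the widened 4 x 128-bit lane selection into the vshuf64x2 immediate, two bits per destination lane, while requiring each 256-bit half of the result to read from a single source operand. The following standalone sketch mirrors only that arithmetic; buildShuf128Imm is a hypothetical name used for illustration, not an LLVM helper.

#include <array>
#include <cassert>

// Returns the 8-bit immediate, or -1 if a 256-bit half would need lanes from
// both inputs (mirroring the Ops[OpIndex] consistency check above).
static int buildShuf128Imm(const std::array<int, 4> &Widened128Mask) {
  int Src[2] = {-1, -1}; // source operand (0 = V1, 1 = V2) per 256-bit half
  unsigned PermMask = 0;
  for (int i = 0; i < 4; ++i) {
    int M = Widened128Mask[i];
    if (M < 0)
      continue;                // undef lane: leave its immediate bits as 0
    int Op = M >= 4 ? 1 : 0;   // which input vector this lane reads
    int Half = i / 2;          // low/high 256-bit half of the result
    if (Src[Half] < 0)
      Src[Half] = Op;
    else if (Src[Half] != Op)
      return -1;               // mixed sources within one half: give up
    PermMask |= (M % 4) << (i * 2);
  }
  return (int)PermMask;
}

int main() {
  // Swap the two 128-bit lanes inside each input: {1,0} from V1, {5,4} from V2.
  assert(buildShuf128Imm({1, 0, 5, 4}) == 0x11);
  // Lane 2 would need V1 while lane 3 needs V2 -> not a single SHUF128.
  assert(buildShuf128Imm({0, 1, 2, 7}) == -1);
}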
19197
19198/// Handle lowering of 8-lane 64-bit floating point shuffles.
19199static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
19200 const APInt &Zeroable, SDValue V1, SDValue V2,
19201 const X86Subtarget &Subtarget,
19202 SelectionDAG &DAG) {
19203  assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
19204  assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
19205  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
19206
19207 if (V2.isUndef()) {
19208 // Use low duplicate instructions for masks that match their pattern.
19209 if (isShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1, V2))
19210 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
19211
19212 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
19213 // Non-half-crossing single input shuffles can be lowered with an
19214 // interleaved permutation.
19215 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
19216 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
19217 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
19218 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
19219 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
19220 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
19221 }
19222
19223 SmallVector<int, 4> RepeatedMask;
19224 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
19225 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
19226 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
19227 }
19228
19229 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1,
19230 V2, Subtarget, DAG))
19231 return Shuf128;
19232
19233 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
19234 return Unpck;
19235
19236 // Check if the blend happens to exactly fit that of SHUFPD.
19237 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask,
19238 Zeroable, Subtarget, DAG))
19239 return Op;
19240
19241 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1, V2,
19242 DAG, Subtarget))
19243 return V;
19244
19245 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
19246 Zeroable, Subtarget, DAG))
19247 return Blend;
19248
19249 return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, Subtarget, DAG);
19250}
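For the single-input, non-lane-crossing case above, each bit of the VPERMILPD immediate records whether destination element i reads the odd element of its 128-bit pair. A minimal standalone sketch of that encoding; buildVPermilPDImm is a hypothetical illustration, not LLVM code, and undef (-1) mask entries simply leave their bit clear.

#include <array>
#include <cassert>

static unsigned buildVPermilPDImm(const std::array<int, 8> &Mask) {
  unsigned Imm = 0;
  for (int i = 0; i < 8; ++i)
    if (Mask[i] == (i | 1)) // element i reads the odd half of its pair
      Imm |= 1u << i;
  return Imm;
}

int main() {
  // Swap the two doubles inside every 128-bit lane.
  assert(buildVPermilPDImm({1, 0, 3, 2, 5, 4, 7, 6}) == 0x55);
  // Identity: the odd destination elements already pick the odd slot.
  assert(buildVPermilPDImm({0, 1, 2, 3, 4, 5, 6, 7}) == 0xAA);
}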
19251
19252/// Handle lowering of 16-lane 32-bit floating point shuffles.
19253static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
19254 const APInt &Zeroable, SDValue V1, SDValue V2,
19255 const X86Subtarget &Subtarget,
19256 SelectionDAG &DAG) {
19257  assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
19258  assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
19259  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
19260
19261 // If the shuffle mask is repeated in each 128-bit lane, we have many more
19262 // options to efficiently lower the shuffle.
19263 SmallVector<int, 4> RepeatedMask;
19264 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
19265    assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
19266
19267 // Use even/odd duplicate instructions for masks that match their pattern.
19268 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
19269 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
19270 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
19271 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
19272
19273 if (V2.isUndef())
19274 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
19275 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
19276
19277 // Use dedicated unpack instructions for masks that match their pattern.
19278 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
19279 return V;
19280
19281 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
19282 Zeroable, Subtarget, DAG))
19283 return Blend;
19284
19285 // Otherwise, fall back to a SHUFPS sequence.
19286 return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
19287 }
19288
19289 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
19290 Zeroable, Subtarget, DAG))
19291 return Blend;
19292
19293 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
19294 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
19295 return DAG.getBitcast(MVT::v16f32, ZExt);
19296
19297 // Try to create an in-lane repeating shuffle mask and then shuffle the
19298 // results into the target lanes.
19299 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
19300 DL, MVT::v16f32, V1, V2, Mask, Subtarget, DAG))
19301 return V;
19302
19303 // If we have a single input shuffle with different shuffle patterns in the
19304 // 128-bit lanes and don't lane cross, use variable mask VPERMILPS.
19305 if (V2.isUndef() &&
19306 !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
19307 SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
19308 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
19309 }
19310
19311 // If we have AVX512F support, we can use VEXPAND.
19312 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
19313 V1, V2, DAG, Subtarget))
19314 return V;
19315
19316 return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, Subtarget, DAG);
19317}
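The RepeatedMask check above is what lets a 16-element mask be handled with a 4-element immediate such as MOVSLDUP's. Below is a simplified, single-input sketch of the idea; getRepeatedLaneMask is a hypothetical helper, and the real is128BitLaneRepeatedShuffleMask additionally tracks which operand each lane reads.

#include <cassert>
#include <vector>

static bool getRepeatedLaneMask(const std::vector<int> &Mask, int LaneElts,
                                std::vector<int> &Repeated) {
  Repeated.assign(LaneElts, -1);
  for (int i = 0, e = (int)Mask.size(); i != e; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;
    if (M / LaneElts != i / LaneElts)
      return false;                   // crosses a 128-bit lane
    int Local = M % LaneElts;         // index within the lane
    if (Repeated[i % LaneElts] < 0)
      Repeated[i % LaneElts] = Local;
    else if (Repeated[i % LaneElts] != Local)
      return false;                   // lanes disagree on the pattern
  }
  return true;
}

int main() {
  std::vector<int> Repeated;
  // v16f32 MOVSLDUP pattern: every 128-bit lane duplicates its even elements.
  std::vector<int> Mask = {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14};
  assert(getRepeatedLaneMask(Mask, 4, Repeated));
  assert((Repeated == std::vector<int>{0, 0, 2, 2}));
}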
19318
19319/// Handle lowering of 8-lane 64-bit integer shuffles.
19320static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
19321 const APInt &Zeroable, SDValue V1, SDValue V2,
19322 const X86Subtarget &Subtarget,
19323 SelectionDAG &DAG) {
19324  assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
19325  assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
19326  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
19327
19328 // Try to use shift instructions if fast.
19329 if (Subtarget.preferLowerShuffleAsShift())
19330 if (SDValue Shift =
19331 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable,
19332 Subtarget, DAG, /*BitwiseOnly*/ true))
19333 return Shift;
19334
19335 if (V2.isUndef()) {
19336 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
19337 // can use lower latency instructions that will operate on all four
19338 // 128-bit lanes.
19339 SmallVector<int, 2> Repeated128Mask;
19340 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
19341 SmallVector<int, 4> PSHUFDMask;
19342 narrowShuffleMaskElts(2, Repeated128Mask, PSHUFDMask);
19343 return DAG.getBitcast(
19344 MVT::v8i64,
19345 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
19346 DAG.getBitcast(MVT::v16i32, V1),
19347 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
19348 }
19349
19350 SmallVector<int, 4> Repeated256Mask;
19351 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
19352 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
19353 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
19354 }
19355
19356 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1,
19357 V2, Subtarget, DAG))
19358 return Shuf128;
19359
19360 // Try to use shift instructions.
19361 if (SDValue Shift =
19362 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable, Subtarget,
19363 DAG, /*BitwiseOnly*/ false))
19364 return Shift;
19365
19366 // Try to use VALIGN.
19367 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i64, V1, V2, Mask,
19368 Subtarget, DAG))
19369 return Rotate;
19370
19371 // Try to use PALIGNR.
19372 if (Subtarget.hasBWI())
19373 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask,
19374 Subtarget, DAG))
19375 return Rotate;
19376
19377 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
19378 return Unpck;
19379
19380 // If we have AVX512F support, we can use VEXPAND.
19381 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1, V2,
19382 DAG, Subtarget))
19383 return V;
19384
19385 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
19386 Zeroable, Subtarget, DAG))
19387 return Blend;
19388
19389 return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, Subtarget, DAG);
19390}
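In the single-input v8i64 path above, a mask that repeats per 128-bit lane is narrowed to i32 granularity and then encoded as a PSHUFD control byte. A standalone sketch of just that mask arithmetic, with hypothetical helpers narrowMaskElts and buildImm8 standing in for the ideas behind narrowShuffleMaskElts and getV4X86ShuffleImm8ForMask (defined indices only):

#include <cassert>
#include <vector>

static std::vector<int> narrowMaskElts(int Scale, const std::vector<int> &Mask) {
  std::vector<int> Out;
  for (int M : Mask)
    for (int j = 0; j < Scale; ++j)
      Out.push_back(M < 0 ? M : M * Scale + j); // each wide elt -> Scale narrow elts
  return Out;
}

static unsigned buildImm8(const std::vector<int> &Mask4) {
  unsigned Imm = 0;
  for (int i = 0; i < 4; ++i)
    Imm |= (unsigned)(Mask4[i] & 0x3) << (i * 2); // 2 bits per selected element
  return Imm;
}

int main() {
  // v8i64 mask {1,0, 3,2, 5,4, 7,6} repeats {1,0} in every 128-bit lane.
  std::vector<int> Repeated128Mask = {1, 0};
  std::vector<int> PSHUFDMask = narrowMaskElts(2, Repeated128Mask);
  assert((PSHUFDMask == std::vector<int>{2, 3, 0, 1}));
  // The PSHUFD control byte for {2,3,0,1} is the familiar 0x4E lane swap.
  assert(buildImm8(PSHUFDMask) == 0x4E);
}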
19391
19392/// Handle lowering of 16-lane 32-bit integer shuffles.
19393static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
19394 const APInt &Zeroable, SDValue V1, SDValue V2,
19395 const X86Subtarget &Subtarget,
19396 SelectionDAG &DAG) {
19397  assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
19398  assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
19399  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
19400
19401 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
19402
19403 // Whenever we can lower this as a zext, that instruction is strictly faster
19404 // than any alternative. It also allows us to fold memory operands into the
19405 // shuffle in many cases.
19406 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
19407 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
19408 return ZExt;
19409
19410 // Try to use shift instructions if fast.
19411 if (Subtarget.preferLowerShuffleAsShift()) {
19412 if (SDValue Shift =
19413 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
19414 Subtarget, DAG, /*BitwiseOnly*/ true))
19415 return Shift;
19416 if (NumV2Elements == 0)
19417 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask,
19418 Subtarget, DAG))
19419 return Rotate;
19420 }
19421
19422 // If the shuffle mask is repeated in each 128-bit lane we can use more
19423 // efficient instructions that mirror the shuffles across the four 128-bit
19424 // lanes.
19425 SmallVector<int, 4> RepeatedMask;
19426 bool Is128BitLaneRepeatedShuffle =
19427 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
19428 if (Is128BitLaneRepeatedShuffle) {
19429    assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
19430 if (V2.isUndef())
19431 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
19432 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
19433
19434 // Use dedicated unpack instructions for masks that match their pattern.
19435 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
19436 return V;
19437 }
19438
19439 // Try to use shift instructions.
19440 if (SDValue Shift =
19441 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
19442 Subtarget, DAG, /*BitwiseOnly*/ false))
19443 return Shift;
19444
19445 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements != 0)
19446 if (SDValue Rotate =
19447 lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask, Subtarget, DAG))
19448 return Rotate;
19449
19450 // Try to use VALIGN.
19451 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v16i32, V1, V2, Mask,
19452 Subtarget, DAG))
19453 return Rotate;
19454
19455 // Try to use byte rotation instructions.
19456 if (Subtarget.hasBWI())
19457 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask,
19458 Subtarget, DAG))
19459 return Rotate;
19460
19461 // Assume that a single SHUFPS is faster than using a permv shuffle.
19462 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
19463 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
19464 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
19465 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
19466 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
19467 CastV1, CastV2, DAG);
19468 return DAG.getBitcast(MVT::v16i32, ShufPS);
19469 }
19470
19471 // Try to create an in-lane repeating shuffle mask and then shuffle the
19472 // results into the target lanes.
19473 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
19474 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
19475 return V;
19476
19477 // If we have AVX512F support, we can use VEXPAND.
19478 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask, V1, V2,
19479 DAG, Subtarget))
19480 return V;
19481
19482 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
19483 Zeroable, Subtarget, DAG))
19484 return Blend;
19485
19486 return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, Subtarget, DAG);
19487}
19488
19489/// Handle lowering of 32-lane 16-bit integer shuffles.
19490static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
19491 const APInt &Zeroable, SDValue V1, SDValue V2,
19492 const X86Subtarget &Subtarget,
19493 SelectionDAG &DAG) {
19494  assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
19495  assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
19496  assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
19497  assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
19498
19499 // Whenever we can lower this as a zext, that instruction is strictly faster
19500 // than any alternative. It also allows us to fold memory operands into the
19501 // shuffle in many cases.
19502 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
19503 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
19504 return ZExt;
19505
19506 // Use dedicated unpack instructions for masks that match their pattern.
19507 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
19508 return V;
19509
19510 // Use dedicated pack instructions for masks that match their pattern.
19511 if (SDValue V =
19512 lowerShuffleWithPACK(DL, MVT::v32i16, Mask, V1, V2, DAG, Subtarget))
19513 return V;
19514
19515 // Try to use shift instructions.
19516 if (SDValue Shift =
19517 lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask, Zeroable,
19518 Subtarget, DAG, /*BitwiseOnly*/ false))
19519 return Shift;
19520
19521 // Try to use byte rotation instructions.
19522 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask,
19523 Subtarget, DAG))
19524 return Rotate;
19525
19526 if (V2.isUndef()) {
19527 // Try to use bit rotation instructions.
19528 if (SDValue Rotate =
19529 lowerShuffleAsBitRotate(DL, MVT::v32i16, V1, Mask, Subtarget, DAG))
19530 return Rotate;
19531
19532 SmallVector<int, 8> RepeatedMask;
19533 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
19534 // As this is a single-input shuffle, the repeated mask should be
19535 // a strictly valid v8i16 mask that we can pass through to the v8i16
19536 // lowering to handle even the v32 case.
19537 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v32i16, V1,
19538 RepeatedMask, Subtarget, DAG);
19539 }
19540 }
19541
19542 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
19543 Zeroable, Subtarget, DAG))
19544 return Blend;
19545
19546 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2,
19547 Zeroable, Subtarget, DAG))
19548 return PSHUFB;
19549
19550 return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, Subtarget, DAG);
19551}
19552
19553/// Handle lowering of 64-lane 8-bit integer shuffles.
19554static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
19555 const APInt &Zeroable, SDValue V1, SDValue V2,
19556 const X86Subtarget &Subtarget,
19557 SelectionDAG &DAG) {
19558  assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
19559  assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
19560  assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
19561  assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
19562
19563 // Whenever we can lower this as a zext, that instruction is strictly faster
19564 // than any alternative. It also allows us to fold memory operands into the
19565 // shuffle in many cases.
19566 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
19567 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
19568 return ZExt;
19569
19570 // Use dedicated unpack instructions for masks that match their pattern.
19571 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
19572 return V;
19573
19574 // Use dedicated pack instructions for masks that match their pattern.
19575 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v64i8, Mask, V1, V2, DAG,
19576 Subtarget))
19577 return V;
19578
19579 // Try to use shift instructions.
19580 if (SDValue Shift =
19581 lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget,
19582 DAG, /*BitwiseOnly*/ false))
19583 return Shift;
19584
19585 // Try to use byte rotation instructions.
19586 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask,
19587 Subtarget, DAG))
19588 return Rotate;
19589
19590 // Try to use bit rotation instructions.
19591 if (V2.isUndef())
19592 if (SDValue Rotate =
19593 lowerShuffleAsBitRotate(DL, MVT::v64i8, V1, Mask, Subtarget, DAG))
19594 return Rotate;
19595
19596 // Lower as AND if possible.
19597 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v64i8, V1, V2, Mask,
19598 Zeroable, Subtarget, DAG))
19599 return Masked;
19600
19601 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2,
19602 Zeroable, Subtarget, DAG))
19603 return PSHUFB;
19604
19605 // Try to create an in-lane repeating shuffle mask and then shuffle the
19606 // results into the target lanes.
19607 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
19608 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
19609 return V;
19610
19611 if (SDValue Result = lowerShuffleAsLanePermuteAndPermute(
19612 DL, MVT::v64i8, V1, V2, Mask, DAG, Subtarget))
19613 return Result;
19614
19615 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
19616 Zeroable, Subtarget, DAG))
19617 return Blend;
19618
19619 if (!is128BitLaneCrossingShuffleMask(MVT::v64i8, Mask)) {
19620 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
19621 // PALIGNR will be cheaper than the second PSHUFB+OR.
19622 if (SDValue V = lowerShuffleAsByteRotateAndPermute(DL, MVT::v64i8, V1, V2,
19623 Mask, Subtarget, DAG))
19624 return V;
19625
19626 // If we can't directly blend but can use PSHUFB, that will be better as it
19627 // can both shuffle and set up the inefficient blend.
19628 bool V1InUse, V2InUse;
19629 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v64i8, V1, V2, Mask, Zeroable,
19630 DAG, V1InUse, V2InUse);
19631 }
19632
19633 // Try to simplify this by merging 128-bit lanes to enable a lane-based
19634 // shuffle.
19635 if (!V2.isUndef())
19636 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
19637 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
19638 return Result;
19639
19640 // VBMI can use VPERMV/VPERMV3 byte shuffles.
19641 if (Subtarget.hasVBMI())
19642 return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);
19643
19644 return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
19645}
19646
19647/// High-level routine to lower various 512-bit x86 vector shuffles.
19648///
19649/// This routine either breaks down the specific type of a 512-bit x86 vector
19650/// shuffle or splits it into two 256-bit shuffles and fuses the results back
19651/// together based on the available instructions.
19652static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
19653 MVT VT, SDValue V1, SDValue V2,
19654 const APInt &Zeroable,
19655 const X86Subtarget &Subtarget,
19656 SelectionDAG &DAG) {
19657  assert(Subtarget.hasAVX512() &&
19658         "Cannot lower 512-bit vectors w/ basic ISA!");
19659
19660 // If we have a single input to the zero element, insert that into V1 if we
19661 // can do so cheaply.
19662 int NumElts = Mask.size();
19663 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
19664
19665 if (NumV2Elements == 1 && Mask[0] >= NumElts)
19666 if (SDValue Insertion = lowerShuffleAsElementInsertion(
19667 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
19668 return Insertion;
19669
19670 // Handle special cases where the lower or upper half is UNDEF.
19671 if (SDValue V =
19672 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
19673 return V;
19674
19675 // Check for being able to broadcast a single element.
19676 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
19677 Subtarget, DAG))
19678 return Broadcast;
19679
19680 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) {
19681 // Try using bit ops for masking and blending before falling back to
19682 // splitting.
19683 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
19684 Subtarget, DAG))
19685 return V;
19686 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
19687 return V;
19688
19689 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
19690 }
19691
19692 if (VT == MVT::v32f16) {
19693 V1 = DAG.getBitcast(MVT::v32i16, V1);
19694 V2 = DAG.getBitcast(MVT::v32i16, V2);
19695 return DAG.getBitcast(MVT::v32f16,
19696 DAG.getVectorShuffle(MVT::v32i16, DL, V1, V2, Mask));
19697 }
19698
19699 // Dispatch to each element type for lowering. If we don't have support for
19700 // specific element type shuffles at 512 bits, immediately split them and
19701 // lower them. Each lowering routine of a given type is allowed to assume that
19702 // the requisite ISA extensions for that element type are available.
19703 switch (VT.SimpleTy) {
19704 case MVT::v8f64:
19705 return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19706 case MVT::v16f32:
19707 return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19708 case MVT::v8i64:
19709 return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19710 case MVT::v16i32:
19711 return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19712 case MVT::v32i16:
19713 return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19714 case MVT::v64i8:
19715 return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19716
19717 default:
19718 llvm_unreachable("Not a valid 512-bit x86 vector type!")::llvm::llvm_unreachable_internal("Not a valid 512-bit x86 vector type!"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 19718)
;
19719 }
19720}
19721
19722static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef<int> Mask,
19723 MVT VT, SDValue V1, SDValue V2,
19724 const X86Subtarget &Subtarget,
19725 SelectionDAG &DAG) {
19726 // Shuffle should be unary.
19727 if (!V2.isUndef())
19728 return SDValue();
19729
19730 int ShiftAmt = -1;
19731 int NumElts = Mask.size();
19732 for (int i = 0; i != NumElts; ++i) {
19733 int M = Mask[i];
19734    assert((M == SM_SentinelUndef || (0 <= M && M < NumElts)) &&
19735           "Unexpected mask index.");
19736 if (M < 0)
19737 continue;
19738
19739 // The first non-undef element determines our shift amount.
19740 if (ShiftAmt < 0) {
19741 ShiftAmt = M - i;
19742 // Need to be shifting right.
19743 if (ShiftAmt <= 0)
19744 return SDValue();
19745 }
19746 // All non-undef elements must shift by the same amount.
19747 if (ShiftAmt != M - i)
19748 return SDValue();
19749 }
19750  assert(ShiftAmt >= 0 && "All undef?");
19751
19752 // Great we found a shift right.
19753 MVT WideVT = VT;
19754 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
19755 WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
19756 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT,
19757 DAG.getUNDEF(WideVT), V1,
19758 DAG.getIntPtrConstant(0, DL));
19759 Res = DAG.getNode(X86ISD::KSHIFTR, DL, WideVT, Res,
19760 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
19761 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
19762 DAG.getIntPtrConstant(0, DL));
19763}
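lower1BitShuffleAsKSHIFTR above recognizes a unary i1 shuffle whose defined elements all satisfy Mask[i] == i + Amt and lowers it as a (possibly widened) KSHIFTR, which shifts zeros in at the top. The scalar model below is illustrative only, not LLVM code; matchMaskAsShiftRight is a hypothetical name.

#include <cassert>
#include <cstdint>
#include <vector>

// Returns the uniform right-shift amount, or -1 if the mask is not a shift.
static int matchMaskAsShiftRight(const std::vector<int> &Mask) {
  int ShiftAmt = -1;
  for (int i = 0, e = (int)Mask.size(); i != e; ++i) {
    if (Mask[i] < 0)
      continue;                 // undef element, no constraint
    int Amt = Mask[i] - i;
    if (ShiftAmt < 0)
      ShiftAmt = Amt;
    if (Amt <= 0 || Amt != ShiftAmt)
      return -1;
  }
  return ShiftAmt;
}

int main() {
  // v8i1 mask picking elements 2..7 into positions 0..5 (top two undef).
  std::vector<int> Mask = {2, 3, 4, 5, 6, 7, -1, -1};
  int Amt = matchMaskAsShiftRight(Mask);
  assert(Amt == 2);
  // Model the k-register as bits: moving bits b2..b7 down is a right shift.
  uint8_t K = 0b10110100;
  assert((uint8_t)(K >> Amt) == 0b00101101);
}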
19764
19765// Determine if this shuffle can be implemented with a KSHIFT instruction.
19766// Returns the shift amount if possible or -1 if not. This is a simplified
19767// version of matchShuffleAsShift.
19768static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
19769 int MaskOffset, const APInt &Zeroable) {
19770 int Size = Mask.size();
19771
19772 auto CheckZeros = [&](int Shift, bool Left) {
19773 for (int j = 0; j < Shift; ++j)
19774 if (!Zeroable[j + (Left ? 0 : (Size - Shift))])
19775 return false;
19776
19777 return true;
19778 };
19779
19780 auto MatchShift = [&](int Shift, bool Left) {
19781 unsigned Pos = Left ? Shift : 0;
19782 unsigned Low = Left ? 0 : Shift;
19783 unsigned Len = Size - Shift;
19784 return isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset);
19785 };
19786
19787 for (int Shift = 1; Shift != Size; ++Shift)
19788 for (bool Left : {true, false})
19789 if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) {
19790 Opcode = Left ? X86ISD::KSHIFTL : X86ISD::KSHIFTR;
19791 return Shift;
19792 }
19793
19794 return -1;
19795}
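match1BitShuffleAsKSHIFT accepts a left or right shift when the surviving elements are sequential and the positions shifted in are provably zero. The sketch below reproduces that rule for a single input; kshiftKind is a hypothetical helper, the bitset stands in for the Zeroable APInt, and the real code additionally applies MaskOffset to handle V2.

#include <bitset>
#include <cassert>
#include <vector>

enum class KShift { None, Left, Right };

static KShift kshiftKind(const std::vector<int> &Mask,
                         const std::bitset<64> &Zeroable, int &ShiftOut) {
  int Size = (int)Mask.size();
  auto SeqOrUndef = [&](int Pos, int Len, int Low) {
    for (int i = 0; i < Len; ++i)
      if (Mask[Pos + i] >= 0 && Mask[Pos + i] != Low + i)
        return false;
    return true;
  };
  for (int Shift = 1; Shift != Size; ++Shift)
    for (bool Left : {true, false}) {
      bool ZerosOk = true;
      // The Shift elements that fall off one end must be known zero.
      for (int j = 0; j < Shift; ++j)
        if (!Zeroable[j + (Left ? 0 : Size - Shift)])
          ZerosOk = false;
      if (ZerosOk && SeqOrUndef(Left ? Shift : 0, Size - Shift, Left ? 0 : Shift)) {
        ShiftOut = Shift;
        return Left ? KShift::Left : KShift::Right;
      }
    }
  return KShift::None;
}

int main() {
  int Amt = 0;
  // {zero, 0, 1, 2}: element 0 is zeroable, the rest shift up -> KSHIFTL by 1.
  assert(kshiftKind({-1, 0, 1, 2}, std::bitset<64>(0b0001), Amt) == KShift::Left && Amt == 1);
  // {1, 2, 3, zero}: element 3 is zeroable, the rest shift down -> KSHIFTR by 1.
  assert(kshiftKind({1, 2, 3, -1}, std::bitset<64>(0b1000), Amt) == KShift::Right && Amt == 1);
}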
19796
19797
19798// Lower vXi1 vector shuffles.
19799 // There is no dedicated instruction on AVX-512 that shuffles the masks.
19800 // The only way to shuffle bits is to sign-extend the mask vector to a SIMD
19801 // vector, shuffle, and then truncate it back.
19802static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
19803 MVT VT, SDValue V1, SDValue V2,
19804 const APInt &Zeroable,
19805 const X86Subtarget &Subtarget,
19806 SelectionDAG &DAG) {
19807  assert(Subtarget.hasAVX512() &&
19808         "Cannot lower 512-bit vectors w/o basic ISA!");
19809
19810 int NumElts = Mask.size();
19811
19812 // Try to recognize shuffles that are just padding a subvector with zeros.
19813 int SubvecElts = 0;
19814 int Src = -1;
19815 for (int i = 0; i != NumElts; ++i) {
19816 if (Mask[i] >= 0) {
19817 // Grab the source from the first valid mask. All subsequent elements need
19818 // to use this same source.
19819 if (Src < 0)
19820 Src = Mask[i] / NumElts;
19821 if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i)
19822 break;
19823 }
19824
19825 ++SubvecElts;
19826 }
19827  assert(SubvecElts != NumElts && "Identity shuffle?");
19828
19829  // Clip to a power of 2.
19830 SubvecElts = llvm::bit_floor<uint32_t>(SubvecElts);
19831
19832 // Make sure the number of zeroable bits in the top at least covers the bits
19833 // not covered by the subvector.
19834 if ((int)Zeroable.countl_one() >= (NumElts - SubvecElts)) {
19835    assert(Src >= 0 && "Expected a source!");
19836 MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
19837 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT,
19838 Src == 0 ? V1 : V2,
19839 DAG.getIntPtrConstant(0, DL));
19840 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
19841 DAG.getConstant(0, DL, VT),
19842 Extract, DAG.getIntPtrConstant(0, DL));
19843 }
19844
19845 // Try a simple shift right with undef elements. Later we'll try with zeros.
19846 if (SDValue Shift = lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget,
19847 DAG))
19848 return Shift;
19849
19850 // Try to match KSHIFTs.
19851 unsigned Offset = 0;
19852 for (SDValue V : { V1, V2 }) {
19853 unsigned Opcode;
19854 int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable);
19855 if (ShiftAmt >= 0) {
19856 MVT WideVT = VT;
19857 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
19858 WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
19859 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT,
19860 DAG.getUNDEF(WideVT), V,
19861 DAG.getIntPtrConstant(0, DL));
19862 // Widened right shifts need two shifts to ensure we shift in zeroes.
19863 if (Opcode == X86ISD::KSHIFTR && WideVT != VT) {
19864 int WideElts = WideVT.getVectorNumElements();
19865 // Shift left to put the original vector in the MSBs of the new size.
19866 Res = DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res,
19867 DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8));
19868 // Increase the shift amount to account for the left shift.
19869 ShiftAmt += WideElts - NumElts;
19870 }
19871
19872 Res = DAG.getNode(Opcode, DL, WideVT, Res,
19873 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
19874 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
19875 DAG.getIntPtrConstant(0, DL));
19876 }
19877 Offset += NumElts; // Increment for next iteration.
19878 }
19879
19880 // If we're broadcasting a SETCC result, try to broadcast the ops instead.
19881 // TODO: What other unary shuffles would benefit from this?
19882 if (isBroadcastShuffleMask(Mask) && V1.getOpcode() == ISD::SETCC &&
19883 V1->hasOneUse()) {
19884 SDValue Op0 = V1.getOperand(0);
19885 SDValue Op1 = V1.getOperand(1);
19886 ISD::CondCode CC = cast<CondCodeSDNode>(V1.getOperand(2))->get();
19887 EVT OpVT = Op0.getValueType();
19888 return DAG.getSetCC(
19889 DL, VT, DAG.getVectorShuffle(OpVT, DL, Op0, DAG.getUNDEF(OpVT), Mask),
19890 DAG.getVectorShuffle(OpVT, DL, Op1, DAG.getUNDEF(OpVT), Mask), CC);
19891 }
19892
19893 MVT ExtVT;
19894 switch (VT.SimpleTy) {
19895 default:
19896 llvm_unreachable("Expected a vector of i1 elements")::llvm::llvm_unreachable_internal("Expected a vector of i1 elements"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 19896)
;
19897 case MVT::v2i1:
19898 ExtVT = MVT::v2i64;
19899 break;
19900 case MVT::v4i1:
19901 ExtVT = MVT::v4i32;
19902 break;
19903 case MVT::v8i1:
19904 // Take 512-bit type, more shuffles on KNL. If we have VLX use a 256-bit
19905 // shuffle.
19906 ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
19907 break;
19908 case MVT::v16i1:
19909 // Take 512-bit type, unless we are avoiding 512-bit types and have the
19910 // 256-bit operation available.
19911 ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
19912 break;
19913 case MVT::v32i1:
19914 // Take 512-bit type, unless we are avoiding 512-bit types and have the
19915 // 256-bit operation available.
19916    assert(Subtarget.hasBWI() && "Expected AVX512BW support");
19917 ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
19918 break;
19919 case MVT::v64i1:
19920 // Fall back to scalarization. FIXME: We can do better if the shuffle
19921 // can be partitioned cleanly.
19922 if (!Subtarget.useBWIRegs())
19923 return SDValue();
19924 ExtVT = MVT::v64i8;
19925 break;
19926 }
19927
19928 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
19929 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
19930
19931 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
19932  // Since i1 was sign-extended, we can use X86ISD::CVT2MASK.
19933 int NumElems = VT.getVectorNumElements();
19934 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
19935 (Subtarget.hasDQI() && (NumElems < 32)))
19936 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
19937 Shuffle, ISD::SETGT);
19938
19939 return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
19940}
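The fallback path above sign-extends the i1 mask so each bit becomes an all-ones or all-zeros SIMD lane, shuffles the wide lanes, and then collapses the result back to bits with a signed compare against zero. A scalar model of that round trip (illustrative only, not LLVM code):

#include <cassert>
#include <cstdint>
#include <vector>

static uint32_t shuffleMaskBits(uint32_t Bits, const std::vector<int> &Mask) {
  // Sign-extend: bit i -> lane value -1 (true) or 0 (false).
  std::vector<int64_t> Lanes(Mask.size());
  for (size_t i = 0; i < Mask.size(); ++i)
    Lanes[i] = ((Bits >> i) & 1) ? -1 : 0;
  // Shuffle the wide lanes (undef indices become zero lanes here).
  std::vector<int64_t> Shuf(Mask.size());
  for (size_t i = 0; i < Mask.size(); ++i)
    Shuf[i] = Mask[i] < 0 ? 0 : Lanes[Mask[i]];
  // Truncate back: a negative lane (i.e. 0 > lane) regenerates the mask bit.
  uint32_t Out = 0;
  for (size_t i = 0; i < Mask.size(); ++i)
    if (Shuf[i] < 0)
      Out |= 1u << i;
  return Out;
}

int main() {
  // Reverse an 8-bit mask: 0b00001101 -> 0b10110000.
  std::vector<int> Reverse = {7, 6, 5, 4, 3, 2, 1, 0};
  assert(shuffleMaskBits(0b00001101, Reverse) == 0b10110000);
}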
19941
19942/// Helper function that returns true if the shuffle mask should be
19943/// commuted to improve canonicalization.
19944static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
19945 int NumElements = Mask.size();
19946
19947 int NumV1Elements = 0, NumV2Elements = 0;
19948 for (int M : Mask)
19949 if (M < 0)
19950 continue;
19951 else if (M < NumElements)
19952 ++NumV1Elements;
19953 else
19954 ++NumV2Elements;
19955
19956 // Commute the shuffle as needed such that more elements come from V1 than
19957 // V2. This allows us to match the shuffle pattern strictly on how many
19958 // elements come from V1 without handling the symmetric cases.
19959 if (NumV2Elements > NumV1Elements)
19960 return true;
19961
19962  assert(NumV1Elements > 0 && "No V1 indices");
19963
19964 if (NumV2Elements == 0)
19965 return false;
19966
19967  // When the number of V1 and V2 elements is the same, try to minimize the
19968  // number of uses of V2 in the low half of the vector. When that is tied,
19969  // ensure that the sum of indices for V1 is equal to or lower than the sum of
19970  // indices for V2. When those are equal, try to ensure that the number of odd
19971  // indices for V1 is lower than the number of odd indices for V2.
19972 if (NumV1Elements == NumV2Elements) {
19973 int LowV1Elements = 0, LowV2Elements = 0;
19974 for (int M : Mask.slice(0, NumElements / 2))
19975 if (M >= NumElements)
19976 ++LowV2Elements;
19977 else if (M >= 0)
19978 ++LowV1Elements;
19979 if (LowV2Elements > LowV1Elements)
19980 return true;
19981 if (LowV2Elements == LowV1Elements) {
19982 int SumV1Indices = 0, SumV2Indices = 0;
19983 for (int i = 0, Size = Mask.size(); i < Size; ++i)
19984 if (Mask[i] >= NumElements)
19985 SumV2Indices += i;
19986 else if (Mask[i] >= 0)
19987 SumV1Indices += i;
19988 if (SumV2Indices < SumV1Indices)
19989 return true;
19990 if (SumV2Indices == SumV1Indices) {
19991 int NumV1OddIndices = 0, NumV2OddIndices = 0;
19992 for (int i = 0, Size = Mask.size(); i < Size; ++i)
19993 if (Mask[i] >= NumElements)
19994 NumV2OddIndices += i % 2;
19995 else if (Mask[i] >= 0)
19996 NumV1OddIndices += i % 2;
19997 if (NumV2OddIndices < NumV1OddIndices)
19998 return true;
19999 }
20000 }
20001 }
20002
20003 return false;
20004}
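The first rule above can be seen on a small example: when more mask elements reference V2 than V1, the shuffle is commuted and every index is remapped across the operand boundary. The standalone copy below only models the LLVM helpers; shouldCommute and commuteMask are simplified stand-ins for canonicalizeShuffleMaskWithCommute's first check and ShuffleVectorSDNode::commuteMask.

#include <cassert>
#include <vector>

static bool shouldCommute(const std::vector<int> &Mask) {
  int NumElts = (int)Mask.size(), NumV1 = 0, NumV2 = 0;
  for (int M : Mask) {
    if (M < 0)
      continue;
    (M < NumElts ? NumV1 : NumV2)++;
  }
  return NumV2 > NumV1;
}

// Commuting swaps V1/V2 and remaps each index across the operand boundary.
static void commuteMask(std::vector<int> &Mask) {
  int NumElts = (int)Mask.size();
  for (int &M : Mask)
    if (M >= 0)
      M = M < NumElts ? M + NumElts : M - NumElts;
}

int main() {
  // Three of four elements come from V2, so the shuffle is commuted.
  std::vector<int> Mask = {4, 5, 6, 0};
  assert(shouldCommute(Mask));
  commuteMask(Mask);
  assert((Mask == std::vector<int>{0, 1, 2, 4}));
  assert(!shouldCommute(Mask));
}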
20005
20006static bool canCombineAsMaskOperation(SDValue V1, SDValue V2,
20007 const X86Subtarget &Subtarget) {
20008 if (!Subtarget.hasAVX512())
20009 return false;
20010
20011 MVT VT = V1.getSimpleValueType().getScalarType();
20012 if ((VT == MVT::i16 || VT == MVT::i8) && !Subtarget.hasBWI())
20013 return false;
20014
20015 // If vec width < 512, widen i8/i16 even with BWI as blendd/blendps/blendpd
20016 // are preferable to blendw/blendvb/masked-mov.
20017 if ((VT == MVT::i16 || VT == MVT::i8) &&
20018 V1.getSimpleValueType().getSizeInBits() < 512)
20019 return false;
20020
20021 auto HasMaskOperation = [&](SDValue V) {
20022    // TODO: Currently we only check a limited set of opcodes. We could probably
20023    // extend this to all binary operations by checking TLI.isBinOp().
20024 switch (V->getOpcode()) {
20025 default:
20026 return false;
20027 case ISD::ADD:
20028 case ISD::SUB:
20029 case ISD::AND:
20030 case ISD::XOR:
20031 case ISD::OR:
20032 case ISD::SMAX:
20033 case ISD::SMIN:
20034 case ISD::UMAX:
20035 case ISD::UMIN:
20036 case ISD::ABS:
20037 case ISD::SHL:
20038 case ISD::SRL:
20039 case ISD::SRA:
20040 case ISD::MUL:
20041 break;
20042 }
20043 if (!V->hasOneUse())
20044 return false;
20045
20046 return true;
20047 };
20048
20049 if (HasMaskOperation(V1) || HasMaskOperation(V2))
20050 return true;
20051
20052 return false;
20053}
20054
20055// Forward declaration.
20056static SDValue canonicalizeShuffleMaskWithHorizOp(
20057 MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
20058 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
20059 const X86Subtarget &Subtarget);
20060
20061 /// Top-level lowering for x86 vector shuffles.
20062///
20063/// This handles decomposition, canonicalization, and lowering of all x86
20064/// vector shuffles. Most of the specific lowering strategies are encapsulated
20065/// above in helper routines. The canonicalization attempts to widen shuffles
20066/// to involve fewer lanes of wider elements, consolidate symmetric patterns
20067/// s.t. only one of the two inputs needs to be tested, etc.
20068static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget,
20069 SelectionDAG &DAG) {
20070 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
20071 ArrayRef<int> OrigMask = SVOp->getMask();
20072 SDValue V1 = Op.getOperand(0);
20073 SDValue V2 = Op.getOperand(1);
20074 MVT VT = Op.getSimpleValueType();
20075 int NumElements = VT.getVectorNumElements();
20076 SDLoc DL(Op);
20077 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
20078
20079  assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
20080         "Can't lower MMX shuffles");
20081
20082 bool V1IsUndef = V1.isUndef();
20083 bool V2IsUndef = V2.isUndef();
20084 if (V1IsUndef && V2IsUndef)
20085 return DAG.getUNDEF(VT);
20086
20087  // When we create a shuffle node we put the UNDEF node as the second operand,
20088  // but in some cases the first operand may be transformed to UNDEF.
20089  // In this case we should just commute the node.
20090 if (V1IsUndef)
20091 return DAG.getCommutedVectorShuffle(*SVOp);
20092
20093 // Check for non-undef masks pointing at an undef vector and make the masks
20094 // undef as well. This makes it easier to match the shuffle based solely on
20095 // the mask.
20096 if (V2IsUndef &&
20097 any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
20098 SmallVector<int, 8> NewMask(OrigMask);
20099 for (int &M : NewMask)
20100 if (M >= NumElements)
20101 M = -1;
20102 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
20103 }
20104
20105 // Check for illegal shuffle mask element index values.
20106 int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
20107 (void)MaskUpperLimit;
20108  assert(llvm::all_of(OrigMask,
20109                      [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
20110         "Out of bounds shuffle index");
20111
20112 // We actually see shuffles that are entirely re-arrangements of a set of
20113 // zero inputs. This mostly happens while decomposing complex shuffles into
20114 // simple ones. Directly lower these as a buildvector of zeros.
20115 APInt KnownUndef, KnownZero;
20116 computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero);
20117
20118 APInt Zeroable = KnownUndef | KnownZero;
20119 if (Zeroable.isAllOnes())
20120 return getZeroVector(VT, Subtarget, DAG, DL);
20121
20122 bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
20123
20124 // Try to collapse shuffles into using a vector type with fewer elements but
20125 // wider element types. We cap this to not form integers or floating point
20126 // elements wider than 64 bits. It does not seem beneficial to form i128
20127 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
20128 SmallVector<int, 16> WidenedMask;
20129 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
20130 !canCombineAsMaskOperation(V1, V2, Subtarget) &&
20131 canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
20132 // Shuffle mask widening should not interfere with a broadcast opportunity
20133 // by obfuscating the operands with bitcasts.
20134 // TODO: Avoid lowering directly from this top-level function: make this
20135 // a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
20136 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask,
20137 Subtarget, DAG))
20138 return Broadcast;
20139
20140 MVT NewEltVT = VT.isFloatingPoint()
20141 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
20142 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
20143 int NewNumElts = NumElements / 2;
20144 MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
20145 // Make sure that the new vector type is legal. For example, v2f64 isn't
20146 // legal on SSE1.
20147 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
20148 if (V2IsZero) {
20149 // Modify the new Mask to take all zeros from the all-zero vector.
20150 // Choose indices that are blend-friendly.
20151 bool UsedZeroVector = false;
20152        assert(is_contained(WidenedMask, SM_SentinelZero) &&
20153               "V2's non-undef elements are used?!");
20154 for (int i = 0; i != NewNumElts; ++i)
20155 if (WidenedMask[i] == SM_SentinelZero) {
20156 WidenedMask[i] = i + NewNumElts;
20157 UsedZeroVector = true;
20158 }
20159 // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
20160 // some elements to be undef.
20161 if (UsedZeroVector)
20162 V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
20163 }
20164 V1 = DAG.getBitcast(NewVT, V1);
20165 V2 = DAG.getBitcast(NewVT, V2);
20166 return DAG.getBitcast(
20167 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
20168 }
20169 }
20170
20171 SmallVector<SDValue> Ops = {V1, V2};
20172 SmallVector<int> Mask(OrigMask);
20173
20174 // Canonicalize the shuffle with any horizontal ops inputs.
20175 // NOTE: This may update Ops and Mask.
20176 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
20177 Ops, Mask, VT.getSizeInBits(), DL, DAG, Subtarget))
20178 return DAG.getBitcast(VT, HOp);
20179
20180 V1 = DAG.getBitcast(VT, Ops[0]);
20181 V2 = DAG.getBitcast(VT, Ops[1]);
20182  assert(NumElements == (int)Mask.size() &&
20183         "canonicalizeShuffleMaskWithHorizOp "
20184         "shouldn't alter the shuffle mask size");
20185
20186 // Commute the shuffle if it will improve canonicalization.
20187 if (canonicalizeShuffleMaskWithCommute(Mask)) {
20188 ShuffleVectorSDNode::commuteMask(Mask);
20189 std::swap(V1, V2);
20190 }
20191
20192 // For each vector width, delegate to a specialized lowering routine.
20193 if (VT.is128BitVector())
20194 return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
20195
20196 if (VT.is256BitVector())
20197 return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
20198
20199 if (VT.is512BitVector())
20200 return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
20201
20202 if (Is1BitVector)
20203 return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
20204
20205   llvm_unreachable("Unimplemented!");
20206}
20207
20208/// Try to lower a VSELECT instruction to a vector shuffle.
20209static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
20210 const X86Subtarget &Subtarget,
20211 SelectionDAG &DAG) {
20212 SDValue Cond = Op.getOperand(0);
20213 SDValue LHS = Op.getOperand(1);
20214 SDValue RHS = Op.getOperand(2);
20215 MVT VT = Op.getSimpleValueType();
20216
20217   // Only non-legal VSELECTs reach this lowering; convert those into generic
20218   // shuffles and reuse the shuffle lowering path for blends.
20219 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
20220 SmallVector<int, 32> Mask;
20221 if (createShuffleMaskFromVSELECT(Mask, Cond))
20222 return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask);
20223 }
20224
20225 return SDValue();
20226}
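// --- Editor's illustrative sketch (not part of X86ISelLowering.cpp) ---
// A scalar model of the constant-condition path above: assuming
// createShuffleMaskFromVSELECT picks lane i from the LHS when Cond[i] is true
// and lane i + NumElts from the RHS otherwise, the mask can be built as below.
// The helper name and types are hypothetical.
#include <vector>

std::vector<int> shuffleMaskFromConstantCond(const std::vector<bool> &Cond) {
  const int NumElts = static_cast<int>(Cond.size());
  std::vector<int> Mask(NumElts);
  for (int i = 0; i != NumElts; ++i)
    Mask[i] = Cond[i] ? i : i + NumElts; // i selects LHS[i], i+NumElts selects RHS[i].
  return Mask;
}
// E.g. Cond = {1,0,0,1} for a v4i32 select yields Mask = {0,5,6,3}.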
20227
20228SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
20229 SDValue Cond = Op.getOperand(0);
20230 SDValue LHS = Op.getOperand(1);
20231 SDValue RHS = Op.getOperand(2);
20232
20233 SDLoc dl(Op);
20234 MVT VT = Op.getSimpleValueType();
20235 if (isSoftFP16(VT)) {
20236 MVT NVT = VT.changeVectorElementTypeToInteger();
20237 return DAG.getBitcast(VT, DAG.getNode(ISD::VSELECT, dl, NVT, Cond,
20238 DAG.getBitcast(NVT, LHS),
20239 DAG.getBitcast(NVT, RHS)));
20240 }
20241
20242 // A vselect where all conditions and data are constants can be optimized into
20243 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
20244 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
20245 ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
20246 ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
20247 return SDValue();
20248
20249 // Try to lower this to a blend-style vector shuffle. This can handle all
20250 // constant condition cases.
20251 if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
20252 return BlendOp;
20253
20254   // If this VSELECT has a vector of i1 as a mask, it will be directly matched
20255   // with patterns on the mask registers on AVX-512.
20256 MVT CondVT = Cond.getSimpleValueType();
20257 unsigned CondEltSize = Cond.getScalarValueSizeInBits();
20258 if (CondEltSize == 1)
20259 return Op;
20260
20261 // Variable blends are only legal from SSE4.1 onward.
20262 if (!Subtarget.hasSSE41())
20263 return SDValue();
20264
20265 unsigned EltSize = VT.getScalarSizeInBits();
20266 unsigned NumElts = VT.getVectorNumElements();
20267
20268 // Expand v32i16/v64i8 without BWI.
20269 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
20270 return SDValue();
20271
20272 // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
20273 // into an i1 condition so that we can use the mask-based 512-bit blend
20274 // instructions.
20275 if (VT.getSizeInBits() == 512) {
20276 // Build a mask by testing the condition against zero.
20277 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
20278 SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
20279 DAG.getConstant(0, dl, CondVT),
20280 ISD::SETNE);
20281 // Now return a new VSELECT using the mask.
20282 return DAG.getSelect(dl, VT, Mask, LHS, RHS);
20283 }
20284
20285 // SEXT/TRUNC cases where the mask doesn't match the destination size.
20286 if (CondEltSize != EltSize) {
20287 // If we don't have a sign splat, rely on the expansion.
20288 if (CondEltSize != DAG.ComputeNumSignBits(Cond))
20289 return SDValue();
20290
20291 MVT NewCondSVT = MVT::getIntegerVT(EltSize);
20292 MVT NewCondVT = MVT::getVectorVT(NewCondSVT, NumElts);
20293 Cond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT);
20294 return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS);
20295 }
20296
20297   // Only some types will be legal on some subtargets. If we can emit a legal
20298   // VSELECT-matching blend, return Op; if we need to expand, return a null
20299   // value.
20300 switch (VT.SimpleTy) {
20301 default:
20302 // Most of the vector types have blends past SSE4.1.
20303 return Op;
20304
20305 case MVT::v32i8:
20306 // The byte blends for AVX vectors were introduced only in AVX2.
20307 if (Subtarget.hasAVX2())
20308 return Op;
20309
20310 return SDValue();
20311
20312 case MVT::v8i16:
20313 case MVT::v16i16: {
20314 // Bitcast everything to the vXi8 type and use a vXi8 vselect.
20315 MVT CastVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
20316 Cond = DAG.getBitcast(CastVT, Cond);
20317 LHS = DAG.getBitcast(CastVT, LHS);
20318 RHS = DAG.getBitcast(CastVT, RHS);
20319 SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
20320 return DAG.getBitcast(VT, Select);
20321 }
20322 }
20323}
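// --- Editor's illustrative sketch (not part of X86ISelLowering.cpp) ---
// Why the v8i16/v16i16 case above may legally be handled as a vXi8 VSELECT
// after bitcasting: when every 16-bit condition lane is all-zeros or all-ones
// (x86's zero-or-negative-one boolean contents), splitting a lane into two
// bytes and selecting byte-wise yields the same bit pattern. A standalone,
// scalar model with hypothetical names:
#include <cstdint>

uint16_t select16(uint16_t Cond, uint16_t L, uint16_t R) {
  // Cond is assumed to be 0x0000 or 0xFFFF.
  return static_cast<uint16_t>((L & Cond) | (R & static_cast<uint16_t>(~Cond)));
}

uint16_t selectAsBytes(uint16_t Cond, uint16_t L, uint16_t R) {
  uint16_t Res = 0;
  for (int Byte = 0; Byte != 2; ++Byte) {
    uint16_t C = (Cond >> (8 * Byte)) & 0xFF;
    uint16_t Lb = (L >> (8 * Byte)) & 0xFF;
    uint16_t Rb = (R >> (8 * Byte)) & 0xFF;
    Res |= static_cast<uint16_t>(((Lb & C) | (Rb & ~C)) << (8 * Byte));
  }
  return Res; // Equal to select16(Cond, L, R) for the assumed Cond values.
}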
20324
20325static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
20326 MVT VT = Op.getSimpleValueType();
20327 SDValue Vec = Op.getOperand(0);
20328 SDValue Idx = Op.getOperand(1);
20329   assert(isa<ConstantSDNode>(Idx) && "Constant index expected");
20330 SDLoc dl(Op);
20331
20332 if (!Vec.getSimpleValueType().is128BitVector())
20333 return SDValue();
20334
20335 if (VT.getSizeInBits() == 8) {
20336 // If IdxVal is 0, it's cheaper to do a move instead of a pextrb, unless
20337 // we're going to zero extend the register or fold the store.
20338 if (llvm::isNullConstant(Idx) && !X86::mayFoldIntoZeroExtend(Op) &&
20339 !X86::mayFoldIntoStore(Op))
20340 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
20341 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
20342 DAG.getBitcast(MVT::v4i32, Vec), Idx));
20343
20344 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
20345 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec,
20346 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
20347 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
20348 }
20349
20350 if (VT == MVT::f32) {
20351 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
20352 // the result back to FR32 register. It's only worth matching if the
20353 // result has a single use which is a store or a bitcast to i32. And in
20354 // the case of a store, it's not worth it if the index is a constant 0,
20355 // because a MOVSSmr can be used instead, which is smaller and faster.
20356 if (!Op.hasOneUse())
20357 return SDValue();
20358 SDNode *User = *Op.getNode()->use_begin();
20359 if ((User->getOpcode() != ISD::STORE || isNullConstant(Idx)) &&
20360 (User->getOpcode() != ISD::BITCAST ||
20361 User->getValueType(0) != MVT::i32))
20362 return SDValue();
20363 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
20364 DAG.getBitcast(MVT::v4i32, Vec), Idx);
20365 return DAG.getBitcast(MVT::f32, Extract);
20366 }
20367
20368 if (VT == MVT::i32 || VT == MVT::i64)
20369 return Op;
20370
20371 return SDValue();
20372}
20373
20374/// Extract one bit from mask vector, like v16i1 or v8i1.
20375/// AVX-512 feature.
20376static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
20377 const X86Subtarget &Subtarget) {
20378 SDValue Vec = Op.getOperand(0);
20379 SDLoc dl(Vec);
20380 MVT VecVT = Vec.getSimpleValueType();
20381 SDValue Idx = Op.getOperand(1);
20382 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
20383 MVT EltVT = Op.getSimpleValueType();
20384
20385   assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
20386          "Unexpected vector type in ExtractBitFromMaskVector");
20387
20388   // A variable index can't be handled in mask registers,
20389   // so extend the vector to VR512/128.
20390 if (!IdxC) {
20391 unsigned NumElts = VecVT.getVectorNumElements();
20392     // Extending v8i1/v16i1 to 512-bit gets better performance on KNL
20393     // than extending to 128/256-bit.
20394 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
20395 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
20396 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
20397 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
20398 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
20399 }
20400
20401 unsigned IdxVal = IdxC->getZExtValue();
20402 if (IdxVal == 0) // the operation is legal
20403 return Op;
20404
20405 // Extend to natively supported kshift.
20406 unsigned NumElems = VecVT.getVectorNumElements();
20407 MVT WideVecVT = VecVT;
20408 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
20409 WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
20410 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
20411 DAG.getUNDEF(WideVecVT), Vec,
20412 DAG.getIntPtrConstant(0, dl));
20413 }
20414
20415 // Use kshiftr instruction to move to the lower element.
20416 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
20417 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
20418
20419 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
20420 DAG.getIntPtrConstant(0, dl));
20421}
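// --- Editor's illustrative sketch (not part of X86ISelLowering.cpp) ---
// The constant-index path above widens the mask to a legal k-register width,
// shifts the requested bit down to bit 0 with KSHIFTR, and extracts element 0.
// On a scalar model of a k-mask that is just a shift plus a mask (helper name
// is hypothetical):
#include <cstdint>

bool extractMaskBit(uint64_t KMask, unsigned IdxVal) {
  return (KMask >> IdxVal) & 1u; // KSHIFTR by IdxVal, then "extract element 0".
}
// extractMaskBit(0x0A, 3) == true, matching element 3 of a v8i1 mask 0b00001010.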
20422
20423SDValue
20424X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
20425 SelectionDAG &DAG) const {
20426 SDLoc dl(Op);
20427 SDValue Vec = Op.getOperand(0);
20428 MVT VecVT = Vec.getSimpleValueType();
20429 SDValue Idx = Op.getOperand(1);
20430 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
20431
20432 if (VecVT.getVectorElementType() == MVT::i1)
20433 return ExtractBitFromMaskVector(Op, DAG, Subtarget);
20434
20435 if (!IdxC) {
20436     // It's more profitable to go through memory (1 cycle throughput)
20437     // than to use a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
20438     // The IACA tool was used to get the performance estimates
20439 // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
20440 //
20441 // example : extractelement <16 x i8> %a, i32 %i
20442 //
20443 // Block Throughput: 3.00 Cycles
20444 // Throughput Bottleneck: Port5
20445 //
20446 // | Num Of | Ports pressure in cycles | |
20447 // | Uops | 0 - DV | 5 | 6 | 7 | |
20448 // ---------------------------------------------
20449 // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
20450 // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
20451 // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
20452 // Total Num Of Uops: 4
20453 //
20454 //
20455 // Block Throughput: 1.00 Cycles
20456 // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
20457 //
20458 // | | Ports pressure in cycles | |
20459 // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
20460 // ---------------------------------------------------------
20461 // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
20462 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
20463 // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
20464 // Total Num Of Uops: 4
20465
20466 return SDValue();
20467 }
20468
20469 unsigned IdxVal = IdxC->getZExtValue();
20470
20471   // If this is a 256-bit or 512-bit vector result, first extract the containing
20472   // 128-bit subvector and then extract the element from it.
20473 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
20474 // Get the 128-bit vector.
20475 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
20476 MVT EltVT = VecVT.getVectorElementType();
20477
20478 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
20479     assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
20480
20481 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
20482 // this can be done with a mask.
20483 IdxVal &= ElemsPerChunk - 1;
20484 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
20485 DAG.getIntPtrConstant(IdxVal, dl));
20486 }
20487
20488   assert(VecVT.is128BitVector() && "Unexpected vector length");
20489
20490 MVT VT = Op.getSimpleValueType();
20491
20492 if (VT == MVT::i16) {
20493 // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
20494 // we're going to zero extend the register or fold the store (SSE41 only).
20495 if (IdxVal == 0 && !X86::mayFoldIntoZeroExtend(Op) &&
20496 !(Subtarget.hasSSE41() && X86::mayFoldIntoStore(Op))) {
20497 if (Subtarget.hasFP16())
20498 return Op;
20499
20500 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
20501 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
20502 DAG.getBitcast(MVT::v4i32, Vec), Idx));
20503 }
20504
20505 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec,
20506 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
20507 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
20508 }
20509
20510 if (Subtarget.hasSSE41())
20511 if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
20512 return Res;
20513
20514   // TODO: We only extract a single element from v16i8; we can probably afford
20515   // to be more aggressive here before using the default approach of spilling to
20516   // stack.
20517 if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) {
20518 // Extract either the lowest i32 or any i16, and extract the sub-byte.
20519 int DWordIdx = IdxVal / 4;
20520 if (DWordIdx == 0) {
20521 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
20522 DAG.getBitcast(MVT::v4i32, Vec),
20523 DAG.getIntPtrConstant(DWordIdx, dl));
20524 int ShiftVal = (IdxVal % 4) * 8;
20525 if (ShiftVal != 0)
20526 Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
20527 DAG.getConstant(ShiftVal, dl, MVT::i8));
20528 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
20529 }
20530
20531 int WordIdx = IdxVal / 2;
20532 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
20533 DAG.getBitcast(MVT::v8i16, Vec),
20534 DAG.getIntPtrConstant(WordIdx, dl));
20535 int ShiftVal = (IdxVal % 2) * 8;
20536 if (ShiftVal != 0)
20537 Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
20538 DAG.getConstant(ShiftVal, dl, MVT::i8));
20539 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
20540 }
20541
20542 if (VT == MVT::f16 || VT.getSizeInBits() == 32) {
20543 if (IdxVal == 0)
20544 return Op;
20545
20546 // Shuffle the element to the lowest element, then movss or movsh.
20547 SmallVector<int, 8> Mask(VecVT.getVectorNumElements(), -1);
20548 Mask[0] = static_cast<int>(IdxVal);
20549 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
20550 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
20551 DAG.getIntPtrConstant(0, dl));
20552 }
20553
20554 if (VT.getSizeInBits() == 64) {
20555 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
20556 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
20557 // to match extract_elt for f64.
20558 if (IdxVal == 0)
20559 return Op;
20560
20561 // UNPCKHPD the element to the lowest double word, then movsd.
20562 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
20563 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
20564 int Mask[2] = { 1, -1 };
20565 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
20566 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
20567 DAG.getIntPtrConstant(0, dl));
20568 }
20569
20570 return SDValue();
20571}
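// --- Editor's illustrative sketch (not part of X86ISelLowering.cpp) ---
// The v16i8 path above avoids PEXTRB by extracting the 32-bit (or 16-bit)
// lane that contains the requested byte and shifting it into place. Modeling
// the 128-bit vector as four little-endian uint32_t lanes (names are
// hypothetical):
#include <cstdint>

uint8_t extractByteViaDword(const uint32_t Lanes[4], unsigned IdxVal) {
  uint32_t Dword = Lanes[IdxVal / 4];   // EXTRACT_VECTOR_ELT on the v4i32 bitcast.
  unsigned ShiftVal = (IdxVal % 4) * 8; // Byte position within the dword.
  return static_cast<uint8_t>(Dword >> ShiftVal); // SRL + TRUNCATE to i8.
}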
20572
20573/// Insert one bit to mask vector, like v16i1 or v8i1.
20574/// AVX-512 feature.
20575static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
20576 const X86Subtarget &Subtarget) {
20577 SDLoc dl(Op);
20578 SDValue Vec = Op.getOperand(0);
20579 SDValue Elt = Op.getOperand(1);
20580 SDValue Idx = Op.getOperand(2);
20581 MVT VecVT = Vec.getSimpleValueType();
20582
20583 if (!isa<ConstantSDNode>(Idx)) {
20584     // Non-constant index: extend the source and destination,
20585     // insert the element, and then truncate the result.
20586 unsigned NumElts = VecVT.getVectorNumElements();
20587 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
20588 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
20589 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
20590 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
20591 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
20592 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
20593 }
20594
20595 // Copy into a k-register, extract to v1i1 and insert_subvector.
20596 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
20597 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec, Idx);
20598}
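// --- Editor's illustrative sketch (not part of X86ISelLowering.cpp) ---
// For a constant index the routine above inserts a v1i1 subvector, i.e. it
// overwrites one bit of the mask. A scalar model of that operation (helper
// name is hypothetical):
#include <cstdint>

uint64_t insertMaskBit(uint64_t KMask, unsigned IdxVal, bool Bit) {
  return (KMask & ~(uint64_t(1) << IdxVal)) | (uint64_t(Bit) << IdxVal);
}
// insertMaskBit(0x0F, 2, false) == 0x0B, i.e. v4i1 mask 1111 -> 1011.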
20599
20600SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
20601 SelectionDAG &DAG) const {
20602 MVT VT = Op.getSimpleValueType();
20603 MVT EltVT = VT.getVectorElementType();
20604 unsigned NumElts = VT.getVectorNumElements();
20605 unsigned EltSizeInBits = EltVT.getScalarSizeInBits();
20606
20607 if (EltVT == MVT::i1)
20608 return InsertBitToMaskVector(Op, DAG, Subtarget);
20609
20610 SDLoc dl(Op);
20611 SDValue N0 = Op.getOperand(0);
20612 SDValue N1 = Op.getOperand(1);
20613 SDValue N2 = Op.getOperand(2);
20614 auto *N2C = dyn_cast<ConstantSDNode>(N2);
20615
20616 if (!N2C) {
20617     // For variable insertion indices we're usually better off spilling to stack,
20618     // but AVX512 can use a variable compare+select by comparing against all
20619     // possible vector indices, and FP insertion has less gpr->simd traffic.
20620 if (!(Subtarget.hasBWI() ||
20621 (Subtarget.hasAVX512() && EltSizeInBits >= 32) ||
20622 (Subtarget.hasSSE41() && (EltVT == MVT::f32 || EltVT == MVT::f64))))
20623 return SDValue();
20624
20625 MVT IdxSVT = MVT::getIntegerVT(EltSizeInBits);
20626 MVT IdxVT = MVT::getVectorVT(IdxSVT, NumElts);
20627 if (!isTypeLegal(IdxSVT) || !isTypeLegal(IdxVT))
20628 return SDValue();
20629
20630 SDValue IdxExt = DAG.getZExtOrTrunc(N2, dl, IdxSVT);
20631 SDValue IdxSplat = DAG.getSplatBuildVector(IdxVT, dl, IdxExt);
20632 SDValue EltSplat = DAG.getSplatBuildVector(VT, dl, N1);
20633
20634 SmallVector<SDValue, 16> RawIndices;
20635 for (unsigned I = 0; I != NumElts; ++I)
20636 RawIndices.push_back(DAG.getConstant(I, dl, IdxSVT));
20637 SDValue Indices = DAG.getBuildVector(IdxVT, dl, RawIndices);
20638
20639 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
20640 return DAG.getSelectCC(dl, IdxSplat, Indices, EltSplat, N0,
20641 ISD::CondCode::SETEQ);
20642 }
20643
20644 if (N2C->getAPIntValue().uge(NumElts))
20645 return SDValue();
20646 uint64_t IdxVal = N2C->getZExtValue();
20647
20648 bool IsZeroElt = X86::isZeroNode(N1);
20649 bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
20650
20651 if (IsZeroElt || IsAllOnesElt) {
20652     // Lower insertion of v16i8/v32i8/v16i16 -1 elts as an 'OR' blend.
20653     // We don't deal with i8 0 since it appears to be handled elsewhere.
20654 if (IsAllOnesElt &&
20655 ((VT == MVT::v16i8 && !Subtarget.hasSSE41()) ||
20656 ((VT == MVT::v32i8 || VT == MVT::v16i16) && !Subtarget.hasInt256()))) {
20657 SDValue ZeroCst = DAG.getConstant(0, dl, VT.getScalarType());
20658 SDValue OnesCst = DAG.getAllOnesConstant(dl, VT.getScalarType());
20659 SmallVector<SDValue, 8> CstVectorElts(NumElts, ZeroCst);
20660 CstVectorElts[IdxVal] = OnesCst;
20661 SDValue CstVector = DAG.getBuildVector(VT, dl, CstVectorElts);
20662 return DAG.getNode(ISD::OR, dl, VT, N0, CstVector);
20663 }
20664 // See if we can do this more efficiently with a blend shuffle with a
20665 // rematerializable vector.
20666 if (Subtarget.hasSSE41() &&
20667 (EltSizeInBits >= 16 || (IsZeroElt && !VT.is128BitVector()))) {
20668 SmallVector<int, 8> BlendMask;
20669 for (unsigned i = 0; i != NumElts; ++i)
20670 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
20671 SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
20672 : getOnesVector(VT, DAG, dl);
20673 return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
20674 }
20675 }
20676
20677 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
20678 // into that, and then insert the subvector back into the result.
20679 if (VT.is256BitVector() || VT.is512BitVector()) {
20680 // With a 256-bit vector, we can insert into the zero element efficiently
20681 // using a blend if we have AVX or AVX2 and the right data type.
20682 if (VT.is256BitVector() && IdxVal == 0) {
20683 // TODO: It is worthwhile to cast integer to floating point and back
20684 // and incur a domain crossing penalty if that's what we'll end up
20685 // doing anyway after extracting to a 128-bit vector.
20686 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
20687 (Subtarget.hasAVX2() && (EltVT == MVT::i32 || EltVT == MVT::i64))) {
20688 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
20689 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec,
20690 DAG.getTargetConstant(1, dl, MVT::i8));
20691 }
20692 }
20693
20694 unsigned NumEltsIn128 = 128 / EltSizeInBits;
20695     assert(isPowerOf2_32(NumEltsIn128) &&
20696            "Vectors will always have power-of-two number of elements.");
20697
20698 // If we are not inserting into the low 128-bit vector chunk,
20699 // then prefer the broadcast+blend sequence.
20700 // FIXME: relax the profitability check iff all N1 uses are insertions.
20701 if (IdxVal >= NumEltsIn128 &&
20702 ((Subtarget.hasAVX2() && EltSizeInBits != 8) ||
20703 (Subtarget.hasAVX() && (EltSizeInBits >= 32) &&
20704 X86::mayFoldLoad(N1, Subtarget)))) {
20705 SDValue N1SplatVec = DAG.getSplatBuildVector(VT, dl, N1);
20706 SmallVector<int, 8> BlendMask;
20707 for (unsigned i = 0; i != NumElts; ++i)
20708 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
20709 return DAG.getVectorShuffle(VT, dl, N0, N1SplatVec, BlendMask);
20710 }
20711
20712 // Get the desired 128-bit vector chunk.
20713 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
20714
20715 // Insert the element into the desired chunk.
20716 // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
20717 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
20718
20719 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
20720 DAG.getIntPtrConstant(IdxIn128, dl));
20721
20722 // Insert the changed part back into the bigger vector
20723 return insert128BitVector(N0, V, IdxVal, DAG, dl);
20724 }
20725   assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
20726
20727 // This will be just movw/movd/movq/movsh/movss/movsd.
20728 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) {
20729 if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
20730 EltVT == MVT::f16 || EltVT == MVT::i64) {
20731 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
20732 return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
20733 }
20734
20735 // We can't directly insert an i8 or i16 into a vector, so zero extend
20736 // it to i32 first.
20737 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
20738 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, N1);
20739 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
20740 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, N1);
20741 N1 = getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
20742 return DAG.getBitcast(VT, N1);
20743 }
20744 }
20745
20746   // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
20747   // argument. SSE41 is required for pinsrb.
20748 if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
20749 unsigned Opc;
20750 if (VT == MVT::v8i16) {
20751       assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
20752 Opc = X86ISD::PINSRW;
20753 } else {
20754       assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
20755       assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
20756 Opc = X86ISD::PINSRB;
20757 }
20758
20759     assert(N1.getValueType() != MVT::i32 && "Unexpected VT");
20760 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
20761 N2 = DAG.getTargetConstant(IdxVal, dl, MVT::i8);
20762 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
20763 }
20764
20765 if (Subtarget.hasSSE41()) {
20766 if (EltVT == MVT::f32) {
20767 // Bits [7:6] of the constant are the source select. This will always be
20768 // zero here. The DAG Combiner may combine an extract_elt index into
20769 // these bits. For example (insert (extract, 3), 2) could be matched by
20770 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
20771 // Bits [5:4] of the constant are the destination select. This is the
20772 // value of the incoming immediate.
20773 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
20774 // combine either bitwise AND or insert of float 0.0 to set these bits.
20775
20776 bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
20777 if (IdxVal == 0 && (!MinSize || !X86::mayFoldLoad(N1, Subtarget))) {
20778 // If this is an insertion of 32-bits into the low 32-bits of
20779 // a vector, we prefer to generate a blend with immediate rather
20780 // than an insertps. Blends are simpler operations in hardware and so
20781 // will always have equal or better performance than insertps.
20782 // But if optimizing for size and there's a load folding opportunity,
20783 // generate insertps because blendps does not have a 32-bit memory
20784 // operand form.
20785 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
20786 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1,
20787 DAG.getTargetConstant(1, dl, MVT::i8));
20788 }
20789       // Create this as a scalar-to-vector insertion.
20790 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
20791 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1,
20792 DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8));
20793 }
20794
20795 // PINSR* works with constant index.
20796 if (EltVT == MVT::i32 || EltVT == MVT::i64)
20797 return Op;
20798 }
20799
20800 return SDValue();
20801}
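// --- Editor's illustrative sketch (not part of X86ISelLowering.cpp) ---
// Encoding of the X86ISD::INSERTPS immediate described in the comment above:
// bits [7:6] select the source lane, bits [5:4] the destination lane, and
// bits [3:0] form the zero mask. The lowering above emits (IdxVal << 4),
// i.e. source lane 0 and no zeroing. Helper name is hypothetical.
#include <cstdint>

uint8_t insertpsImm(unsigned SrcLane, unsigned DstLane, unsigned ZeroMask) {
  return static_cast<uint8_t>(((SrcLane & 0x3) << 6) | ((DstLane & 0x3) << 4) |
                              (ZeroMask & 0xF));
}
// insertpsImm(/*SrcLane=*/0, /*DstLane=*/3, /*ZeroMask=*/0) == 0x30 == (3 << 4).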
20802
20803static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
20804 SelectionDAG &DAG) {
20805 SDLoc dl(Op);
20806 MVT OpVT = Op.getSimpleValueType();
20807
20808   // It's always cheaper to replace a xor+movd with xorps, and it simplifies
20809   // further combines.
20810 if (X86::isZeroNode(Op.getOperand(0)))
20811 return getZeroVector(OpVT, Subtarget, DAG, dl);
20812
20813 // If this is a 256-bit vector result, first insert into a 128-bit
20814 // vector and then insert into the 256-bit vector.
20815 if (!OpVT.is128BitVector()) {
20816 // Insert into a 128-bit vector.
20817 unsigned SizeFactor = OpVT.getSizeInBits() / 128;
20818 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
20819 OpVT.getVectorNumElements() / SizeFactor);
20820
20821 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
20822
20823 // Insert the 128-bit vector.
20824 return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
20825 }
20826   assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 &&
20827          "Expected an SSE type!");
20828
20829   // Pass through a v4i32 or v8i16 SCALAR_TO_VECTOR as that's what we use in
20830   // tblgen.
20831 if (OpVT == MVT::v4i32 || (OpVT == MVT::v8i16 && Subtarget.hasFP16()))
20832 return Op;
20833
20834 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
20835 return DAG.getBitcast(
20836 OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
20837}
20838
20839// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
20840// simple superregister reference or explicit instructions to insert
20841// the upper bits of a vector.
20842static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
20843 SelectionDAG &DAG) {
20844   assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
20845
20846 return insert1BitVector(Op, DAG, Subtarget);
20847}
20848
20849static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
20850 SelectionDAG &DAG) {
20851   assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
20852          "Only vXi1 extract_subvectors need custom lowering");
20853
20854 SDLoc dl(Op);
20855 SDValue Vec = Op.getOperand(0);
20856 uint64_t IdxVal = Op.getConstantOperandVal(1);
20857
20858 if (IdxVal == 0) // the operation is legal
20859 return Op;
20860
20861 MVT VecVT = Vec.getSimpleValueType();
20862 unsigned NumElems = VecVT.getVectorNumElements();
20863
20864 // Extend to natively supported kshift.
20865 MVT WideVecVT = VecVT;
20866 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
20867 WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
20868 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
20869 DAG.getUNDEF(WideVecVT), Vec,
20870 DAG.getIntPtrConstant(0, dl));
20871 }
20872
20873 // Shift to the LSB.
20874 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
20875 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
20876
20877 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
20878 DAG.getIntPtrConstant(0, dl));
20879}
20880
20881// Returns the appropriate wrapper opcode for a global reference.
20882unsigned X86TargetLowering::getGlobalWrapperKind(
20883 const GlobalValue *GV, const unsigned char OpFlags) const {
20884 // References to absolute symbols are never PC-relative.
20885 if (GV && GV->isAbsoluteSymbolRef())
20886 return X86ISD::Wrapper;
20887
20888 CodeModel::Model M = getTargetMachine().getCodeModel();
20889 if (Subtarget.isPICStyleRIPRel() &&
20890 (M == CodeModel::Small || M == CodeModel::Kernel))
20891 return X86ISD::WrapperRIP;
20892
20893 // In the medium model, functions can always be referenced RIP-relatively,
20894 // since they must be within 2GiB. This is also possible in non-PIC mode, and
20895   // is shorter than the 64-bit absolute immediate that would otherwise be emitted.
20896 if (M == CodeModel::Medium && isa_and_nonnull<Function>(GV))
20897 return X86ISD::WrapperRIP;
20898
20899 // GOTPCREL references must always use RIP.
20900 if (OpFlags == X86II::MO_GOTPCREL || OpFlags == X86II::MO_GOTPCREL_NORELAX)
20901 return X86ISD::WrapperRIP;
20902
20903 return X86ISD::Wrapper;
20904}
20905
20906// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
20907// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
20908 // one of the above-mentioned nodes. It has to be wrapped because otherwise
20909// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
20910 // be used to form an addressing mode. These wrapped nodes will be selected
20911// into MOV32ri.
20912SDValue
20913X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
20914 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
20915
20916 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
20917 // global base reg.
20918 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
20919
20920 auto PtrVT = getPointerTy(DAG.getDataLayout());
20921 SDValue Result = DAG.getTargetConstantPool(
20922 CP->getConstVal(), PtrVT, CP->getAlign(), CP->getOffset(), OpFlag);
20923 SDLoc DL(CP);
20924 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
20925 // With PIC, the address is actually $g + Offset.
20926 if (OpFlag) {
20927 Result =
20928 DAG.getNode(ISD::ADD, DL, PtrVT,
20929 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
20930 }
20931
20932 return Result;
20933}
20934
20935SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
20936 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
20937
20938 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
20939 // global base reg.
20940 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
20941
20942 auto PtrVT = getPointerTy(DAG.getDataLayout());
20943 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
20944 SDLoc DL(JT);
20945 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
20946
20947 // With PIC, the address is actually $g + Offset.
20948 if (OpFlag)
20949 Result =
20950 DAG.getNode(ISD::ADD, DL, PtrVT,
20951 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
20952
20953 return Result;
20954}
20955
20956SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
20957 SelectionDAG &DAG) const {
20958 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
20959}
20960
20961SDValue
20962X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
20963 // Create the TargetBlockAddressAddress node.
20964 unsigned char OpFlags =
20965 Subtarget.classifyBlockAddressReference();
20966 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
20967 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
20968 SDLoc dl(Op);
20969 auto PtrVT = getPointerTy(DAG.getDataLayout());
20970 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
20971 Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result);
20972
20973 // With PIC, the address is actually $g + Offset.
20974 if (isGlobalRelativeToPICBase(OpFlags)) {
20975 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
20976 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
20977 }
20978
20979 return Result;
20980}
20981
20982/// Creates target global address or external symbol nodes for calls or
20983/// other uses.
20984SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
20985 bool ForCall) const {
20986 // Unpack the global address or external symbol.
20987 const SDLoc &dl = SDLoc(Op);
20988 const GlobalValue *GV = nullptr;
20989 int64_t Offset = 0;
20990 const char *ExternalSym = nullptr;
20991 if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) {
20992 GV = G->getGlobal();
20993 Offset = G->getOffset();
20994 } else {
20995 const auto *ES = cast<ExternalSymbolSDNode>(Op);
20996 ExternalSym = ES->getSymbol();
20997 }
20998
20999 // Calculate some flags for address lowering.
21000 const Module &Mod = *DAG.getMachineFunction().getFunction().getParent();
21001 unsigned char OpFlags;
21002 if (ForCall)
21003 OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod);
21004 else
21005 OpFlags = Subtarget.classifyGlobalReference(GV, Mod);
21006 bool HasPICReg = isGlobalRelativeToPICBase(OpFlags);
21007 bool NeedsLoad = isGlobalStubReference(OpFlags);
21008
21009 CodeModel::Model M = DAG.getTarget().getCodeModel();
21010 auto PtrVT = getPointerTy(DAG.getDataLayout());
21011 SDValue Result;
21012
21013 if (GV) {
21014 // Create a target global address if this is a global. If possible, fold the
21015 // offset into the global address reference. Otherwise, ADD it on later.
21016 // Suppress the folding if Offset is negative: movl foo-1, %eax is not
21017 // allowed because if the address of foo is 0, the ELF R_X86_64_32
21018 // relocation will compute to a negative value, which is invalid.
21019 int64_t GlobalOffset = 0;
21020 if (OpFlags == X86II::MO_NO_FLAG && Offset >= 0 &&
21021 X86::isOffsetSuitableForCodeModel(Offset, M, true)) {
21022 std::swap(GlobalOffset, Offset);
21023 }
21024 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags);
21025 } else {
21026 // If this is not a global address, this must be an external symbol.
21027 Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags);
21028 }
21029
21030 // If this is a direct call, avoid the wrapper if we don't need to do any
21031 // loads or adds. This allows SDAG ISel to match direct calls.
21032 if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
21033 return Result;
21034
21035 Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
21036
21037 // With PIC, the address is actually $g + Offset.
21038 if (HasPICReg) {
21039 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
21040 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
21041 }
21042
21043 // For globals that require a load from a stub to get the address, emit the
21044 // load.
21045 if (NeedsLoad)
21046 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
21047 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
21048
21049 // If there was a non-zero offset that we didn't fold, create an explicit
21050 // addition for it.
21051 if (Offset != 0)
21052 Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
21053 DAG.getConstant(Offset, dl, PtrVT));
21054
21055 return Result;
21056}
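// --- Editor's illustrative sketch (not part of X86ISelLowering.cpp) ---
// A simplified model of the offset-folding decision above: only a direct
// reference (MO_NO_FLAG) with a non-negative offset that the code model can
// encode is folded into the TargetGlobalAddress; otherwise the relocation is
// emitted at +0 and the offset is added with an explicit ISD::ADD afterwards.
// Names are hypothetical; "FitsInCodeModel" stands in for
// X86::isOffsetSuitableForCodeModel.
#include <cstdint>
#include <utility>

std::pair<int64_t, int64_t> splitGlobalOffset(bool IsDirectRef, int64_t Offset,
                                              bool FitsInCodeModel) {
  if (IsDirectRef && Offset >= 0 && FitsInCodeModel)
    return {Offset, 0}; // Whole offset goes into the relocation.
  return {0, Offset};   // Caller emits an ADD for the trailing offset.
}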
21057
21058SDValue
21059X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
21060 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
21061}
21062
21063static SDValue
21064GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
21065 SDValue *InGlue, const EVT PtrVT, unsigned ReturnReg,
21066 unsigned char OperandFlags, bool LocalDynamic = false) {
21067 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
21068 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
21069 SDLoc dl(GA);
21070 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
21071 GA->getValueType(0),
21072 GA->getOffset(),
21073 OperandFlags);
21074
21075 X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
21076 : X86ISD::TLSADDR;
21077
21078 if (InGlue) {
21079 SDValue Ops[] = { Chain, TGA, *InGlue };
21080 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
21081 } else {
21082 SDValue Ops[] = { Chain, TGA };
21083 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
21084 }
21085
21086 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
21087 MFI.setAdjustsStack(true);
21088 MFI.setHasCalls(true);
21089
21090 SDValue Glue = Chain.getValue(1);
21091 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Glue);
21092}
21093
21094// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
21095static SDValue
21096LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
21097 const EVT PtrVT) {
21098 SDValue InGlue;
21099 SDLoc dl(GA); // ? function entry point might be better
21100 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
21101 DAG.getNode(X86ISD::GlobalBaseReg,
21102 SDLoc(), PtrVT), InGlue);
21103 InGlue = Chain.getValue(1);
21104
21105 return GetTLSADDR(DAG, Chain, GA, &InGlue, PtrVT, X86::EAX, X86II::MO_TLSGD);
21106}
21107
21108// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit LP64
21109static SDValue
21110LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
21111 const EVT PtrVT) {
21112 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
21113 X86::RAX, X86II::MO_TLSGD);
21114}
21115
21116// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit ILP32
21117static SDValue
21118LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
21119 const EVT PtrVT) {
21120 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
21121 X86::EAX, X86II::MO_TLSGD);
21122}
21123
21124static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
21125 SelectionDAG &DAG, const EVT PtrVT,
21126 bool Is64Bit, bool Is64BitLP64) {
21127 SDLoc dl(GA);
21128
21129 // Get the start address of the TLS block for this module.
21130 X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
21131 .getInfo<X86MachineFunctionInfo>();
21132 MFI->incNumLocalDynamicTLSAccesses();
21133
21134 SDValue Base;
21135 if (Is64Bit) {
21136 unsigned ReturnReg = Is64BitLP64 ? X86::RAX : X86::EAX;
21137 Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, ReturnReg,
21138 X86II::MO_TLSLD, /*LocalDynamic=*/true);
21139 } else {
21140 SDValue InGlue;
21141 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
21142 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InGlue);
21143 InGlue = Chain.getValue(1);
21144 Base = GetTLSADDR(DAG, Chain, GA, &InGlue, PtrVT, X86::EAX,
21145 X86II::MO_TLSLDM, /*LocalDynamic=*/true);
21146 }
21147
21148 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
21149 // of Base.
21150
21151 // Build x@dtpoff.
21152 unsigned char OperandFlags = X86II::MO_DTPOFF;
21153 unsigned WrapperKind = X86ISD::Wrapper;
21154 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
21155 GA->getValueType(0),
21156 GA->getOffset(), OperandFlags);
21157 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
21158
21159 // Add x@dtpoff with the base.
21160 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
21161}
21162
21163// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
21164static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
21165 const EVT PtrVT, TLSModel::Model model,
21166 bool is64Bit, bool isPIC) {
21167 SDLoc dl(GA);
21168
21169 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
21170 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
21171 is64Bit ? 257 : 256));
21172
21173 SDValue ThreadPointer =
21174 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
21175 MachinePointerInfo(Ptr));
21176
21177 unsigned char OperandFlags = 0;
21178 // Most TLS accesses are not RIP relative, even on x86-64. One exception is
21179 // initialexec.
21180 unsigned WrapperKind = X86ISD::Wrapper;
21181 if (model == TLSModel::LocalExec) {
21182 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
21183 } else if (model == TLSModel::InitialExec) {
21184 if (is64Bit) {
21185 OperandFlags = X86II::MO_GOTTPOFF;
21186 WrapperKind = X86ISD::WrapperRIP;
21187 } else {
21188 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
21189 }
21190 } else {
21191     llvm_unreachable("Unexpected model");
21192 }
21193
21194 // emit "addl x@ntpoff,%eax" (local exec)
21195 // or "addl x@indntpoff,%eax" (initial exec)
21196 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
21197 SDValue TGA =
21198 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
21199 GA->getOffset(), OperandFlags);
21200 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
21201
21202 if (model == TLSModel::InitialExec) {
21203 if (isPIC && !is64Bit) {
21204 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
21205 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
21206 Offset);
21207 }
21208
21209 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
21210 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
21211 }
21212
21213 // The address of the thread local variable is the add of the thread
21214 // pointer with the offset of the variable.
21215 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
21216}
21217
21218SDValue
21219X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
21220
21221 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
21222
21223 if (DAG.getTarget().useEmulatedTLS())
21224 return LowerToTLSEmulatedModel(GA, DAG);
21225
21226 const GlobalValue *GV = GA->getGlobal();
21227 auto PtrVT = getPointerTy(DAG.getDataLayout());
21228 bool PositionIndependent = isPositionIndependent();
21229
21230 if (Subtarget.isTargetELF()) {
21231 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
21232 switch (model) {
21233 case TLSModel::GeneralDynamic:
21234 if (Subtarget.is64Bit()) {
21235 if (Subtarget.isTarget64BitLP64())
21236 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
21237 return LowerToTLSGeneralDynamicModelX32(GA, DAG, PtrVT);
21238 }
21239 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
21240 case TLSModel::LocalDynamic:
21241 return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit(),
21242 Subtarget.isTarget64BitLP64());
21243 case TLSModel::InitialExec:
21244 case TLSModel::LocalExec:
21245 return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
21246 PositionIndependent);
21247 }
21248     llvm_unreachable("Unknown TLS model.");
21249 }
21250
21251 if (Subtarget.isTargetDarwin()) {
21252 // Darwin only has one model of TLS. Lower to that.
21253 unsigned char OpFlag = 0;
21254 unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
21255 X86ISD::WrapperRIP : X86ISD::Wrapper;
21256
21257 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
21258 // global base reg.
21259 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
21260 if (PIC32)
21261 OpFlag = X86II::MO_TLVP_PIC_BASE;
21262 else
21263 OpFlag = X86II::MO_TLVP;
21264 SDLoc DL(Op);
21265 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
21266 GA->getValueType(0),
21267 GA->getOffset(), OpFlag);
21268 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
21269
21270 // With PIC32, the address is actually $g + Offset.
21271 if (PIC32)
21272 Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
21273 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
21274 Offset);
21275
21276     // Lowering the machine ISD will make sure everything is in the right
21277 // location.
21278 SDValue Chain = DAG.getEntryNode();
21279 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
21280 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
21281 SDValue Args[] = { Chain, Offset };
21282 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
21283 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Chain.getValue(1), DL);
21284
21285 // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
21286 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
21287 MFI.setAdjustsStack(true);
21288
21289 // And our return value (tls address) is in the standard call return value
21290 // location.
21291 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
21292 return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
21293 }
21294
21295 if (Subtarget.isOSWindows()) {
21296 // Just use the implicit TLS architecture
21297 // Need to generate something similar to:
21298 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
21299 // ; from TEB
21300 // mov ecx, dword [rel _tls_index]: Load index (from C runtime)
21301 // mov rcx, qword [rdx+rcx*8]
21302 // mov eax, .tls$:tlsvar
21303 // [rax+rcx] contains the address
21304 // Windows 64bit: gs:0x58
21305 // Windows 32bit: fs:__tls_array
21306
21307 SDLoc dl(GA);
21308 SDValue Chain = DAG.getEntryNode();
21309
21310 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
21311 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
21312 // use its literal value of 0x2C.
21313 Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
21314 ? Type::getInt8PtrTy(*DAG.getContext(),
21315 256)
21316 : Type::getInt32PtrTy(*DAG.getContext(),
21317 257));
21318
21319 SDValue TlsArray = Subtarget.is64Bit()
21320 ? DAG.getIntPtrConstant(0x58, dl)
21321 : (Subtarget.isTargetWindowsGNU()
21322 ? DAG.getIntPtrConstant(0x2C, dl)
21323 : DAG.getExternalSymbol("_tls_array", PtrVT));
21324
21325 SDValue ThreadPointer =
21326 DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
21327
21328 SDValue res;
21329 if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
21330 res = ThreadPointer;
21331 } else {
21332 // Load the _tls_index variable
21333 SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
21334 if (Subtarget.is64Bit())
21335 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
21336 MachinePointerInfo(), MVT::i32);
21337 else
21338 IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
21339
21340 const DataLayout &DL = DAG.getDataLayout();
21341 SDValue Scale =
21342 DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
21343 IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
21344
21345 res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
21346 }
21347
21348 res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
21349
21350 // Get the offset of start of .tls section
21351 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
21352 GA->getValueType(0),
21353 GA->getOffset(), X86II::MO_SECREL);
21354 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
21355
21356 // The address of the thread local variable is the add of the thread
21357 // pointer with the offset of the variable.
21358 return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
21359 }
21360
21361   llvm_unreachable("TLS not implemented for this target.");
21362}
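// --- Editor's illustrative sketch (not part of X86ISelLowering.cpp) ---
// The Windows path above computes, for the general (non-local-exec) case:
//   slot = Load(gs:0x58 / fs:__tls_array)     ; per-thread TLS array
//   base = Load(slot + _tls_index * PtrSize)  ; this module's TLS block
//   addr = base + x@SECREL
// A scalar model of that address arithmetic, with the loads expressed as
// ordinary pointer dereferences (all names are hypothetical):
#include <cstdint>

uintptr_t windowsImplicitTlsAddress(const uintptr_t *TlsArray, // loaded from gs:0x58
                                    uint32_t TlsIndex,         // _tls_index
                                    uint32_t SecRelOffset) {   // x@SECREL
  uintptr_t ModuleTlsBase = TlsArray[TlsIndex]; // load of the scaled-index slot
  return ModuleTlsBase + SecRelOffset;
}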
21363
21364/// Lower SRA_PARTS and friends, which return two i32 values
21365/// and take a 2 x i32 value to shift plus a shift amount.
21366/// TODO: Can this be moved to general expansion code?
21367static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
21368 SDValue Lo, Hi;
21369 DAG.getTargetLoweringInfo().expandShiftParts(Op.getNode(), Lo, Hi, DAG);
21370 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
21371}
21372
21373// Try to use a packed vector operation to handle i64 on 32-bit targets when
21374// AVX512DQ is enabled.
21375static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG,
21376 const X86Subtarget &Subtarget) {
21377   assert((Op.getOpcode() == ISD::SINT_TO_FP ||
21378           Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
21379           Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
21380           Op.getOpcode() == ISD::UINT_TO_FP) &&
21381          "Unexpected opcode!");
21382 bool IsStrict = Op->isStrictFPOpcode();
21383 unsigned OpNo = IsStrict ? 1 : 0;
21384 SDValue Src = Op.getOperand(OpNo);
21385 MVT SrcVT = Src.getSimpleValueType();
21386 MVT VT = Op.getSimpleValueType();
21387
21388 if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
21389 (VT != MVT::f32 && VT != MVT::f64))
21390 return SDValue();
21391
21392 // Pack the i64 into a vector, do the operation and extract.
21393
21394 // Use a 256-bit vector to ensure the result is 128 bits for the f32 case.
21395 unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
21396 MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
21397 MVT VecVT = MVT::getVectorVT(VT, NumElts);
21398
21399 SDLoc dl(Op);
21400 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
21401 if (IsStrict) {
21402 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other},
21403 {Op.getOperand(0), InVec});
21404 SDValue Chain = CvtVec.getValue(1);
21405 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
21406 DAG.getIntPtrConstant(0, dl));
21407 return DAG.getMergeValues({Value, Chain}, dl);
21408 }
21409
21410 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
21411
21412 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
21413 DAG.getIntPtrConstant(0, dl));
21414}
21415
21416// Try to use a packed vector operation to handle i64 on 32-bit targets.
21417static SDValue LowerI64IntToFP16(SDValue Op, SelectionDAG &DAG,
21418 const X86Subtarget &Subtarget) {
21419 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
21420 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
21421 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
21422 Op.getOpcode() == ISD::UINT_TO_FP) &&
21423 "Unexpected opcode!");
21424 bool IsStrict = Op->isStrictFPOpcode();
21425 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21426 MVT SrcVT = Src.getSimpleValueType();
21427 MVT VT = Op.getSimpleValueType();
21428
21429 if (SrcVT != MVT::i64 || Subtarget.is64Bit() || VT != MVT::f16)
21430 return SDValue();
21431
21432 // Pack the i64 into a vector, do the operation and extract.
21433
21434 assert(Subtarget.hasFP16() && "Expected FP16");
21435
21436 SDLoc dl(Op);
21437 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
21438 if (IsStrict) {
21439 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {MVT::v2f16, MVT::Other},
21440 {Op.getOperand(0), InVec});
21441 SDValue Chain = CvtVec.getValue(1);
21442 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
21443 DAG.getIntPtrConstant(0, dl));
21444 return DAG.getMergeValues({Value, Chain}, dl);
21445 }
21446
21447 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, MVT::v2f16, InVec);
21448
21449 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
21450 DAG.getIntPtrConstant(0, dl));
21451}
21452
21453static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
21454 const X86Subtarget &Subtarget) {
21455 switch (Opcode) {
21456 case ISD::SINT_TO_FP:
21457 // TODO: Handle wider types with AVX/AVX512.
21458 if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
21459 return false;
21460 // CVTDQ2PS or (V)CVTDQ2PD
21461 return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
21462
21463 case ISD::UINT_TO_FP:
21464 // TODO: Handle wider types and i64 elements.
21465 if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
21466 return false;
21467 // VCVTUDQ2PS or VCVTUDQ2PD
21468 return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
21469
21470 default:
21471 return false;
21472 }
21473}
21474
21475/// Given a scalar cast operation that is extracted from a vector, try to
21476/// vectorize the cast op followed by extraction. This will avoid an expensive
21477/// round-trip between XMM and GPR.
21478static SDValue vectorizeExtractedCast(SDValue Cast, SelectionDAG &DAG,
21479 const X86Subtarget &Subtarget) {
21480 // TODO: This could be enhanced to handle smaller integer types by peeking
21481 // through an extend.
21482 SDValue Extract = Cast.getOperand(0);
21483 MVT DestVT = Cast.getSimpleValueType();
21484 if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
21485 !isa<ConstantSDNode>(Extract.getOperand(1)))
21486 return SDValue();
21487
21488 // See if we have a 128-bit vector cast op for this type of cast.
21489 SDValue VecOp = Extract.getOperand(0);
21490 MVT FromVT = VecOp.getSimpleValueType();
21491 unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits();
21492 MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM);
21493 MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM);
21494 if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget))
21495 return SDValue();
21496
21497 // If we are extracting from a non-zero element, first shuffle the source
21498 // vector to allow extracting from element zero.
21499 SDLoc DL(Cast);
21500 if (!isNullConstant(Extract.getOperand(1))) {
21501 SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);
21502 Mask[0] = Extract.getConstantOperandVal(1);
21503 VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask);
21504 }
21505 // If the source vector is wider than 128-bits, extract the low part. Do not
21506 // create an unnecessarily wide vector cast op.
21507 if (FromVT != Vec128VT)
21508 VecOp = extract128BitVector(VecOp, 0, DAG, DL);
21509
21510 // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
21511 // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
21512 SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp);
21513 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,
21514 DAG.getIntPtrConstant(0, DL));
21515}
21516
21517/// Given a scalar cast to FP with a cast to integer operand (almost an ftrunc),
21518/// try to vectorize the cast ops. This will avoid an expensive round-trip
21519/// between XMM and GPR.
21520static SDValue lowerFPToIntToFP(SDValue CastToFP, SelectionDAG &DAG,
21521 const X86Subtarget &Subtarget) {
21522 // TODO: Allow FP_TO_UINT.
21523 SDValue CastToInt = CastToFP.getOperand(0);
21524 MVT VT = CastToFP.getSimpleValueType();
21525 if (CastToInt.getOpcode() != ISD::FP_TO_SINT || VT.isVector())
21526 return SDValue();
21527
21528 MVT IntVT = CastToInt.getSimpleValueType();
21529 SDValue X = CastToInt.getOperand(0);
21530 MVT SrcVT = X.getSimpleValueType();
21531 if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
21532 return SDValue();
21533
21534 // See if we have 128-bit vector cast instructions for this type of cast.
21535 // We need cvttps2dq/cvttpd2dq and cvtdq2ps/cvtdq2pd.
21536 if (!Subtarget.hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) ||
21537 IntVT != MVT::i32)
21538 return SDValue();
21539
21540 unsigned SrcSize = SrcVT.getSizeInBits();
21541 unsigned IntSize = IntVT.getSizeInBits();
21542 unsigned VTSize = VT.getSizeInBits();
21543 MVT VecSrcVT = MVT::getVectorVT(SrcVT, 128 / SrcSize);
21544 MVT VecIntVT = MVT::getVectorVT(IntVT, 128 / IntSize);
21545 MVT VecVT = MVT::getVectorVT(VT, 128 / VTSize);
21546
21547 // We need target-specific opcodes if this is v2f64 -> v4i32 -> v2f64.
21548 unsigned ToIntOpcode =
21549 SrcSize != IntSize ? X86ISD::CVTTP2SI : (unsigned)ISD::FP_TO_SINT;
21550 unsigned ToFPOpcode =
21551 IntSize != VTSize ? X86ISD::CVTSI2P : (unsigned)ISD::SINT_TO_FP;
21552
21553 // sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0
21554 //
21555 // We are not defining the high elements (for example, by zeroing them) because
21556 // that could nullify any performance advantage that we hoped to gain from
21557 // this vector op hack. We do not expect any adverse effects (like denorm
21558 // penalties) with cast ops.
21559 SDLoc DL(CastToFP);
21560 SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
21561 SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, X);
21562 SDValue VCastToInt = DAG.getNode(ToIntOpcode, DL, VecIntVT, VecX);
21563 SDValue VCastToFP = DAG.getNode(ToFPOpcode, DL, VecVT, VCastToInt);
21564 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCastToFP, ZeroIdx);
21565}
21566
21567static SDValue lowerINT_TO_FP_vXi64(SDValue Op, SelectionDAG &DAG,
21568 const X86Subtarget &Subtarget) {
21569 SDLoc DL(Op);
21570 bool IsStrict = Op->isStrictFPOpcode();
21571 MVT VT = Op->getSimpleValueType(0);
21572 SDValue Src = Op->getOperand(IsStrict ? 1 : 0);
21573
21574 if (Subtarget.hasDQI()) {
21575 assert(!Subtarget.hasVLX() && "Unexpected features");
21576
21577 assert((Src.getSimpleValueType() == MVT::v2i64 ||
21578 Src.getSimpleValueType() == MVT::v4i64) &&
21579 "Unsupported custom type");
21580
21581 // With AVX512DQ, but not VLX we need to widen to get a 512-bit result type.
21582 assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&
21583 "Unexpected VT!");
21584 MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
21585
21586 // Need to concat with zero vector for strict fp to avoid spurious
21587 // exceptions.
21588 SDValue Tmp = IsStrict ? DAG.getConstant(0, DL, MVT::v8i64)
21589 : DAG.getUNDEF(MVT::v8i64);
21590 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src,
21591 DAG.getIntPtrConstant(0, DL));
21592 SDValue Res, Chain;
21593 if (IsStrict) {
21594 Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
21595 {Op->getOperand(0), Src});
21596 Chain = Res.getValue(1);
21597 } else {
21598 Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src);
21599 }
21600
21601 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
21602 DAG.getIntPtrConstant(0, DL));
21603
21604 if (IsStrict)
21605 return DAG.getMergeValues({Res, Chain}, DL);
21606 return Res;
21607 }
21608
21609 bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP ||
21610 Op->getOpcode() == ISD::STRICT_SINT_TO_FP;
21611 if (VT != MVT::v4f32 || IsSigned)
21612 return SDValue();
21613
21614 SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64);
21615 SDValue One = DAG.getConstant(1, DL, MVT::v4i64);
21616 SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64,
21617 DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One),
21618 DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One));
21619 SDValue IsNeg = DAG.getSetCC(DL, MVT::v4i64, Src, Zero, ISD::SETLT);
21620 SDValue SignSrc = DAG.getSelect(DL, MVT::v4i64, IsNeg, Sign, Src);
21621 SmallVector<SDValue, 4> SignCvts(4);
21622 SmallVector<SDValue, 4> Chains(4);
21623 for (int i = 0; i != 4; ++i) {
21624 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
21625 DAG.getIntPtrConstant(i, DL));
21626 if (IsStrict) {
21627 SignCvts[i] =
21628 DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other},
21629 {Op.getOperand(0), Elt});
21630 Chains[i] = SignCvts[i].getValue(1);
21631 } else {
21632 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Elt);
21633 }
21634 }
21635 SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts);
21636
21637 SDValue Slow, Chain;
21638 if (IsStrict) {
21639 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
21640 Slow = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other},
21641 {Chain, SignCvt, SignCvt});
21642 Chain = Slow.getValue(1);
21643 } else {
21644 Slow = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt);
21645 }
21646
21647 IsNeg = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i32, IsNeg);
21648 SDValue Cvt = DAG.getSelect(DL, MVT::v4f32, IsNeg, Slow, SignCvt);
21649
21650 if (IsStrict)
21651 return DAG.getMergeValues({Cvt, Chain}, DL);
21652
21653 return Cvt;
21654}
21655
21656static SDValue promoteXINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
21657 bool IsStrict = Op->isStrictFPOpcode();
21658 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21659 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
21660 MVT VT = Op.getSimpleValueType();
21661 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
21662 SDLoc dl(Op);
21663
21664 SDValue Rnd = DAG.getIntPtrConstant(0, dl);
21665 if (IsStrict)
21666 return DAG.getNode(
21667 ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
21668 {Chain,
21669 DAG.getNode(Op.getOpcode(), dl, {NVT, MVT::Other}, {Chain, Src}),
21670 Rnd});
21671 return DAG.getNode(ISD::FP_ROUND, dl, VT,
21672 DAG.getNode(Op.getOpcode(), dl, NVT, Src), Rnd);
21673}
21674
21675static bool isLegalConversion(MVT VT, bool IsSigned,
21676 const X86Subtarget &Subtarget) {
21677 if (VT == MVT::v4i32 && Subtarget.hasSSE2() && IsSigned)
21678 return true;
21679 if (VT == MVT::v8i32 && Subtarget.hasAVX() && IsSigned)
21680 return true;
21681 if (Subtarget.hasVLX() && (VT == MVT::v4i32 || VT == MVT::v8i32))
21682 return true;
21683 if (Subtarget.useAVX512Regs()) {
21684 if (VT == MVT::v16i32)
21685 return true;
21686 if (VT == MVT::v8i64 && Subtarget.hasDQI())
21687 return true;
21688 }
21689 if (Subtarget.hasDQI() && Subtarget.hasVLX() &&
21690 (VT == MVT::v2i64 || VT == MVT::v4i64))
21691 return true;
21692 return false;
21693}
21694
21695SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
21696 SelectionDAG &DAG) const {
21697 bool IsStrict = Op->isStrictFPOpcode();
21698 unsigned OpNo = IsStrict ? 1 : 0;
21699 SDValue Src = Op.getOperand(OpNo);
21700 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
21701 MVT SrcVT = Src.getSimpleValueType();
21702 MVT VT = Op.getSimpleValueType();
21703 SDLoc dl(Op);
21704
21705 if (isSoftFP16(VT))
21706 return promoteXINT_TO_FP(Op, DAG);
21707 else if (isLegalConversion(SrcVT, true, Subtarget))
21708 return Op;
21709
21710 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
21711 return LowerWin64_INT128_TO_FP(Op, DAG);
21712
21713 if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
21714 return Extract;
21715
21716 if (SDValue R = lowerFPToIntToFP(Op, DAG, Subtarget))
21717 return R;
21718
21719 if (SrcVT.isVector()) {
21720 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
21721 // Note: Since v2f64 is a legal type, we don't need to zero extend the
21722 // source for strict FP.
21723 if (IsStrict)
21724 return DAG.getNode(
21725 X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
21726 {Chain, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
21727 DAG.getUNDEF(SrcVT))});
21728 return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
21729 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
21730 DAG.getUNDEF(SrcVT)));
21731 }
21732 if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64)
21733 return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);
21734
21735 return SDValue();
21736 }
21737
21738 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
21739 "Unknown SINT_TO_FP to lower!");
21740
21741 bool UseSSEReg = isScalarFPTypeInSSEReg(VT);
21742
21743 // These are really Legal; return the operand so the caller accepts it as
21744 // Legal.
21745 if (SrcVT == MVT::i32 && UseSSEReg)
21746 return Op;
21747 if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit())
21748 return Op;
21749
21750 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
21751 return V;
21752 if (SDValue V = LowerI64IntToFP16(Op, DAG, Subtarget))
21753 return V;
21754
21755 // SSE doesn't have an i16 conversion so we need to promote.
21756 if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) {
21757 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Src);
21758 if (IsStrict)
21759 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
21760 {Chain, Ext});
21761
21762 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Ext);
21763 }
21764
21765 if (VT == MVT::f128 || !Subtarget.hasX87())
21766 return SDValue();
21767
21768 SDValue ValueToStore = Src;
21769 if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit())
21770 // Bitcasting to f64 here allows us to do a single 64-bit store from
21771 // an SSE register, avoiding the store forwarding penalty that would come
21772 // with two 32-bit stores.
21773 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
21774
21775 unsigned Size = SrcVT.getStoreSize();
21776 Align Alignment(Size);
21777 MachineFunction &MF = DAG.getMachineFunction();
21778 auto PtrVT = getPointerTy(MF.getDataLayout());
21779 int SSFI = MF.getFrameInfo().CreateStackObject(Size, Alignment, false);
21780 MachinePointerInfo MPI =
21781 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
21782 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
21783 Chain = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Alignment);
21784 std::pair<SDValue, SDValue> Tmp =
21785 BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Alignment, DAG);
21786
21787 if (IsStrict)
21788 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
21789
21790 return Tmp.first;
21791}
21792
21793std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(
21794 EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer,
21795 MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const {
21796 // Build the FILD
21797 SDVTList Tys;
21798 bool useSSE = isScalarFPTypeInSSEReg(DstVT);
21799 if (useSSE)
21800 Tys = DAG.getVTList(MVT::f80, MVT::Other);
21801 else
21802 Tys = DAG.getVTList(DstVT, MVT::Other);
21803
21804 SDValue FILDOps[] = {Chain, Pointer};
21805 SDValue Result =
21806 DAG.getMemIntrinsicNode(X86ISD::FILD, DL, Tys, FILDOps, SrcVT, PtrInfo,
21807 Alignment, MachineMemOperand::MOLoad);
21808 Chain = Result.getValue(1);
21809
21810 if (useSSE) {
21811 MachineFunction &MF = DAG.getMachineFunction();
21812 unsigned SSFISize = DstVT.getStoreSize();
21813 int SSFI =
21814 MF.getFrameInfo().CreateStackObject(SSFISize, Align(SSFISize), false);
21815 auto PtrVT = getPointerTy(MF.getDataLayout());
21816 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
21817 Tys = DAG.getVTList(MVT::Other);
21818 SDValue FSTOps[] = {Chain, Result, StackSlot};
21819 MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand(
21820 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
21821 MachineMemOperand::MOStore, SSFISize, Align(SSFISize));
21822
21823 Chain =
21824 DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps, DstVT, StoreMMO);
21825 Result = DAG.getLoad(
21826 DstVT, DL, Chain, StackSlot,
21827 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
21828 Chain = Result.getValue(1);
21829 }
21830
21831 return { Result, Chain };
21832}
21833
21834/// Horizontal vector math instructions may be slower than normal math with
21835/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
21836/// implementation, and likely shuffle complexity of the alternate sequence.
21837static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
21838 const X86Subtarget &Subtarget) {
21839 bool IsOptimizingSize = DAG.shouldOptForSize();
21840 bool HasFastHOps = Subtarget.hasFastHorizontalOps();
21841 return !IsSingleSource || IsOptimizingSize || HasFastHOps;
21842}
21843
21844/// 64-bit unsigned integer to double expansion.
21845static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
21846 const X86Subtarget &Subtarget) {
21847 // We can't use this algorithm for strict fp. It produces -0.0 instead of +0.0
21848 // when converting 0 while rounding toward negative infinity. The caller will
21849 // fall back to Expand when i64 is legal, or use FILD in 32-bit mode.
21850 assert(!Op->isStrictFPOpcode() && "Expected non-strict uint_to_fp!");
21851 // This algorithm is not obvious. Here is what we're trying to output:
21852 /*
21853 movq %rax, %xmm0
21854 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
21855 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
21856 #ifdef __SSE3__
21857 haddpd %xmm0, %xmm0
21858 #else
21859 pshufd $0x4e, %xmm0, %xmm1
21860 addpd %xmm1, %xmm0
21861 #endif
21862 */
21863
21864 SDLoc dl(Op);
21865 LLVMContext *Context = DAG.getContext();
21866
21867 // Build some magic constants.
21868 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
21869 Constant *C0 = ConstantDataVector::get(*Context, CV0);
21870 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
21871 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, Align(16));
21872
21873 SmallVector<Constant*,2> CV1;
21874 CV1.push_back(
21875 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
21876 APInt(64, 0x4330000000000000ULL))));
21877 CV1.push_back(
21878 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
21879 APInt(64, 0x4530000000000000ULL))));
21880 Constant *C1 = ConstantVector::get(CV1);
21881 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, Align(16));
21882
21883 // Load the 64-bit value into an XMM register.
21884 SDValue XR1 =
21885 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0));
21886 SDValue CLod0 = DAG.getLoad(
21887 MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
21888 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
21889 SDValue Unpck1 =
21890 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
21891
21892 SDValue CLod1 = DAG.getLoad(
21893 MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
21894 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
21895 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
21896 // TODO: Are there any fast-math-flags to propagate here?
21897 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
21898 SDValue Result;
21899
21900 if (Subtarget.hasSSE3() &&
21901 shouldUseHorizontalOp(true, DAG, Subtarget)) {
21902 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
21903 } else {
21904 SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
21905 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
21906 }
21907 Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
21908 DAG.getIntPtrConstant(0, dl));
21909 return Result;
21910}
21911
21912/// 32-bit unsigned integer to float expansion.
21913static SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG,
21914 const X86Subtarget &Subtarget) {
21915 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
21916 SDLoc dl(Op);
21917 // FP constant to bias correct the final result.
21918 SDValue Bias = DAG.getConstantFP(
21919 llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::f64);
21920
21921 // Load the 32-bit value into an XMM register.
21922 SDValue Load =
21923 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(OpNo));
21924
21925 // Zero out the upper parts of the register.
21926 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
21927
21928 // Or the load with the bias.
21929 SDValue Or = DAG.getNode(
21930 ISD::OR, dl, MVT::v2i64,
21931 DAG.getBitcast(MVT::v2i64, Load),
21932 DAG.getBitcast(MVT::v2i64,
21933 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
21934 Or =
21935 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
21936 DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
21937
21938 if (Op.getNode()->isStrictFPOpcode()) {
21939 // Subtract the bias.
21940 // TODO: Are there any fast-math-flags to propagate here?
21941 SDValue Chain = Op.getOperand(0);
21942 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other},
21943 {Chain, Or, Bias});
21944
21945 if (Op.getValueType() == Sub.getValueType())
21946 return Sub;
21947
21948 // Handle final rounding.
21949 std::pair<SDValue, SDValue> ResultPair = DAG.getStrictFPExtendOrRound(
21950 Sub, Sub.getValue(1), dl, Op.getSimpleValueType());
21951
21952 return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl);
21953 }
21954
21955 // Subtract the bias.
21956 // TODO: Are there any fast-math-flags to propagate here?
21957 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
21958
21959 // Handle final rounding.
21960 return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
21961}
21962
21963static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
21964 const X86Subtarget &Subtarget,
21965 const SDLoc &DL) {
21966 if (Op.getSimpleValueType() != MVT::v2f64)
21967 return SDValue();
21968
21969 bool IsStrict = Op->isStrictFPOpcode();
21970
21971 SDValue N0 = Op.getOperand(IsStrict ? 1 : 0);
21972 assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
21973
21974 if (Subtarget.hasAVX512()) {
21975 if (!Subtarget.hasVLX()) {
21976 // Let generic type legalization widen this.
21977 if (!IsStrict)
21978 return SDValue();
21979 // Otherwise pad the integer input with 0s and widen the operation.
21980 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
21981 DAG.getConstant(0, DL, MVT::v2i32));
21982 SDValue Res = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other},
21983 {Op.getOperand(0), N0});
21984 SDValue Chain = Res.getValue(1);
21985 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f64, Res,
21986 DAG.getIntPtrConstant(0, DL));
21987 return DAG.getMergeValues({Res, Chain}, DL);
21988 }
21989
21990 // Legalize to v4i32 type.
21991 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
21992 DAG.getUNDEF(MVT::v2i32));
21993 if (IsStrict)
21994 return DAG.getNode(X86ISD::STRICT_CVTUI2P, DL, {MVT::v2f64, MVT::Other},
21995 {Op.getOperand(0), N0});
21996 return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
21997 }
21998
21999 // Zero extend to 2i64, OR with the floating point representation of 2^52.
22000 // This gives us the floating point equivalent of 2^52 + the i32 integer
22001 // since double has 52-bits of mantissa. Then subtract 2^52 in floating
22002 // point leaving just our i32 integers in double format.
22003 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i64, N0);
22004 SDValue VBias = DAG.getConstantFP(
22005 llvm::bit_cast<double>(0x4330000000000000ULL), DL, MVT::v2f64);
22006 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v2i64, ZExtIn,
22007 DAG.getBitcast(MVT::v2i64, VBias));
22008 Or = DAG.getBitcast(MVT::v2f64, Or);
22009
22010 if (IsStrict)
22011 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v2f64, MVT::Other},
22012 {Op.getOperand(0), Or, VBias});
22013 return DAG.getNode(ISD::FSUB, DL, MVT::v2f64, Or, VBias);
22014}
22015
22016static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
22017 const X86Subtarget &Subtarget) {
22018 SDLoc DL(Op);
22019 bool IsStrict = Op->isStrictFPOpcode();
22020 SDValue V = Op->getOperand(IsStrict ? 1 : 0);
22021 MVT VecIntVT = V.getSimpleValueType();
22022 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
22023 "Unsupported custom type");
22024
22025 if (Subtarget.hasAVX512()) {
22026 // With AVX512, but not VLX we need to widen to get a 512-bit result type.
22027 assert(!Subtarget.hasVLX() && "Unexpected features");
22028 MVT VT = Op->getSimpleValueType(0);
22029
22030 // v8i32->v8f64 is legal with AVX512 so just return it.
22031 if (VT == MVT::v8f64)
22032 return Op;
22033
22034 assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64) &&
22035 "Unexpected VT!");
22036 MVT WideVT = VT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
22037 MVT WideIntVT = VT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
22038 // Need to concat with zero vector for strict fp to avoid spurious
22039 // exceptions.
22040 SDValue Tmp =
22041 IsStrict ? DAG.getConstant(0, DL, WideIntVT) : DAG.getUNDEF(WideIntVT);
22042 V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideIntVT, Tmp, V,
22043 DAG.getIntPtrConstant(0, DL));
22044 SDValue Res, Chain;
22045 if (IsStrict) {
22046 Res = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {WideVT, MVT::Other},
22047 {Op->getOperand(0), V});
22048 Chain = Res.getValue(1);
22049 } else {
22050 Res = DAG.getNode(ISD::UINT_TO_FP, DL, WideVT, V);
22051 }
22052
22053 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
22054 DAG.getIntPtrConstant(0, DL));
22055
22056 if (IsStrict)
22057 return DAG.getMergeValues({Res, Chain}, DL);
22058 return Res;
22059 }
22060
22061 if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 &&
22062 Op->getSimpleValueType(0) == MVT::v4f64) {
22063 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i64, V);
22064 Constant *Bias = ConstantFP::get(
22065 *DAG.getContext(),
22066 APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)));
22067 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
22068 SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, Align(8));
22069 SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other);
22070 SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
22071 SDValue VBias = DAG.getMemIntrinsicNode(
22072 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
22073 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(8),
22074 MachineMemOperand::MOLoad);
22075
22076 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn,
22077 DAG.getBitcast(MVT::v4i64, VBias));
22078 Or = DAG.getBitcast(MVT::v4f64, Or);
22079
22080 if (IsStrict)
22081 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v4f64, MVT::Other},
22082 {Op.getOperand(0), Or, VBias});
22083 return DAG.getNode(ISD::FSUB, DL, MVT::v4f64, Or, VBias);
22084 }
22085
22086 // The algorithm is the following:
22087 // #ifdef __SSE4_1__
22088 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
22089 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
22090 // (uint4) 0x53000000, 0xaa);
22091 // #else
22092 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
22093 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
22094 // #endif
22095 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
22096 // return (float4) lo + fhi;
22097
22098 bool Is128 = VecIntVT == MVT::v4i32;
22099 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
22100 // If we convert to something else than the supported type, e.g., to v4f64,
22101 // abort early.
22102 if (VecFloatVT != Op->getSimpleValueType(0))
22103 return SDValue();
22104
22105 // In the #ifdef/#else code, we have in common:
22106 // - The vector of constants:
22107 // -- 0x4b000000
22108 // -- 0x53000000
22109 // - A shift:
22110 // -- v >> 16
22111
22112 // Create the splat vector for 0x4b000000.
22113 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
22114 // Create the splat vector for 0x53000000.
22115 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
22116
22117 // Create the right shift.
22118 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
22119 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
22120
22121 SDValue Low, High;
22122 if (Subtarget.hasSSE41()) {
22123 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
22124 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
22125 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
22126 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
22127 // Low will be bitcasted right away, so do not bother bitcasting back to its
22128 // original type.
22129 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
22130 VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
22131 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
22132 // (uint4) 0x53000000, 0xaa);
22133 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
22134 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
22135 // High will be bitcasted right away, so do not bother bitcasting back to
22136 // its original type.
22137 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
22138 VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
22139 } else {
22140 SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
22141 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
22142 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
22143 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
22144
22145 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
22146 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
22147 }
22148
22149 // Create the vector constant for (0x1.0p39f + 0x1.0p23f).
22150 SDValue VecCstFSub = DAG.getConstantFP(
22151 APFloat(APFloat::IEEEsingle(), APInt(32, 0x53000080)), DL, VecFloatVT);
22152
22153 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
22154 // NOTE: By using fsub of a positive constant instead of fadd of a negative
22155 // constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is
22156 // enabled. See PR24512.
22157 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
22158 // TODO: Are there any fast-math-flags to propagate here?
22159 // (float4) lo;
22160 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
22161 // return (float4) lo + fhi;
22162 if (IsStrict) {
22163 SDValue FHigh = DAG.getNode(ISD::STRICT_FSUB, DL, {VecFloatVT, MVT::Other},
22164 {Op.getOperand(0), HighBitcast, VecCstFSub});
22165 return DAG.getNode(ISD::STRICT_FADD, DL, {VecFloatVT, MVT::Other},
22166 {FHigh.getValue(1), LowBitcast, FHigh});
22167 }
22168
22169 SDValue FHigh =
22170 DAG.getNode(ISD::FSUB, DL, VecFloatVT, HighBitcast, VecCstFSub);
22171 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
22172}
22173
22174static SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG,
22175 const X86Subtarget &Subtarget) {
22176 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
22177 SDValue N0 = Op.getOperand(OpNo);
22178 MVT SrcVT = N0.getSimpleValueType();
22179 SDLoc dl(Op);
22180
22181 switch (SrcVT.SimpleTy) {
22182 default:
22183 llvm_unreachable("Custom UINT_TO_FP is not supported!");
22184 case MVT::v2i32:
22185 return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
22186 case MVT::v4i32:
22187 case MVT::v8i32:
22188 return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
22189 case MVT::v2i64:
22190 case MVT::v4i64:
22191 return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);
22192 }
22193}
22194
22195SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
22196 SelectionDAG &DAG) const {
22197 bool IsStrict = Op->isStrictFPOpcode();
22198 unsigned OpNo = IsStrict ? 1 : 0;
22199 SDValue Src = Op.getOperand(OpNo);
22200 SDLoc dl(Op);
22201 auto PtrVT = getPointerTy(DAG.getDataLayout());
22202 MVT SrcVT = Src.getSimpleValueType();
22203 MVT DstVT = Op->getSimpleValueType(0);
22204 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
22205
22206 // Bail out when we don't have native conversion instructions.
22207 if (DstVT == MVT::f128)
22208 return SDValue();
22209
22210 if (isSoftFP16(DstVT))
22211 return promoteXINT_TO_FP(Op, DAG);
22212 else if (isLegalConversion(SrcVT, false, Subtarget))
22213 return Op;
22214
22215 if (DstVT.isVector())
22216 return lowerUINT_TO_FP_vec(Op, DAG, Subtarget);
22217
22218 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
22219 return LowerWin64_INT128_TO_FP(Op, DAG);
22220
22221 if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
22222 return Extract;
22223
22224 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
22225 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
22226 // Conversions from unsigned i32 to f32/f64 are legal,
22227 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
22228 return Op;
22229 }
22230
22231 // Promote i32 to i64 and use a signed conversion on 64-bit targets.
22232 if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
22233 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Src);
22234 if (IsStrict)
22235 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other},
22236 {Chain, Src});
22237 return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src);
22238 }
22239
22240 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
22241 return V;
22242 if (SDValue V = LowerI64IntToFP16(Op, DAG, Subtarget))
22243 return V;
22244
22245 // The transform for i64->f64 isn't correct for 0 when rounding to negative
22246 // infinity. It produces -0.0, so disable under strictfp.
22247 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && Subtarget.hasSSE2() &&
22248 !IsStrict)
22249 return LowerUINT_TO_FP_i64(Op, DAG, Subtarget);
22250 // The transform for i32->f64/f32 isn't correct for 0 when rounding to
22251 // negative infinity. So disable it under strictfp and use FILD instead.
22252 if (SrcVT == MVT::i32 && Subtarget.hasSSE2() && DstVT != MVT::f80 &&
22253 !IsStrict)
22254 return LowerUINT_TO_FP_i32(Op, DAG, Subtarget);
22255 if (Subtarget.is64Bit() && SrcVT == MVT::i64 &&
22256 (DstVT == MVT::f32 || DstVT == MVT::f64))
22257 return SDValue();
22258
22259 // Make a 64-bit buffer, and use it to build an FILD.
22260 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64, 8);
22261 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
22262 Align SlotAlign(8);
22263 MachinePointerInfo MPI =
22264 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
22265 if (SrcVT == MVT::i32) {
22266 SDValue OffsetSlot =
22267 DAG.getMemBasePlusOffset(StackSlot, TypeSize::Fixed(4), dl);
22268 SDValue Store1 = DAG.getStore(Chain, dl, Src, StackSlot, MPI, SlotAlign);
22269 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
22270 OffsetSlot, MPI.getWithOffset(4), SlotAlign);
22271 std::pair<SDValue, SDValue> Tmp =
22272 BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, SlotAlign, DAG);
22273 if (IsStrict)
22274 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
22275
22276 return Tmp.first;
22277 }
22278
22279 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
22280 SDValue ValueToStore = Src;
22281 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) {
22282 // Bitcasting to f64 here allows us to do a single 64-bit store from
22283 // an SSE register, avoiding the store forwarding penalty that would come
22284 // with two 32-bit stores.
22285 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
22286 }
22287 SDValue Store =
22288 DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, SlotAlign);
22289 // For i64 source, we need to add the appropriate power of 2 if the input
22290 // was negative. We must be careful to do the computation in x87 extended
22291 // precision, not in SSE.
22292 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
22293 SDValue Ops[] = { Store, StackSlot };
22294 SDValue Fild =
22295 DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MPI,
22296 SlotAlign, MachineMemOperand::MOLoad);
22297 Chain = Fild.getValue(1);
22298
22299
22300 // Check whether the sign bit is set.
22301 SDValue SignSet = DAG.getSetCC(
22302 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
22303 Op.getOperand(OpNo), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
22304
22305 // Build a 64 bit pair (FF, 0) in the constant pool, with FF in the hi bits.
22306 APInt FF(64, 0x5F80000000000000ULL);
22307 SDValue FudgePtr = DAG.getConstantPool(
22308 ConstantInt::get(*DAG.getContext(), FF), PtrVT);
22309 Align CPAlignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlign();
22310
22311 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
22312 SDValue Zero = DAG.getIntPtrConstant(0, dl);
22313 SDValue Four = DAG.getIntPtrConstant(4, dl);
22314 SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Four, Zero);
22315 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
22316
22317 // Load the value out, extending it from f32 to f80.
22318 SDValue Fudge = DAG.getExtLoad(
22319 ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr,
22320 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
22321 CPAlignment);
22322 Chain = Fudge.getValue(1);
22323 // Extend everything to 80 bits to force it to be done on x87.
22324 // TODO: Are there any fast-math-flags to propagate here?
22325 if (IsStrict) {
22326 unsigned Opc = ISD::STRICT_FADD;
22328 // Windows needs the precision control changed to 80 bits around this add.
22328 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
22329 Opc = X86ISD::STRICT_FP80_ADD;
22330
22331 SDValue Add =
22332 DAG.getNode(Opc, dl, {MVT::f80, MVT::Other}, {Chain, Fild, Fudge});
22333 // STRICT_FP_ROUND can't handle equal types.
22334 if (DstVT == MVT::f80)
22335 return Add;
22336 return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
22337 {Add.getValue(1), Add, DAG.getIntPtrConstant(0, dl)});
22338 }
22339 unsigned Opc = ISD::FADD;
22341 // Windows needs the precision control changed to 80 bits around this add.
22341 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
22342 Opc = X86ISD::FP80_ADD;
22343
22344 SDValue Add = DAG.getNode(Opc, dl, MVT::f80, Fild, Fudge);
22345 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
22346 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
22347}
22348
22349// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
22350// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
22351// just return an SDValue().
22352// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
22353// to i16, i32 or i64, and we lower it to a legal sequence and return the
22354// result.
22355SDValue
22356X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
22357 bool IsSigned, SDValue &Chain) const {
22358 bool IsStrict = Op->isStrictFPOpcode();
22359 SDLoc DL(Op);
22360
22361 EVT DstTy = Op.getValueType();
22362 SDValue Value = Op.getOperand(IsStrict ? 1 : 0);
22363 EVT TheVT = Value.getValueType();
22364 auto PtrVT = getPointerTy(DAG.getDataLayout());
22365
22366 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
22367 // f16 must be promoted before using the lowering in this routine.
22368 // fp128 does not use this lowering.
22369 return SDValue();
22370 }
22371
22372 // If using FIST to compute an unsigned i64, we'll need some fixup
22373 // to handle values above the maximum signed i64. A FIST is always
22374 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
22375 bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;
22376
22377 // FIXME: This does not generate an invalid exception if the input does not
22378 // fit in i32. PR44019
22379 if (!IsSigned && DstTy != MVT::i64) {
22380 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
22381 // The low 32 bits of the fist result will have the correct uint32 result.
22382 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
22383 DstTy = MVT::i64;
22384 }
22385
22386 assert(DstTy.getSimpleVT() <= MVT::i64 &&
22387 DstTy.getSimpleVT() >= MVT::i16 &&
22388 "Unknown FP_TO_INT to lower!");
22389
22390 // We lower FP->int64 into FISTP64 followed by a load from a temporary
22391 // stack slot.
22392 MachineFunction &MF = DAG.getMachineFunction();
22393 unsigned MemSize = DstTy.getStoreSize();
22394 int SSFI =
22395 MF.getFrameInfo().CreateStackObject(MemSize, Align(MemSize), false);
22396 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
22397
22398 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
22399
22400 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
22401
22402 if (UnsignedFixup) {
22403 //
22404 // Conversion to unsigned i64 is implemented with a select,
22405 // depending on whether the source value fits in the range
22406 // of a signed i64. Let Thresh be the FP equivalent of
22407 // 0x8000000000000000ULL.
22408 //
22409 // Adjust = (Value >= Thresh) ? 0x80000000 : 0;
22410 // FltOfs = (Value >= Thresh) ? Thresh : 0;
22411 // FistSrc = (Value - FltOfs);
22412 // Fist-to-mem64 FistSrc
22413 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
22414 // to XOR'ing the high 32 bits with Adjust.
22415 //
22416 // Being a power of 2, Thresh is exactly representable in all FP formats.
22417 // For X87 we'd like to use the smallest FP type for this constant, but
22418 // for DAG type consistency we have to match the FP operand type.
22419
22420 APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
22421 LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
22422 bool LosesInfo = false;
22423 if (TheVT == MVT::f64)
22424 // The rounding mode is irrelevant as the conversion should be exact.
22425 Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
22426 &LosesInfo);
22427 else if (TheVT == MVT::f80)
22428 Status = Thresh.convert(APFloat::x87DoubleExtended(),
22429 APFloat::rmNearestTiesToEven, &LosesInfo);
22430
22431 assert(Status == APFloat::opOK && !LosesInfo &&
22432 "FP conversion should have been exact");
22433
22434 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
22435
22436 EVT ResVT = getSetCCResultType(DAG.getDataLayout(),
22437 *DAG.getContext(), TheVT);
22438 SDValue Cmp;
22439 if (IsStrict) {
22440 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE, Chain,
22441 /*IsSignaling*/ true);
22442 Chain = Cmp.getValue(1);
22443 } else {
22444 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE);
22445 }
22446
22447 // Our preferred lowering of
22448 //
22449 // (Value >= Thresh) ? 0x8000000000000000ULL : 0
22450 //
22451 // is
22452 //
22453 // (Value >= Thresh) << 63
22454 //
22455 // but since we can get here after LegalOperations, DAGCombine might do the
22456 // wrong thing if we create a select. So, directly create the preferred
22457 // version.
22458 SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Cmp);
22459 SDValue Const63 = DAG.getConstant(63, DL, MVT::i8);
22460 Adjust = DAG.getNode(ISD::SHL, DL, MVT::i64, Zext, Const63);
22461
22462 SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp, ThreshVal,
22463 DAG.getConstantFP(0.0, DL, TheVT));
22464
22465 if (IsStrict) {
22466 Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},
22467 { Chain, Value, FltOfs });
22468 Chain = Value.getValue(1);
22469 } else
22470 Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs);
22471 }
22472
22473 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
22474
22475 // FIXME This causes a redundant load/store if the SSE-class value is already
22476 // in memory, such as if it is on the callstack.
22477 if (isScalarFPTypeInSSEReg(TheVT)) {
22478 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
22479 Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
22480 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
22481 SDValue Ops[] = { Chain, StackSlot };
22482
22483 unsigned FLDSize = TheVT.getStoreSize();
22484 assert(FLDSize <= MemSize && "Stack slot not big enough");
22485 MachineMemOperand *MMO = MF.getMachineMemOperand(
22486 MPI, MachineMemOperand::MOLoad, FLDSize, Align(FLDSize));
22487 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO);
22488 Chain = Value.getValue(1);
22489 }
22490
22491 // Build the FP_TO_INT*_IN_MEM
22492 MachineMemOperand *MMO = MF.getMachineMemOperand(
22493 MPI, MachineMemOperand::MOStore, MemSize, Align(MemSize));
22494 SDValue Ops[] = { Chain, Value, StackSlot };
22495 SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL,
22496 DAG.getVTList(MVT::Other),
22497 Ops, DstTy, MMO);
22498
22499 SDValue Res = DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot, MPI);
22500 Chain = Res.getValue(1);
22501
22502 // If we need an unsigned fixup, XOR the result with adjust.
22503 if (UnsignedFixup)
22504 Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust);
22505
22506 return Res;
22507}
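// Illustrative scalar sketch (editor addition, not part of X86ISelLowering.cpp):
// models the unsigned-fixup path above for a double -> u64 conversion, assuming
// the threshold computed earlier is 2^63 and that the signed conversion
// truncates toward zero, as FP_TO_INT_IN_MEM does.
#include <cstdint>

static uint64_t fpToUint64ViaSigned(double Value) {
  const double Thresh = 9223372036854775808.0; // 2^63, exactly representable
  uint64_t Adjust = 0;
  if (Value >= Thresh) {
    Adjust = 0x8000000000000000ULL; // (Value >= Thresh) << 63
    Value -= Thresh;                // FSUB by the selected FltOfs
  }
  // Signed conversion of the (possibly offset) value, then XOR in the fixup.
  uint64_t Res = static_cast<uint64_t>(static_cast<int64_t>(Value));
  return Res ^ Adjust;
}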
22508
22509static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
22510 const X86Subtarget &Subtarget) {
22511 MVT VT = Op.getSimpleValueType();
22512 SDValue In = Op.getOperand(0);
22513 MVT InVT = In.getSimpleValueType();
22514 SDLoc dl(Op);
22515 unsigned Opc = Op.getOpcode();
22516
22517 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
22518 assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) &&
22519        "Unexpected extension opcode");
22520 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
22521        "Expected same number of elements");
22522 assert((VT.getVectorElementType() == MVT::i16 ||
22523         VT.getVectorElementType() == MVT::i32 ||
22524         VT.getVectorElementType() == MVT::i64) &&
22525        "Unexpected element type");
22526 assert((InVT.getVectorElementType() == MVT::i8 ||
22527         InVT.getVectorElementType() == MVT::i16 ||
22528         InVT.getVectorElementType() == MVT::i32) &&
22529        "Unexpected element type");
22530
22531 unsigned ExtendInVecOpc = DAG.getOpcode_EXTEND_VECTOR_INREG(Opc);
22532
22533 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
22534 assert(InVT == MVT::v32i8 && "Unexpected VT!");
22535 return splitVectorIntUnary(Op, DAG);
22536 }
22537
22538 if (Subtarget.hasInt256())
22539 return Op;
22540
22541 // Optimize vectors in AVX mode:
22542 //
22543 // v8i16 -> v8i32
22544 // Use vpmovzwd for 4 lower elements v8i16 -> v4i32.
22545 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
22546 // Concat upper and lower parts.
22547 //
22548 // v4i32 -> v4i64
22549 // Use vpmovzdq for 4 lower elements v4i32 -> v2i64.
22550 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
22551 // Concat upper and lower parts.
22552 //
22553 MVT HalfVT = VT.getHalfNumVectorElementsVT();
22554 SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In);
22555
22556 // Short-circuit if we can determine that each 128-bit half is the same value.
22557 // Otherwise, this is difficult to match and optimize.
22558 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(In))
22559 if (hasIdenticalHalvesShuffleMask(Shuf->getMask()))
22560 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo);
22561
22562 SDValue ZeroVec = DAG.getConstant(0, dl, InVT);
22563 SDValue Undef = DAG.getUNDEF(InVT);
22564 bool NeedZero = Opc == ISD::ZERO_EXTEND;
22565 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
22566 OpHi = DAG.getBitcast(HalfVT, OpHi);
22567
22568 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
22569}
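// Editor's sketch (not from this file): the split strategy above, modeled on
// plain arrays for a v8i16 -> v8i32 zero extend; the two loops stand in for
// vpmovzxwd on the low half and vpunpckhwd-with-zero on the high half.
#include <array>
#include <cstdint>

static std::array<uint32_t, 8> zextV8i16(const std::array<uint16_t, 8> &In) {
  std::array<uint32_t, 8> Out{};
  for (int I = 0; I != 4; ++I)
    Out[I] = In[I];         // lower 4 elements: extend-in-reg
  for (int I = 0; I != 4; ++I)
    Out[4 + I] = In[4 + I]; // upper 4 elements: unpack high with zeros
  return Out;               // concat of the two halves
}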
22570
22571// Helper to split and extend a v16i1 mask to v16i8 or v16i16.
22572static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
22573 const SDLoc &dl, SelectionDAG &DAG) {
22574 assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
22575 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
22576 DAG.getIntPtrConstant(0, dl));
22577 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
22578 DAG.getIntPtrConstant(8, dl));
22579 Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);
22580 Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);
22581 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
22582 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
22583}
22584
22585static SDValue LowerZERO_EXTEND_Mask(SDValue Op,
22586 const X86Subtarget &Subtarget,
22587 SelectionDAG &DAG) {
22588 MVT VT = Op->getSimpleValueType(0);
22589 SDValue In = Op->getOperand(0);
22590 MVT InVT = In.getSimpleValueType();
22591 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
22592 SDLoc DL(Op);
22593 unsigned NumElts = VT.getVectorNumElements();
22594
22595 // For all vectors but vXi8, we can just emit a sign_extend and a shift. This
22596 // avoids a constant pool load.
22597 if (VT.getVectorElementType() != MVT::i8) {
22598 SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In);
22599 return DAG.getNode(ISD::SRL, DL, VT, Extend,
22600 DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
22601 }
22602
22603 // Extend VT if BWI is not supported.
22604 MVT ExtVT = VT;
22605 if (!Subtarget.hasBWI()) {
22606 // If v16i32 is to be avoided, we'll need to split and concatenate.
22607 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
22608 return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG);
22609
22610 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
22611 }
22612
22613 // Widen to 512-bits if VLX is not supported.
22614 MVT WideVT = ExtVT;
22615 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
22616 NumElts *= 512 / ExtVT.getSizeInBits();
22617 InVT = MVT::getVectorVT(MVT::i1, NumElts);
22618 In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT),
22619 In, DAG.getIntPtrConstant(0, DL));
22620 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(),
22621 NumElts);
22622 }
22623
22624 SDValue One = DAG.getConstant(1, DL, WideVT);
22625 SDValue Zero = DAG.getConstant(0, DL, WideVT);
22626
22627 SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
22628
22629 // Truncate if we had to extend above.
22630 if (VT != ExtVT) {
22631 WideVT = MVT::getVectorVT(MVT::i8, NumElts);
22632 SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
22633 }
22634
22635 // Extract back to 128/256-bit if we widened.
22636 if (WideVT != VT)
22637 SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
22638 DAG.getIntPtrConstant(0, DL));
22639
22640 return SelectedVal;
22641}
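// Editor's sketch (not from this file): the sign_extend + logical-shift trick
// used above for non-i8 element types, shown on a single i1 -> i32 lane.
#include <cstdint>

static uint32_t zextMaskBit(bool B) {
  int32_t Sext = B ? -1 : 0;                // sign_extend i1 -> i32 (all ones or zero)
  return static_cast<uint32_t>(Sext) >> 31; // srl by (bits - 1) leaves 0 or 1
}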
22642
22643static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
22644 SelectionDAG &DAG) {
22645 SDValue In = Op.getOperand(0);
22646 MVT SVT = In.getSimpleValueType();
22647
22648 if (SVT.getVectorElementType() == MVT::i1)
22649 return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG);
22650
22651 assert(Subtarget.hasAVX() && "Expected AVX support");
22652 return LowerAVXExtend(Op, DAG, Subtarget);
22653}
22654
22655/// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
22656/// It makes use of the fact that vectors with enough leading sign/zero bits
22657/// prevent the PACKSS/PACKUS from saturating the results.
22658/// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
22659/// within each 128-bit lane.
22660static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
22661 const SDLoc &DL, SelectionDAG &DAG,
22662 const X86Subtarget &Subtarget) {
22663 assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
22664        "Unexpected PACK opcode");
22665 assert(DstVT.isVector() && "VT not a vector?");
22666
22667 // Requires SSE2 for PACKSS (SSE41 PACKUSDW is handled below).
22668 if (!Subtarget.hasSSE2())
22669 return SDValue();
22670
22671 EVT SrcVT = In.getValueType();
22672
22673 // No truncation required, we might get here due to recursive calls.
22674 if (SrcVT == DstVT)
22675 return In;
22676
22677 // We only support vector truncation to 64bits or greater from a
22678 // 128bits or greater source.
22679 unsigned DstSizeInBits = DstVT.getSizeInBits();
22680 unsigned SrcSizeInBits = SrcVT.getSizeInBits();
22681 if ((DstSizeInBits % 64) != 0 || (SrcSizeInBits % 128) != 0)
22682 return SDValue();
22683
22684 unsigned NumElems = SrcVT.getVectorNumElements();
22685 if (!isPowerOf2_32(NumElems))
22686 return SDValue();
22687
22688 LLVMContext &Ctx = *DAG.getContext();
22689 assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
22690 assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
22691
22692 EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
22693
22694 // Pack to the largest type possible:
22695 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
22696 EVT InVT = MVT::i16, OutVT = MVT::i8;
22697 if (SrcVT.getScalarSizeInBits() > 16 &&
22698 (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
22699 InVT = MVT::i32;
22700 OutVT = MVT::i16;
22701 }
22702
22703 // 128bit -> 64bit truncate - PACK 128-bit src in the lower subvector.
22704 if (SrcVT.is128BitVector()) {
22705 InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
22706 OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
22707 In = DAG.getBitcast(InVT, In);
22708 SDValue Res = DAG.getNode(Opcode, DL, OutVT, In, DAG.getUNDEF(InVT));
22709 Res = extractSubVector(Res, 0, DAG, DL, 64);
22710 return DAG.getBitcast(DstVT, Res);
22711 }
22712
22713 // Split lower/upper subvectors.
22714 SDValue Lo, Hi;
22715 std::tie(Lo, Hi) = splitVector(In, DAG, DL);
22716
22717 unsigned SubSizeInBits = SrcSizeInBits / 2;
22718 InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
22719 OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
22720
22721 // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
22722 if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
22723 Lo = DAG.getBitcast(InVT, Lo);
22724 Hi = DAG.getBitcast(InVT, Hi);
22725 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
22726 return DAG.getBitcast(DstVT, Res);
22727 }
22728
22729 // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
22730 // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
22731 if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
22732 Lo = DAG.getBitcast(InVT, Lo);
22733 Hi = DAG.getBitcast(InVT, Hi);
22734 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
22735
22736 // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
22737 // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
22738 // Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits.
22739 SmallVector<int, 64> Mask;
22740 int Scale = 64 / OutVT.getScalarSizeInBits();
22741 narrowShuffleMaskElts(Scale, { 0, 2, 1, 3 }, Mask);
22742 Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask);
22743
22744 if (DstVT.is256BitVector())
22745 return DAG.getBitcast(DstVT, Res);
22746
22747 // If 512bit -> 128bit truncate another stage.
22748 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
22749 Res = DAG.getBitcast(PackedVT, Res);
22750 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
22751 }
22752
22753 // Recursively pack lower/upper subvectors, concat result and pack again.
22754 assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
22755 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems / 2);
22756 Lo = truncateVectorWithPACK(Opcode, PackedVT, Lo, DL, DAG, Subtarget);
22757 Hi = truncateVectorWithPACK(Opcode, PackedVT, Hi, DL, DAG, Subtarget);
22758
22759 PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
22760 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
22761 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
22762}
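// Editor's sketch (not from this file): per-lane PACKUSWB/PACKSSWB saturation.
// The helper above only uses PACK nodes when the inputs already have enough
// leading zero/sign bits, so the clamps below never actually change a value.
#include <algorithm>
#include <cstdint>

static uint8_t packusLane(uint16_t V) { // unsigned saturate i16 -> i8
  return static_cast<uint8_t>(std::min<uint16_t>(V, 255));
}
static int8_t packssLane(int16_t V) {   // signed saturate i16 -> i8
  return static_cast<int8_t>(std::clamp<int16_t>(V, -128, 127));
}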
22763
22764static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
22765 const X86Subtarget &Subtarget) {
22766
22767 SDLoc DL(Op);
22768 MVT VT = Op.getSimpleValueType();
22769 SDValue In = Op.getOperand(0);
22770 MVT InVT = In.getSimpleValueType();
22771
22772 assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
22773
22774 // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
22775 unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
22776 if (InVT.getScalarSizeInBits() <= 16) {
22777 if (Subtarget.hasBWI()) {
22778 // legal, will go to VPMOVB2M, VPMOVW2M
22779 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
22780 // We need to shift to get the lsb into sign position.
22781 // Shift packed bytes not supported natively, bitcast to word
22782 MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
22783 In = DAG.getNode(ISD::SHL, DL, ExtVT,
22784 DAG.getBitcast(ExtVT, In),
22785 DAG.getConstant(ShiftInx, DL, ExtVT));
22786 In = DAG.getBitcast(InVT, In);
22787 }
22788 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
22789 In, ISD::SETGT);
22790 }
22791 // Use TESTD/Q, extended vector to packed dword/qword.
22792 assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
22793        "Unexpected vector type.");
22794 unsigned NumElts = InVT.getVectorNumElements();
22795 assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements");
22796 // We need to change to a wider element type that we have support for.
22797 // For 8 element vectors this is easy, we either extend to v8i32 or v8i64.
22798 // For 16 element vectors we extend to v16i32 unless we are explicitly
22799 // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
22800 // we need to split into two 8 element vectors which we can extend to v8i32,
22801 // truncate and concat the results. There's an additional complication if
22802 // the original type is v16i8. In that case we can't split the v16i8
22803 // directly, so we need to shuffle high elements to low and use
22804 // sign_extend_vector_inreg.
22805 if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
22806 SDValue Lo, Hi;
22807 if (InVT == MVT::v16i8) {
22808 Lo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, In);
22809 Hi = DAG.getVectorShuffle(
22810 InVT, DL, In, In,
22811 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
22812 Hi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, Hi);
22813 } else {
22814 assert(InVT == MVT::v16i16 && "Unexpected VT!");
22815 Lo = extract128BitVector(In, 0, DAG, DL);
22816 Hi = extract128BitVector(In, 8, DAG, DL);
22817 }
22818 // We're split now, just emit two truncates and a concat. The two
22819 // truncates will trigger legalization to come back to this function.
22820 Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
22821 Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi);
22822 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
22823 }
22824 // We either have 8 elements or we're allowed to use 512-bit vectors.
22825 // If we have VLX, we want to use the narrowest vector that can get the
22826 // job done so we use vXi32.
22827 MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
22828 MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
22829 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
22830 InVT = ExtVT;
22831 ShiftInx = InVT.getScalarSizeInBits() - 1;
22832 }
22833
22834 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
22835 // We need to shift to get the lsb into sign position.
22836 In = DAG.getNode(ISD::SHL, DL, InVT, In,
22837 DAG.getConstant(ShiftInx, DL, InVT));
22838 }
22839 // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
22840 if (Subtarget.hasDQI())
22841 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT);
22842 return DAG.getSetCC(DL, VT, In, DAG.getConstant(0, DL, InVT), ISD::SETNE);
22843}
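// Editor's sketch (not from this file): the "shift the lsb into the sign bit,
// then test the sign" idea above, on one i16 lane (two's complement assumed).
#include <cstdint>

static bool truncLaneToI1(uint16_t Lane) {
  uint16_t Shifted = static_cast<uint16_t>(Lane << 15); // lsb now in sign position
  return static_cast<int16_t>(Shifted) < 0;             // setgt(0, x): sign bit set?
}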
22844
22845SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
22846 SDLoc DL(Op);
22847 MVT VT = Op.getSimpleValueType();
22848 SDValue In = Op.getOperand(0);
22849 MVT InVT = In.getSimpleValueType();
22850 unsigned InNumEltBits = InVT.getScalarSizeInBits();
22851
22852 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
22853        "Invalid TRUNCATE operation");
22854
22855 // If we're called by the type legalizer, handle a few cases.
22856 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
22857 if (!TLI.isTypeLegal(InVT)) {
22858 if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) &&
22859 VT.is128BitVector()) {
22860 assert((InVT == MVT::v16i64 || Subtarget.hasVLX()) &&
22861        "Unexpected subtarget!");
22862 // The default behavior is to truncate one step, concatenate, and then
22863 // truncate the remainder. We'd rather produce two 64-bit results and
22864 // concatenate those.
22865 SDValue Lo, Hi;
22866 std::tie(Lo, Hi) = DAG.SplitVector(In, DL);
22867
22868 EVT LoVT, HiVT;
22869 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
22870
22871 Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, Lo);
22872 Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi);
22873 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
22874 }
22875
22876 // Otherwise let default legalization handle it.
22877 return SDValue();
22878 }
22879
22880 if (VT.getVectorElementType() == MVT::i1)
22881 return LowerTruncateVecI1(Op, DAG, Subtarget);
22882
22883 // vpmovqb/w/d, vpmovdb/w, vpmovwb
22884 if (Subtarget.hasAVX512()) {
22885 if (InVT == MVT::v32i16 && !Subtarget.hasBWI()) {
22886 assert(VT == MVT::v32i8 && "Unexpected VT!");
22887 return splitVectorIntUnary(Op, DAG);
22888 }
22889
22890 // word to byte only under BWI. Otherwise we have to promote to v16i32
22891 // and then truncate that. But we should only do that if we haven't been
22892 // asked to avoid 512-bit vectors. The actual promotion to v16i32 will be
22893 // handled by isel patterns.
22894 if (InVT != MVT::v16i16 || Subtarget.hasBWI() ||
22895 Subtarget.canExtendTo512DQ())
22896 return Op;
22897 }
22898
22899 unsigned NumPackedSignBits = std::min<unsigned>(VT.getScalarSizeInBits(), 16);
22900 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
22901
22902 // Truncate with PACKUS if we are truncating a vector with leading zero bits
22903 // that extend all the way to the packed/truncated value.
22904 // Pre-SSE41 we can only use PACKUSWB.
22905 KnownBits Known = DAG.computeKnownBits(In);
22906 if ((InNumEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros())
22907 if (SDValue V =
22908 truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget))
22909 return V;
22910
22911 // Truncate with PACKSS if we are truncating a vector with sign-bits that
22912 // extend all the way to the packed/truncated value.
22913 if ((InNumEltBits - NumPackedSignBits) < DAG.ComputeNumSignBits(In))
22914 if (SDValue V =
22915 truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget))
22916 return V;
22917
22918 // Handle truncation of V256 to V128 using shuffles.
22919 assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
22920
22921 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
22922 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
22923 if (Subtarget.hasInt256()) {
22924 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
22925 In = DAG.getBitcast(MVT::v8i32, In);
22926 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
22927 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
22928 DAG.getIntPtrConstant(0, DL));
22929 }
22930
22931 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
22932 DAG.getIntPtrConstant(0, DL));
22933 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
22934 DAG.getIntPtrConstant(2, DL));
22935 static const int ShufMask[] = {0, 2, 4, 6};
22936 return DAG.getVectorShuffle(VT, DL, DAG.getBitcast(MVT::v4i32, OpLo),
22937 DAG.getBitcast(MVT::v4i32, OpHi), ShufMask);
22938 }
22939
22940 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
22941 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
22942 if (Subtarget.hasInt256()) {
22943 // The PSHUFB mask:
22944 static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
22945 -1, -1, -1, -1, -1, -1, -1, -1,
22946 16, 17, 20, 21, 24, 25, 28, 29,
22947 -1, -1, -1, -1, -1, -1, -1, -1 };
22948 In = DAG.getBitcast(MVT::v32i8, In);
22949 In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
22950 In = DAG.getBitcast(MVT::v4i64, In);
22951
22952 static const int ShufMask2[] = {0, 2, -1, -1};
22953 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
22954 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
22955 DAG.getIntPtrConstant(0, DL));
22956 return DAG.getBitcast(MVT::v8i16, In);
22957 }
22958
22959 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
22960 DAG.getIntPtrConstant(0, DL));
22961 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
22962 DAG.getIntPtrConstant(4, DL));
22963
22964 // The PSHUFB mask:
22965 static const int ShufMask1[] = {0, 2, 4, 6, -1, -1, -1, -1};
22966
22967 OpLo = DAG.getBitcast(MVT::v8i16, OpLo);
22968 OpHi = DAG.getBitcast(MVT::v8i16, OpHi);
22969
22970 OpLo = DAG.getVectorShuffle(MVT::v8i16, DL, OpLo, OpLo, ShufMask1);
22971 OpHi = DAG.getVectorShuffle(MVT::v8i16, DL, OpHi, OpHi, ShufMask1);
22972
22973 OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
22974 OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
22975
22976 // The MOVLHPS Mask:
22977 static const int ShufMask2[] = {0, 1, 4, 5};
22978 SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
22979 return DAG.getBitcast(MVT::v8i16, res);
22980 }
22981
22982 if (VT == MVT::v16i8 && InVT == MVT::v16i16) {
22983 // Use an AND to zero upper bits for PACKUS.
22984 In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(255, DL, InVT));
22985
22986 SDValue InLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
22987 DAG.getIntPtrConstant(0, DL));
22988 SDValue InHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
22989 DAG.getIntPtrConstant(8, DL));
22990 return DAG.getNode(X86ISD::PACKUS, DL, VT, InLo, InHi);
22991 }
22992
22993 llvm_unreachable("All 256->128 cases should have been handled above!");
22994}
22995
22996// We can leverage the specific way the "cvttps2dq/cvttpd2dq" instruction
22997// behaves on out of range inputs to generate optimized conversions.
22998static SDValue expandFP_TO_UINT_SSE(MVT VT, SDValue Src, const SDLoc &dl,
22999 SelectionDAG &DAG,
23000 const X86Subtarget &Subtarget) {
23001 MVT SrcVT = Src.getSimpleValueType();
23002 unsigned DstBits = VT.getScalarSizeInBits();
23003 assert(DstBits == 32 && "expandFP_TO_UINT_SSE - only vXi32 supported");
23004
23005 // Calculate the converted result for values in the range 0 to
23006 // 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
23007 SDValue Small = DAG.getNode(X86ISD::CVTTP2SI, dl, VT, Src);
23008 SDValue Big =
23009 DAG.getNode(X86ISD::CVTTP2SI, dl, VT,
23010 DAG.getNode(ISD::FSUB, dl, SrcVT, Src,
23011 DAG.getConstantFP(2147483648.0f, dl, SrcVT)));
23012
23013 // The "CVTTP2SI" instruction conveniently sets the sign bit if
23014 // and only if the value was out of range. So we can use that
23015 // as our indicator that we rather use "Big" instead of "Small".
23016 //
23017 // Use "Small" if "IsOverflown" has all bits cleared
23018 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
23019
23020 // AVX1 can't use the signsplat masking for 256-bit vectors - we have to
23021 // use the slightly slower blendv select instead.
23022 if (VT == MVT::v8i32 && !Subtarget.hasAVX2()) {
23023 SDValue Overflow = DAG.getNode(ISD::OR, dl, VT, Small, Big);
23024 return DAG.getNode(X86ISD::BLENDV, dl, VT, Small, Overflow, Small);
23025 }
23026
23027 SDValue IsOverflown =
23028 DAG.getNode(X86ISD::VSRAI, dl, VT, Small,
23029 DAG.getTargetConstant(DstBits - 1, dl, MVT::i8));
23030 return DAG.getNode(ISD::OR, dl, VT, Small,
23031 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
23032}
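// Editor's sketch (not from this file): scalar model of the expansion above.
// It assumes cvttps2dq semantics: out-of-range inputs (including NaN) produce
// 0x80000000, which doubles as the overflow indicator and the missing top bit.
#include <cstdint>

static uint32_t fpToUint32SSE(float Src) {
  auto Cvt = [](float F) -> int32_t {
    if (!(F >= -2147483648.0f && F < 2147483648.0f))
      return INT32_MIN;                  // cvttps2dq out-of-range result
    return static_cast<int32_t>(F);      // truncate toward zero
  };
  int32_t Small = Cvt(Src);
  int32_t Big = Cvt(Src - 2147483648.0f); // shifted back into signed range
  int32_t IsOverflown = Small >> 31;      // arithmetic shift: all ones iff overflow
  return static_cast<uint32_t>(Small | (Big & IsOverflown));
}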
23033
23034SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
23035 bool IsStrict = Op->isStrictFPOpcode();
23036 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
23037 Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
23038 MVT VT = Op->getSimpleValueType(0);
23039 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
23040 SDValue Chain = IsStrict ? Op->getOperand(0) : SDValue();
23041 MVT SrcVT = Src.getSimpleValueType();
23042 SDLoc dl(Op);
23043
23044 SDValue Res;
23045 if (isSoftFP16(SrcVT)) {
23046 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
23047 if (IsStrict)
23048 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
23049 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
23050 {NVT, MVT::Other}, {Chain, Src})});
23051 return DAG.getNode(Op.getOpcode(), dl, VT,
23052 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
23053 } else if (isTypeLegal(SrcVT) && isLegalConversion(VT, IsSigned, Subtarget)) {
23054 return Op;
23055 }
23056
23057 if (VT.isVector()) {
23058 if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
23059 MVT ResVT = MVT::v4i32;
23060 MVT TruncVT = MVT::v4i1;
23061 unsigned Opc;
23062 if (IsStrict)
23063 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
23064 else
23065 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
23066
23067 if (!IsSigned && !Subtarget.hasVLX()) {
23068 assert(Subtarget.useAVX512Regs() && "Unexpected features!");
23069 // Widen to 512-bits.
23070 ResVT = MVT::v8i32;
23071 TruncVT = MVT::v8i1;
23072 Opc = Op.getOpcode();
23073 // Need to concat with zero vector for strict fp to avoid spurious
23074 // exceptions.
23075 // TODO: Should we just do this for non-strict as well?
23076 SDValue Tmp = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v8f64)
23077 : DAG.getUNDEF(MVT::v8f64);
23078 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, Tmp, Src,
23079 DAG.getIntPtrConstant(0, dl));
23080 }
23081 if (IsStrict) {
23082 Res = DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Chain, Src});
23083 Chain = Res.getValue(1);
23084 } else {
23085 Res = DAG.getNode(Opc, dl, ResVT, Src);
23086 }
23087
23088 Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
23089 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
23090 DAG.getIntPtrConstant(0, dl));
23091 if (IsStrict)
23092 return DAG.getMergeValues({Res, Chain}, dl);
23093 return Res;
23094 }
23095
23096 if (Subtarget.hasFP16() && SrcVT.getVectorElementType() == MVT::f16) {
23097 if (VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16)
23098 return Op;
23099
23100 MVT ResVT = VT;
23101 MVT EleVT = VT.getVectorElementType();
23102 if (EleVT != MVT::i64)
23103 ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
23104
23105 if (SrcVT != MVT::v8f16) {
23106 SDValue Tmp =
23107 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
23108 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
23109 Ops[0] = Src;
23110 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
23111 }
23112
23113 if (IsStrict) {
23114 Res = DAG.getNode(IsSigned ? X86ISD::STRICT_CVTTP2SI
23115 : X86ISD::STRICT_CVTTP2UI,
23116 dl, {ResVT, MVT::Other}, {Chain, Src});
23117 Chain = Res.getValue(1);
23118 } else {
23119 Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl,
23120 ResVT, Src);
23121 }
23122
23123 // TODO: Need to add exception check code for strict FP.
23124 if (EleVT.getSizeInBits() < 16) {
23125 ResVT = MVT::getVectorVT(EleVT, 8);
23126 Res = DAG.getNode(ISD::TRUNCATE, dl, ResVT, Res);
23127 }
23128
23129 if (ResVT != VT)
23130 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
23131 DAG.getIntPtrConstant(0, dl));
23132
23133 if (IsStrict)
23134 return DAG.getMergeValues({Res, Chain}, dl);
23135 return Res;
23136 }
23137
23138 // v8f32/v16f32/v8f64->v8i16/v16i16 need to widen first.
23139 if (VT.getVectorElementType() == MVT::i16) {
23140 assert((SrcVT.getVectorElementType() == MVT::f32 ||
23141         SrcVT.getVectorElementType() == MVT::f64) &&
23142        "Expected f32/f64 vector!");
23143 MVT NVT = VT.changeVectorElementType(MVT::i32);
23144 if (IsStrict) {
23145 Res = DAG.getNode(IsSigned ? ISD::STRICT_FP_TO_SINT
23146 : ISD::STRICT_FP_TO_UINT,
23147 dl, {NVT, MVT::Other}, {Chain, Src});
23148 Chain = Res.getValue(1);
23149 } else {
23150 Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, dl,
23151 NVT, Src);
23152 }
23153
23154 // TODO: Need to add exception check code for strict FP.
23155 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
23156
23157 if (IsStrict)
23158 return DAG.getMergeValues({Res, Chain}, dl);
23159 return Res;
23160 }
23161
23162 // v8f64->v8i32 is legal, but we need v8i32 to be custom for v8f32.
23163 if (VT == MVT::v8i32 && SrcVT == MVT::v8f64) {
23164 assert(!IsSigned && "Expected unsigned conversion!");
23165 assert(Subtarget.useAVX512Regs() && "Requires avx512f");
23166 return Op;
23167 }
23168
23169 // Widen vXi32 fp_to_uint with avx512f to 512-bit source.
23170 if ((VT == MVT::v4i32 || VT == MVT::v8i32) &&
23171 (SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32) &&
23172 Subtarget.useAVX512Regs()) {
23173 assert(!IsSigned && "Expected unsigned conversion!");
23174 assert(!Subtarget.hasVLX() && "Unexpected features!");
23175 MVT WideVT = SrcVT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
23176 MVT ResVT = SrcVT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
23177 // Need to concat with zero vector for strict fp to avoid spurious
23178 // exceptions.
23179 // TODO: Should we just do this for non-strict as well?
23180 SDValue Tmp =
23181 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
23182 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
23183 DAG.getIntPtrConstant(0, dl));
23184
23185 if (IsStrict) {
23186 Res = DAG.getNode(ISD::STRICT_FP_TO_UINT, dl, {ResVT, MVT::Other},
23187 {Chain, Src});
23188 Chain = Res.getValue(1);
23189 } else {
23190 Res = DAG.getNode(ISD::FP_TO_UINT, dl, ResVT, Src);
23191 }
23192
23193 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
23194 DAG.getIntPtrConstant(0, dl));
23195
23196 if (IsStrict)
23197 return DAG.getMergeValues({Res, Chain}, dl);
23198 return Res;
23199 }
23200
23201 // Widen vXi64 fp_to_uint/fp_to_sint with avx512dq to 512-bit source.
23202 if ((VT == MVT::v2i64 || VT == MVT::v4i64) &&
23203 (SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32) &&
23204 Subtarget.useAVX512Regs() && Subtarget.hasDQI()) {
23205 assert(!Subtarget.hasVLX() && "Unexpected features!");
23206 MVT WideVT = SrcVT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
23207 // Need to concat with zero vector for strict fp to avoid spurious
23208 // exceptions.
23209 // TODO: Should we just do this for non-strict as well?
23210 SDValue Tmp =
23211 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
23212 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
23213 DAG.getIntPtrConstant(0, dl));
23214
23215 if (IsStrict) {
23216 Res = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
23217 {Chain, Src});
23218 Chain = Res.getValue(1);
23219 } else {
23220 Res = DAG.getNode(Op.getOpcode(), dl, MVT::v8i64, Src);
23221 }
23222
23223 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
23224 DAG.getIntPtrConstant(0, dl));
23225
23226 if (IsStrict)
23227 return DAG.getMergeValues({Res, Chain}, dl);
23228 return Res;
23229 }
23230
23231 if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
23232 if (!Subtarget.hasVLX()) {
23233 // Non-strict nodes without VLX can be widened to v4f32->v4i64 by the type
23234 // legalizer and then widened again by vector op legalization.
23235 if (!IsStrict)
23236 return SDValue();
23237
23238 SDValue Zero = DAG.getConstantFP(0.0, dl, MVT::v2f32);
23239 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f32,
23240 {Src, Zero, Zero, Zero});
23241 Tmp = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
23242 {Chain, Tmp});
23243 SDValue Chain = Tmp.getValue(1);
23244 Tmp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Tmp,
23245 DAG.getIntPtrConstant(0, dl));
23246 return DAG.getMergeValues({Tmp, Chain}, dl);
23247 }
23248
23249 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL");
23250 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
23251 DAG.getUNDEF(MVT::v2f32));
23252 if (IsStrict) {
23253 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI
23254 : X86ISD::STRICT_CVTTP2UI;
23255 return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp});
23256 }
23257 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
23258 return DAG.getNode(Opc, dl, VT, Tmp);
23259 }
23260
23261 // Generate optimized instructions for pre AVX512 unsigned conversions from
23262 // vXf32 to vXi32.
23263 if ((VT == MVT::v4i32 && SrcVT == MVT::v4f32) ||
23264 (VT == MVT::v4i32 && SrcVT == MVT::v4f64) ||
23265 (VT == MVT::v8i32 && SrcVT == MVT::v8f32)) {
23266 assert(!IsSigned && "Expected unsigned conversion!");
23267 return expandFP_TO_UINT_SSE(VT, Src, dl, DAG, Subtarget);
23268 }
23269
23270 return SDValue();
23271 }
23272
23273 assert(!VT.isVector());
23274
23275 bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT);
23276
23277 if (!IsSigned && UseSSEReg) {
23278 // Conversions from f32/f64 with AVX512 should be legal.
23279 if (Subtarget.hasAVX512())
23280 return Op;
23281
23282 // We can leverage the specific way the "cvttss2si/cvttsd2si" instruction
23283 // behaves on out of range inputs to generate optimized conversions.
23284 if (!IsStrict && ((VT == MVT::i32 && !Subtarget.is64Bit()) ||
23285 (VT == MVT::i64 && Subtarget.is64Bit()))) {
23286 unsigned DstBits = VT.getScalarSizeInBits();
23287 APInt UIntLimit = APInt::getSignMask(DstBits);
23288 SDValue FloatOffset = DAG.getNode(ISD::UINT_TO_FP, dl, SrcVT,
23289 DAG.getConstant(UIntLimit, dl, VT));
23290 MVT SrcVecVT = MVT::getVectorVT(SrcVT, 128 / SrcVT.getScalarSizeInBits());
23291
23292 // Calculate the converted result for values in the range:
23293 // (i32) 0 to 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
23294 // (i64) 0 to 2^63-1 ("Small") and from 2^63 to 2^64-1 ("Big").
23295 SDValue Small =
23296 DAG.getNode(X86ISD::CVTTS2SI, dl, VT,
23297 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT, Src));
23298 SDValue Big = DAG.getNode(
23299 X86ISD::CVTTS2SI, dl, VT,
23300 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT,
23301 DAG.getNode(ISD::FSUB, dl, SrcVT, Src, FloatOffset)));
23302
23303 // The "CVTTS2SI" instruction conveniently sets the sign bit if
23304 // and only if the value was out of range. So we can use that
23305 // as our indicator that we rather use "Big" instead of "Small".
23306 //
23307 // Use "Small" if "IsOverflown" has all bits cleared
23308 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
23309 SDValue IsOverflown = DAG.getNode(
23310 ISD::SRA, dl, VT, Small, DAG.getConstant(DstBits - 1, dl, MVT::i8));
23311 return DAG.getNode(ISD::OR, dl, VT, Small,
23312 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
23313 }
23314
23315 // Use default expansion for i64.
23316 if (VT == MVT::i64)
23317 return SDValue();
23318
23319 assert(VT == MVT::i32 && "Unexpected VT!");
23320
23321 // Promote i32 to i64 and use a signed operation on 64-bit targets.
23322 // FIXME: This does not generate an invalid exception if the input does not
23323 // fit in i32. PR44019
23324 if (Subtarget.is64Bit()) {
23325 if (IsStrict) {
23326 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i64, MVT::Other},
23327 {Chain, Src});
23328 Chain = Res.getValue(1);
23329 } else
23330 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src);
23331
23332 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
23333 if (IsStrict)
23334 return DAG.getMergeValues({Res, Chain}, dl);
23335 return Res;
23336 }
23337
23338 // Use default expansion for SSE1/2 targets without SSE3. With SSE3 we can
23339 // use fisttp which will be handled later.
23340 if (!Subtarget.hasSSE3())
23341 return SDValue();
23342 }
23343
23344 // Promote i16 to i32 if we can use a SSE operation or the type is f128.
23345 // FIXME: This does not generate an invalid exception if the input does not
23346 // fit in i16. PR44019
23347 if (VT == MVT::i16 && (UseSSEReg || SrcVT == MVT::f128)) {
23348 assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!");
23349 if (IsStrict) {
23350 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i32, MVT::Other},
23351 {Chain, Src});
23352 Chain = Res.getValue(1);
23353 } else
23354 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);
23355
23356 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
23357 if (IsStrict)
23358 return DAG.getMergeValues({Res, Chain}, dl);
23359 return Res;
23360 }
23361
23362 // If this is a FP_TO_SINT using SSEReg we're done.
23363 if (UseSSEReg && IsSigned)
23364 return Op;
23365
23366 // fp128 needs to use a libcall.
23367 if (SrcVT == MVT::f128) {
23368 RTLIB::Libcall LC;
23369 if (IsSigned)
23370 LC = RTLIB::getFPTOSINT(SrcVT, VT);
23371 else
23372 LC = RTLIB::getFPTOUINT(SrcVT, VT);
23373
23374 MakeLibCallOptions CallOptions;
23375 std::pair<SDValue, SDValue> Tmp = makeLibCall(DAG, LC, VT, Src, CallOptions,
23376 SDLoc(Op), Chain);
23377
23378 if (IsStrict)
23379 return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
23380
23381 return Tmp.first;
23382 }
23383
23384 // Fall back to X87.
23385 if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned, Chain)) {
23386 if (IsStrict)
23387 return DAG.getMergeValues({V, Chain}, dl);
23388 return V;
23389 }
23390
23391 llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.");
23392}
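// Editor's sketch (not from this file): the scalar promotion path above for
// i32 results on 64-bit targets: convert through i64, then truncate. As the
// FIXME notes, this does not raise the invalid exception for inputs that do
// not fit in i32; the sketch assumes the input fits in i64.
#include <cstdint>

static int32_t fpToSint32Via64(double Src) {
  int64_t Wide = static_cast<int64_t>(Src); // native 64-bit signed conversion
  return static_cast<int32_t>(Wide);        // truncate back to i32
}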
23393
23394SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op,
23395 SelectionDAG &DAG) const {
23396 SDValue Src = Op.getOperand(0);
23397 MVT SrcVT = Src.getSimpleValueType();
23398
23399 if (SrcVT == MVT::f16)
23400 return SDValue();
23401
23402 // If the source is in an SSE register, the node is Legal.
23403 if (isScalarFPTypeInSSEReg(SrcVT))
23404 return Op;
23405
23406 return LRINT_LLRINTHelper(Op.getNode(), DAG);
23407}
23408
23409SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N,
23410 SelectionDAG &DAG) const {
23411 EVT DstVT = N->getValueType(0);
23412 SDValue Src = N->getOperand(0);
23413 EVT SrcVT = Src.getValueType();
23414
23415 if (SrcVT != MVT::f32 && SrcVT != MVT::f64 && SrcVT != MVT::f80) {
23416 // f16 must be promoted before using the lowering in this routine.
23417 // fp128 does not use this lowering.
23418 return SDValue();
23419 }
23420
23421 SDLoc DL(N);
23422 SDValue Chain = DAG.getEntryNode();
23423
23424 bool UseSSE = isScalarFPTypeInSSEReg(SrcVT);
23425
23426 // If we're converting from SSE, the stack slot needs to hold both types.
23427 // Otherwise it only needs to hold the DstVT.
23428 EVT OtherVT = UseSSE ? SrcVT : DstVT;
23429 SDValue StackPtr = DAG.CreateStackTemporary(DstVT, OtherVT);
23430 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
23431 MachinePointerInfo MPI =
23432 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
23433
23434 if (UseSSE) {
23435 assert(DstVT == MVT::i64 && "Invalid LRINT/LLRINT to lower!");
23436 Chain = DAG.getStore(Chain, DL, Src, StackPtr, MPI);
23437 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
23438 SDValue Ops[] = { Chain, StackPtr };
23439
23440 Src = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, SrcVT, MPI,
23441 /*Align*/ std::nullopt,
23442 MachineMemOperand::MOLoad);
23443 Chain = Src.getValue(1);
23444 }
23445
23446 SDValue StoreOps[] = { Chain, Src, StackPtr };
23447 Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, DL, DAG.getVTList(MVT::Other),
23448 StoreOps, DstVT, MPI, /*Align*/ std::nullopt,
23449 MachineMemOperand::MOStore);
23450
23451 return DAG.getLoad(DstVT, DL, Chain, StackPtr, MPI);
23452}
23453
23454SDValue
23455X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
23456 // This is based on the TargetLowering::expandFP_TO_INT_SAT implementation,
23457 // but making use of X86 specifics to produce better instruction sequences.
23458 SDNode *Node = Op.getNode();
23459 bool IsSigned = Node->getOpcode() == ISD::FP_TO_SINT_SAT;
23460 unsigned FpToIntOpcode = IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
23461 SDLoc dl(SDValue(Node, 0));
23462 SDValue Src = Node->getOperand(0);
23463
23464 // There are three types involved here: SrcVT is the source floating point
23465 // type, DstVT is the type of the result, and TmpVT is the result of the
23466 // intermediate FP_TO_*INT operation we'll use (which may be a promotion of
23467 // DstVT).
23468 EVT SrcVT = Src.getValueType();
23469 EVT DstVT = Node->getValueType(0);
23470 EVT TmpVT = DstVT;
23471
23472 // This code is only for floats and doubles. Fall back to generic code for
23473 // anything else.
23474 if (!isScalarFPTypeInSSEReg(SrcVT) || isSoftFP16(SrcVT))
23475 return SDValue();
23476
23477 EVT SatVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
23478 unsigned SatWidth = SatVT.getScalarSizeInBits();
23479 unsigned DstWidth = DstVT.getScalarSizeInBits();
23480 unsigned TmpWidth = TmpVT.getScalarSizeInBits();
23481 assert(SatWidth <= DstWidth && SatWidth <= TmpWidth &&
23482        "Expected saturation width smaller than result width");
23483
23484 // Promote result of FP_TO_*INT to at least 32 bits.
23485 if (TmpWidth < 32) {
23486 TmpVT = MVT::i32;
23487 TmpWidth = 32;
23488 }
23489
23490 // Promote conversions to unsigned 32-bit to 64-bit, because it will allow
23491 // us to use a native signed conversion instead.
23492 if (SatWidth == 32 && !IsSigned && Subtarget.is64Bit()) {
23493 TmpVT = MVT::i64;
23494 TmpWidth = 64;
23495 }
23496
23497 // If the saturation width is smaller than the size of the temporary result,
23498 // we can always use signed conversion, which is native.
23499 if (SatWidth < TmpWidth)
23500 FpToIntOpcode = ISD::FP_TO_SINT;
23501
23502 // Determine minimum and maximum integer values and their corresponding
23503 // floating-point values.
23504 APInt MinInt, MaxInt;
23505 if (IsSigned) {
23506 MinInt = APInt::getSignedMinValue(SatWidth).sext(DstWidth);
23507 MaxInt = APInt::getSignedMaxValue(SatWidth).sext(DstWidth);
23508 } else {
23509 MinInt = APInt::getMinValue(SatWidth).zext(DstWidth);
23510 MaxInt = APInt::getMaxValue(SatWidth).zext(DstWidth);
23511 }
23512
23513 APFloat MinFloat(DAG.EVTToAPFloatSemantics(SrcVT));
23514 APFloat MaxFloat(DAG.EVTToAPFloatSemantics(SrcVT));
23515
23516 APFloat::opStatus MinStatus = MinFloat.convertFromAPInt(
23517 MinInt, IsSigned, APFloat::rmTowardZero);
23518 APFloat::opStatus MaxStatus = MaxFloat.convertFromAPInt(
23519 MaxInt, IsSigned, APFloat::rmTowardZero);
23520 bool AreExactFloatBounds = !(MinStatus & APFloat::opStatus::opInexact)
23521 && !(MaxStatus & APFloat::opStatus::opInexact);
23522
23523 SDValue MinFloatNode = DAG.getConstantFP(MinFloat, dl, SrcVT);
23524 SDValue MaxFloatNode = DAG.getConstantFP(MaxFloat, dl, SrcVT);
23525
23526 // If the integer bounds are exactly representable as floats, emit a
23527 // min+max+fptoi sequence. Otherwise use comparisons and selects.
23528 if (AreExactFloatBounds) {
23529 if (DstVT != TmpVT) {
23530 // Clamp by MinFloat from below. If Src is NaN, propagate NaN.
23531 SDValue MinClamped = DAG.getNode(
23532 X86ISD::FMAX, dl, SrcVT, MinFloatNode, Src);
23533 // Clamp by MaxFloat from above. If Src is NaN, propagate NaN.
23534 SDValue BothClamped = DAG.getNode(
23535 X86ISD::FMIN, dl, SrcVT, MaxFloatNode, MinClamped);
23536 // Convert clamped value to integer.
23537 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, BothClamped);
23538
23539 // NaN will become INDVAL, with the top bit set and the rest zero.
23540 // Truncation will discard the top bit, resulting in zero.
23541 return DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
23542 }
23543
23544 // Clamp by MinFloat from below. If Src is NaN, the result is MinFloat.
23545 SDValue MinClamped = DAG.getNode(
23546 X86ISD::FMAX, dl, SrcVT, Src, MinFloatNode);
23547 // Clamp by MaxFloat from above. NaN cannot occur.
23548 SDValue BothClamped = DAG.getNode(
23549 X86ISD::FMINC, dl, SrcVT, MinClamped, MaxFloatNode);
23550 // Convert clamped value to integer.
23551 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, DstVT, BothClamped);
23552
23553 if (!IsSigned) {
23554 // In the unsigned case we're done, because we mapped NaN to MinFloat,
23555 // which is zero.
23556 return FpToInt;
23557 }
23558
23559 // Otherwise, select zero if Src is NaN.
23560 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
23561 return DAG.getSelectCC(
23562 dl, Src, Src, ZeroInt, FpToInt, ISD::CondCode::SETUO);
23563 }
23564
23565 SDValue MinIntNode = DAG.getConstant(MinInt, dl, DstVT);
23566 SDValue MaxIntNode = DAG.getConstant(MaxInt, dl, DstVT);
23567
23568 // Result of direct conversion, which may be selected away.
23569 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, Src);
23570
23571 if (DstVT != TmpVT) {
23572 // NaN will become INDVAL, with the top bit set and the rest zero.
23573 // Truncation will discard the top bit, resulting in zero.
23574 FpToInt = DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
23575 }
23576
23577 SDValue Select = FpToInt;
23578 // For signed conversions where we saturate to the same size as the
23579 // result type of the fptoi instructions, INDVAL coincides with integer
23580 // minimum, so we don't need to explicitly check it.
23581 if (!IsSigned || SatWidth != TmpVT.getScalarSizeInBits()) {
23582 // If Src ULT MinFloat, select MinInt. In particular, this also selects
23583 // MinInt if Src is NaN.
23584 Select = DAG.getSelectCC(
23585 dl, Src, MinFloatNode, MinIntNode, Select, ISD::CondCode::SETULT);
23586 }
23587
23588 // If Src OGT MaxFloat, select MaxInt.
23589 Select = DAG.getSelectCC(
23590 dl, Src, MaxFloatNode, MaxIntNode, Select, ISD::CondCode::SETOGT);
23591
23592 // In the unsigned case we are done, because we mapped NaN to MinInt, which
23593 // is already zero. The promoted case was already handled above.
23594 if (!IsSigned || DstVT != TmpVT) {
23595 return Select;
23596 }
23597
23598 // Otherwise, select 0 if Src is NaN.
23599 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
23600 return DAG.getSelectCC(
23601 dl, Src, Src, ZeroInt, Select, ISD::CondCode::SETUO);
23602}
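// A minimal, self-contained scalar model of the "exact bounds" path above for
// a signed i32 saturation from double (the helper name is hypothetical and not
// part of this file): clamp into [MinFloat, MaxFloat], convert natively, and
// map NaN to zero, mirroring the final SETUO select.
static inline int FpToSIntSat32Model(double Src) {
  const double MinFloat = -2147483648.0; // -2^31 is exactly representable
  const double MaxFloat = 2147483647.0;  //  2^31-1 is exact in double
  if (Src != Src)                        // NaN compares unordered with itself
    return 0;
  double Clamped = Src < MinFloat ? MinFloat : Src;
  Clamped = Clamped > MaxFloat ? MaxFloat : Clamped;
  return (int)Clamped;                   // in range, so the cast is well-defined
}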
23603
23604SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
23605 bool IsStrict = Op->isStrictFPOpcode();
23606
23607 SDLoc DL(Op);
23608 MVT VT = Op.getSimpleValueType();
23609 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
23610 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
23611 MVT SVT = In.getSimpleValueType();
23612
23613 // Let f16->f80 get lowered to a libcall, except for darwin, where we should
23614 // lower it to an fp_extend via f32 (as only f16<>f32 libcalls are available)
23615 if (VT == MVT::f128 || (SVT == MVT::f16 && VT == MVT::f80 &&
23616 !Subtarget.getTargetTriple().isOSDarwin()))
23617 return SDValue();
23618
23619 if (SVT == MVT::f16) {
23620 if (Subtarget.hasFP16())
23621 return Op;
23622
23623 if (VT != MVT::f32) {
23624 if (IsStrict)
23625 return DAG.getNode(
23626 ISD::STRICT_FP_EXTEND, DL, {VT, MVT::Other},
23627 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, DL,
23628 {MVT::f32, MVT::Other}, {Chain, In})});
23629
23630 return DAG.getNode(ISD::FP_EXTEND, DL, VT,
23631 DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, In));
23632 }
23633
23634 if (!Subtarget.hasF16C()) {
23635 if (!Subtarget.getTargetTriple().isOSDarwin())
23636 return SDValue();
23637
23638 assert(VT == MVT::f32 && SVT == MVT::f16 && "unexpected extend libcall");
23639
23640 // Need a libcall, but ABI for f16 is soft-float on MacOS.
23641 TargetLowering::CallLoweringInfo CLI(DAG);
23642 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
23643
23644 In = DAG.getBitcast(MVT::i16, In);
23645 TargetLowering::ArgListTy Args;
23646 TargetLowering::ArgListEntry Entry;
23647 Entry.Node = In;
23648 Entry.Ty = EVT(MVT::i16).getTypeForEVT(*DAG.getContext());
23649 Entry.IsSExt = false;
23650 Entry.IsZExt = true;
23651 Args.push_back(Entry);
23652
23653 SDValue Callee = DAG.getExternalSymbol(
23654 getLibcallName(RTLIB::FPEXT_F16_F32),
23655 getPointerTy(DAG.getDataLayout()));
23656 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
23657 CallingConv::C, EVT(VT).getTypeForEVT(*DAG.getContext()), Callee,
23658 std::move(Args));
23659
23660 SDValue Res;
23661 std::tie(Res,Chain) = LowerCallTo(CLI);
23662 if (IsStrict)
23663 Res = DAG.getMergeValues({Res, Chain}, DL);
23664
23665 return Res;
23666 }
23667
23668 In = DAG.getBitcast(MVT::i16, In);
23669 In = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16,
23670 getZeroVector(MVT::v8i16, Subtarget, DAG, DL), In,
23671 DAG.getIntPtrConstant(0, DL));
23672 SDValue Res;
23673 if (IsStrict) {
23674 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, DL, {MVT::v4f32, MVT::Other},
23675 {Chain, In});
23676 Chain = Res.getValue(1);
23677 } else {
23678 Res = DAG.getNode(X86ISD::CVTPH2PS, DL, MVT::v4f32, In,
23679 DAG.getTargetConstant(4, DL, MVT::i32));
23680 }
23681 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Res,
23682 DAG.getIntPtrConstant(0, DL));
23683 if (IsStrict)
23684 return DAG.getMergeValues({Res, Chain}, DL);
23685 return Res;
23686 }
23687
23688 if (!SVT.isVector())
23689 return Op;
23690
23691 if (SVT.getVectorElementType() == MVT::f16) {
23692 assert(Subtarget.hasF16C() && "Unexpected features!");
23693 if (SVT == MVT::v2f16)
23694 In = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f16, In,
23695 DAG.getUNDEF(MVT::v2f16));
23696 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8f16, In,
23697 DAG.getUNDEF(MVT::v4f16));
23698 if (IsStrict)
23699 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
23700 {Op->getOperand(0), Res});
23701 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
23702 } else if (VT == MVT::v4f64 || VT == MVT::v8f64) {
23703 return Op;
23704 }
23705
23706 assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
23707
23708 SDValue Res =
23709 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, In, DAG.getUNDEF(SVT));
23710 if (IsStrict)
23711 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
23712 {Op->getOperand(0), Res});
23713 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
23714}
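// Intrinsic-level sketch of the F16C scalar f16->f32 path above (illustrative
// only, assuming <immintrin.h> semantics; not part of this file):
//   __m128i V = _mm_cvtsi32_si128(HalfBits); // half pattern in lane 0, rest 0
//   __m128  F = _mm_cvtph_ps(V);             // X86ISD::CVTPH2PS
//   float Res = _mm_cvtss_f32(F);            // EXTRACT_VECTOR_ELT lane 0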
23715
23716SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
23717 bool IsStrict = Op->isStrictFPOpcode();
23718
23719 SDLoc DL(Op);
23720 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
23721 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
23722 MVT VT = Op.getSimpleValueType();
23723 MVT SVT = In.getSimpleValueType();
23724
23725 if (SVT == MVT::f128 || (VT == MVT::f16 && SVT == MVT::f80))
23726 return SDValue();
23727
23728 if (VT == MVT::f16 && (SVT == MVT::f64 || SVT == MVT::f32) &&
23729 !Subtarget.hasFP16() && (SVT == MVT::f64 || !Subtarget.hasF16C())) {
23730 if (!Subtarget.getTargetTriple().isOSDarwin())
23731 return SDValue();
23732
23733 // We need a libcall but the ABI for f16 libcalls on MacOS is soft.
23734 TargetLowering::CallLoweringInfo CLI(DAG);
23735 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
23736
23737 TargetLowering::ArgListTy Args;
23738 TargetLowering::ArgListEntry Entry;
23739 Entry.Node = In;
23740 Entry.Ty = EVT(SVT).getTypeForEVT(*DAG.getContext());
23741 Entry.IsSExt = false;
23742 Entry.IsZExt = true;
23743 Args.push_back(Entry);
23744
23745 SDValue Callee = DAG.getExternalSymbol(
23746 getLibcallName(SVT == MVT::f64 ? RTLIB::FPROUND_F64_F16
23747 : RTLIB::FPROUND_F32_F16),
23748 getPointerTy(DAG.getDataLayout()));
23749 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
23750 CallingConv::C, EVT(MVT::i16).getTypeForEVT(*DAG.getContext()), Callee,
23751 std::move(Args));
23752
23753 SDValue Res;
23754 std::tie(Res, Chain) = LowerCallTo(CLI);
23755
23756 Res = DAG.getBitcast(MVT::f16, Res);
23757
23758 if (IsStrict)
23759 Res = DAG.getMergeValues({Res, Chain}, DL);
23760
23761 return Res;
23762 }
23763
23764 if (VT.getScalarType() == MVT::f16 && !Subtarget.hasFP16()) {
23765 if (!Subtarget.hasF16C() || SVT.getScalarType() != MVT::f32)
23766 return SDValue();
23767
23768 if (VT.isVector())
23769 return Op;
23770
23771 SDValue Res;
23772 SDValue Rnd = DAG.getTargetConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, DL,
23773 MVT::i32);
23774 if (IsStrict) {
23775 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4f32,
23776 DAG.getConstantFP(0, DL, MVT::v4f32), In,
23777 DAG.getIntPtrConstant(0, DL));
23778 Res = DAG.getNode(X86ISD::STRICT_CVTPS2PH, DL, {MVT::v8i16, MVT::Other},
23779 {Chain, Res, Rnd});
23780 Chain = Res.getValue(1);
23781 } else {
23782 // FIXME: Should we use zeros for upper elements for non-strict?
23783 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, In);
23784 Res = DAG.getNode(X86ISD::CVTPS2PH, DL, MVT::v8i16, Res, Rnd);
23785 }
23786
23787 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i16, Res,
23788 DAG.getIntPtrConstant(0, DL));
23789 Res = DAG.getBitcast(MVT::f16, Res);
23790
23791 if (IsStrict)
23792 return DAG.getMergeValues({Res, Chain}, DL);
23793
23794 return Res;
23795 }
23796
23797 return Op;
23798}
23799
23800static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) {
23801 bool IsStrict = Op->isStrictFPOpcode();
23802 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
23803 assert(Src.getValueType() == MVT::i16 && Op.getValueType() == MVT::f32 &&
23804 "Unexpected VT!");
23805
23806 SDLoc dl(Op);
23807 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16,
23808 DAG.getConstant(0, dl, MVT::v8i16), Src,
23809 DAG.getIntPtrConstant(0, dl));
23810
23811 SDValue Chain;
23812 if (IsStrict) {
23813 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {MVT::v4f32, MVT::Other},
23814 {Op.getOperand(0), Res});
23815 Chain = Res.getValue(1);
23816 } else {
23817 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
23818 }
23819
23820 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
23821 DAG.getIntPtrConstant(0, dl));
23822
23823 if (IsStrict)
23824 return DAG.getMergeValues({Res, Chain}, dl);
23825
23826 return Res;
23827}
23828
23829static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) {
23830 bool IsStrict = Op->isStrictFPOpcode();
23831 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
23832 assert(Src.getValueType() == MVT::f32 && Op.getValueType() == MVT::i16 &&
23833 "Unexpected VT!");
23834
23835 SDLoc dl(Op);
23836 SDValue Res, Chain;
23837 if (IsStrict) {
23838 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v4f32,
23839 DAG.getConstantFP(0, dl, MVT::v4f32), Src,
23840 DAG.getIntPtrConstant(0, dl));
23841 Res = DAG.getNode(
23842 X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
23843 {Op.getOperand(0), Res, DAG.getTargetConstant(4, dl, MVT::i32)});
23844 Chain = Res.getValue(1);
23845 } else {
23846 // FIXME: Should we use zeros for upper elements for non-strict?
23847 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, Src);
23848 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
23849 DAG.getTargetConstant(4, dl, MVT::i32));
23850 }
23851
23852 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Res,
23853 DAG.getIntPtrConstant(0, dl));
23854
23855 if (IsStrict)
23856 return DAG.getMergeValues({Res, Chain}, dl);
23857
23858 return Res;
23859}
23860
23861SDValue X86TargetLowering::LowerFP_TO_BF16(SDValue Op,
23862 SelectionDAG &DAG) const {
23863 SDLoc DL(Op);
23864 MakeLibCallOptions CallOptions;
23865 RTLIB::Libcall LC =
23866 RTLIB::getFPROUND(Op.getOperand(0).getValueType(), MVT::bf16);
23867 SDValue Res =
23868 makeLibCall(DAG, LC, MVT::f32, Op.getOperand(0), CallOptions, DL).first;
23869 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16,
23870 DAG.getBitcast(MVT::i32, Res));
23871}
23872
23873/// Depending on uarch and/or optimizing for size, we might prefer to use a
23874/// vector operation in place of the typical scalar operation.
23875static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG,
23876 const X86Subtarget &Subtarget) {
23877 // If both operands have other uses, this is probably not profitable.
23878 SDValue LHS = Op.getOperand(0);
23879 SDValue RHS = Op.getOperand(1);
23880 if (!LHS.hasOneUse() && !RHS.hasOneUse())
23881 return Op;
23882
23883 // FP horizontal add/sub were added with SSE3. Integer with SSSE3.
23884 bool IsFP = Op.getSimpleValueType().isFloatingPoint();
23885 if (IsFP && !Subtarget.hasSSE3())
23886 return Op;
23887 if (!IsFP && !Subtarget.hasSSSE3())
23888 return Op;
23889
23890 // Extract from a common vector.
23891 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
23892 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
23893 LHS.getOperand(0) != RHS.getOperand(0) ||
23894 !isa<ConstantSDNode>(LHS.getOperand(1)) ||
23895 !isa<ConstantSDNode>(RHS.getOperand(1)) ||
23896 !shouldUseHorizontalOp(true, DAG, Subtarget))
23897 return Op;
23898
23899 // Allow commuted 'hadd' ops.
23900 // TODO: Allow commuted (f)sub by negating the result of (F)HSUB?
23901 unsigned HOpcode;
23902 switch (Op.getOpcode()) {
23903 case ISD::ADD: HOpcode = X86ISD::HADD; break;
23904 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
23905 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
23906 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
23907 default:
23908 llvm_unreachable("Trying to lower unsupported opcode to horizontal op")::llvm::llvm_unreachable_internal("Trying to lower unsupported opcode to horizontal op"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 23908)
;
23909 }
23910 unsigned LExtIndex = LHS.getConstantOperandVal(1);
23911 unsigned RExtIndex = RHS.getConstantOperandVal(1);
23912 if ((LExtIndex & 1) == 1 && (RExtIndex & 1) == 0 &&
23913 (HOpcode == X86ISD::HADD || HOpcode == X86ISD::FHADD))
23914 std::swap(LExtIndex, RExtIndex);
23915
23916 if ((LExtIndex & 1) != 0 || RExtIndex != (LExtIndex + 1))
23917 return Op;
23918
23919 SDValue X = LHS.getOperand(0);
23920 EVT VecVT = X.getValueType();
23921 unsigned BitWidth = VecVT.getSizeInBits();
23922 unsigned NumLanes = BitWidth / 128;
23923 unsigned NumEltsPerLane = VecVT.getVectorNumElements() / NumLanes;
23924 assert((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) &&
23925 "Not expecting illegal vector widths here");
23926
23927 // Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit
23928 // equivalent, so extract the 256/512-bit source op to 128-bit if we can.
23929 SDLoc DL(Op);
23930 if (BitWidth == 256 || BitWidth == 512) {
23931 unsigned LaneIdx = LExtIndex / NumEltsPerLane;
23932 X = extract128BitVector(X, LaneIdx * NumEltsPerLane, DAG, DL);
23933 LExtIndex %= NumEltsPerLane;
23934 }
23935
23936 // add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0
23937 // add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0
23938 // add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1
23939 // sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0
23940 SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X);
23941 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp,
23942 DAG.getIntPtrConstant(LExtIndex / 2, DL));
23943}
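// Example of source that this lowering targets (illustrative, assuming SSE3
// and that shouldUseHorizontalOp() approves): the scalar add of lanes 0 and 1
// of one vector becomes a single horizontal add plus a lane-0 extract.
//   float sumLowPair(__m128 V) { return V[0] + V[1]; }  // haddps %xmm0,%xmm0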
23944
23945/// Depending on uarch and/or optimizing for size, we might prefer to use a
23946/// vector operation in place of the typical scalar operation.
23947SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const {
23948 assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) &&
23949 "Only expecting float/double");
23950 return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
23951}
23952
23953/// ISD::FROUND is defined to round to nearest with ties rounding away from 0.
23954/// This mode isn't supported in hardware on X86. But as long as we aren't
23955/// compiling with trapping math, we can emulate this with
23956/// trunc(X + copysign(nextafter(0.5, 0.0), X)).
23957static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) {
23958 SDValue N0 = Op.getOperand(0);
23959 SDLoc dl(Op);
23960 MVT VT = Op.getSimpleValueType();
23961
23962 // N0 += copysign(nextafter(0.5, 0.0), N0)
23963 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
23964 bool Ignored;
23965 APFloat Point5Pred = APFloat(0.5f);
23966 Point5Pred.convert(Sem, APFloat::rmNearestTiesToEven, &Ignored);
23967 Point5Pred.next(/*nextDown*/true);
23968
23969 SDValue Adder = DAG.getNode(ISD::FCOPYSIGN, dl, VT,
23970 DAG.getConstantFP(Point5Pred, dl, VT), N0);
23971 N0 = DAG.getNode(ISD::FADD, dl, VT, N0, Adder);
23972
23973 // Truncate the result to remove fraction.
23974 return DAG.getNode(ISD::FTRUNC, dl, VT, N0);
23975}
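// Worked f32 example of why nextafter(0.5, 0.0) is used rather than 0.5 itself
// (round-to-nearest-even, with E = 2^-25, so Point5Pred = 0.5 - E):
//   X = 0.5     : X + (0.5 - E) = 1 - E, which rounds up to 1.0 -> trunc = 1.
//   X = 0.5 - E : X + (0.5 - E) = 1 - 2E, exactly representable -> trunc = 0.
// Adding a plain 0.5 in the second case would give 1 - E, which also rounds up
// to 1.0 and would wrongly round a value below one half away from zero.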
23976
23977/// The only differences between FABS and FNEG are the mask and the logic op.
23978/// FNEG also has a folding opportunity for FNEG(FABS(x)).
23979static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
23980 assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
23981 "Wrong opcode for lowering FABS or FNEG.");
23982
23983 bool IsFABS = (Op.getOpcode() == ISD::FABS);
23984
23985 // If this is a FABS and it has an FNEG user, bail out to fold the combination
23986 // into an FNABS. We'll lower the FABS after that if it is still in use.
23987 if (IsFABS)
23988 for (SDNode *User : Op->uses())
23989 if (User->getOpcode() == ISD::FNEG)
23990 return Op;
23991
23992 SDLoc dl(Op);
23993 MVT VT = Op.getSimpleValueType();
23994
23995 bool IsF128 = (VT == MVT::f128);
23996 assert(VT.isFloatingPoint() && VT != MVT::f80 &&
23997 DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
23998 "Unexpected type in LowerFABSorFNEG");
23999
24000 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
24001 // decide if we should generate a 16-byte constant mask when we only need 4 or
24002 // 8 bytes for the scalar case.
24003
24004 // There are no scalar bitwise logical SSE/AVX instructions, so we
24005 // generate a 16-byte vector constant and logic op even for the scalar case.
24006 // Using a 16-byte mask allows folding the load of the mask with
24007 // the logic op, so it can save (~4 bytes) on code size.
24008 bool IsFakeVector = !VT.isVector() && !IsF128;
24009 MVT LogicVT = VT;
24010 if (IsFakeVector)
24011 LogicVT = (VT == MVT::f64) ? MVT::v2f64
24012 : (VT == MVT::f32) ? MVT::v4f32
24013 : MVT::v8f16;
24014
24015 unsigned EltBits = VT.getScalarSizeInBits();
24016 // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
24017 APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) :
24018 APInt::getSignMask(EltBits);
24019 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
24020 SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
24021
24022 SDValue Op0 = Op.getOperand(0);
24023 bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
24024 unsigned LogicOp = IsFABS ? X86ISD::FAND :
24025 IsFNABS ? X86ISD::FOR :
24026 X86ISD::FXOR;
24027 SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
24028
24029 if (VT.isVector() || IsF128)
24030 return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
24031
24032 // For the scalar case extend to a 128-bit vector, perform the logic op,
24033 // and extract the scalar result back out.
24034 Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
24035 SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
24036 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
24037 DAG.getIntPtrConstant(0, dl));
24038}
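// Bit-level scalar model of the three logic ops chosen above for f32 (an
// illustrative sketch with a hypothetical name; __builtin_memcpy avoids
// strict-aliasing issues):
static inline float SignLogicModelF32(float X, bool IsFABS, bool IsFNABS) {
  unsigned Bits;
  __builtin_memcpy(&Bits, &X, 4);
  if (IsFABS)
    Bits &= 0x7fffffffu;     // FAND with APInt::getSignedMaxValue mask
  else if (IsFNABS)
    Bits |= 0x80000000u;     // FOR with the sign mask (FNEG of FABS)
  else
    Bits ^= 0x80000000u;     // FXOR with APInt::getSignMask (plain FNEG)
  __builtin_memcpy(&X, &Bits, 4);
  return X;
}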
24039
24040static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
24041 SDValue Mag = Op.getOperand(0);
24042 SDValue Sign = Op.getOperand(1);
24043 SDLoc dl(Op);
24044
24045 // If the sign operand is smaller, extend it first.
24046 MVT VT = Op.getSimpleValueType();
24047 if (Sign.getSimpleValueType().bitsLT(VT))
24048 Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
24049
24050 // And if it is bigger, shrink it first.
24051 if (Sign.getSimpleValueType().bitsGT(VT))
24052 Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign,
24053 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
24054
24055 // At this point the operands and the result should have the same
24056 // type, and that won't be f80 since that is not custom lowered.
24057 bool IsF128 = (VT == MVT::f128);
24058 assert(VT.isFloatingPoint() && VT != MVT::f80 &&
24059 DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
24060 "Unexpected type in LowerFCOPYSIGN");
24061
24062 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
24063
24064 // Perform all scalar logic operations as 16-byte vectors because there are no
24065 // scalar FP logic instructions in SSE.
24066 // TODO: This isn't necessary. If we used scalar types, we might avoid some
24067 // unnecessary splats, but we might miss load folding opportunities. Should
24068 // this decision be based on OptimizeForSize?
24069 bool IsFakeVector = !VT.isVector() && !IsF128;
24070 MVT LogicVT = VT;
24071 if (IsFakeVector)
24072 LogicVT = (VT == MVT::f64) ? MVT::v2f64
24073 : (VT == MVT::f32) ? MVT::v4f32
24074 : MVT::v8f16;
24075
24076 // The mask constants are automatically splatted for vector types.
24077 unsigned EltSizeInBits = VT.getScalarSizeInBits();
24078 SDValue SignMask = DAG.getConstantFP(
24079 APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
24080 SDValue MagMask = DAG.getConstantFP(
24081 APFloat(Sem, APInt::getSignedMaxValue(EltSizeInBits)), dl, LogicVT);
24082
24083 // First, clear all bits but the sign bit from the second operand (sign).
24084 if (IsFakeVector)
24085 Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
24086 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
24087
24088 // Next, clear the sign bit from the first operand (magnitude).
24089 // TODO: If we had general constant folding for FP logic ops, this check
24090 // wouldn't be necessary.
24091 SDValue MagBits;
24092 if (ConstantFPSDNode *Op0CN = isConstOrConstSplatFP(Mag)) {
24093 APFloat APF = Op0CN->getValueAPF();
24094 APF.clearSign();
24095 MagBits = DAG.getConstantFP(APF, dl, LogicVT);
24096 } else {
24097 // If the magnitude operand wasn't a constant, we need to AND out the sign.
24098 if (IsFakeVector)
24099 Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
24100 MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
24101 }
24102
24103 // OR the magnitude value with the sign bit.
24104 SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
24105 return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
24106 DAG.getIntPtrConstant(0, dl));
24107}
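// The same idea at the bit level for scalar f32 copysign (illustrative sketch,
// hypothetical name):
static inline float CopySignModelF32(float Mag, float Sgn) {
  unsigned MagBits, SgnBits;
  __builtin_memcpy(&MagBits, &Mag, 4);
  __builtin_memcpy(&SgnBits, &Sgn, 4);
  unsigned Out = (MagBits & 0x7fffffffu)   // FAND with MagMask
               | (SgnBits & 0x80000000u);  // FAND with SignMask, then FOR
  __builtin_memcpy(&Mag, &Out, 4);
  return Mag;
}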
24108
24109static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
24110 SDValue N0 = Op.getOperand(0);
24111 SDLoc dl(Op);
24112 MVT VT = Op.getSimpleValueType();
24113
24114 MVT OpVT = N0.getSimpleValueType();
24115 assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
24116 "Unexpected type for FGETSIGN");
24117
24118 // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
24119 MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
24120 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
24121 Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
24122 Res = DAG.getZExtOrTrunc(Res, dl, VT);
24123 Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
24124 return Res;
24125}
24126
24127/// Helper for attempting to create a X86ISD::BT node.
24128static SDValue getBT(SDValue Src, SDValue BitNo, const SDLoc &DL, SelectionDAG &DAG) {
24129 // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
24130 // instruction. Since the shift amount is in-range-or-undefined, we know
24131 // that doing a bittest on the i32 value is ok. We extend to i32 because
24132 // the encoding for the i16 version is larger than the i32 version.
24133 // Also promote i16 to i32 for performance / code size reason.
24134 if (Src.getValueType().getScalarSizeInBits() < 32)
24135 Src = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Src);
24136
24137 // No legal type found, give up.
24138 if (!DAG.getTargetLoweringInfo().isTypeLegal(Src.getValueType()))
24139 return SDValue();
24140
24141 // See if we can use the 32-bit instruction instead of the 64-bit one for a
24142 // shorter encoding. Since the former takes the modulo 32 of BitNo and the
24143 // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
24144 // known to be zero.
24145 if (Src.getValueType() == MVT::i64 &&
24146 DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
24147 Src = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Src);
24148
24149 // If the operand types disagree, extend the shift amount to match. Since
24150 // BT ignores high bits (like shifts) we can use anyextend.
24151 if (Src.getValueType() != BitNo.getValueType()) {
24152 // Peek through a mask/modulo operation.
24153 // TODO: DAGCombine fails to do this as it just checks isTruncateFree, but
24154 // we probably need a better IsDesirableToPromoteOp to handle this as well.
24155 if (BitNo.getOpcode() == ISD::AND && BitNo->hasOneUse())
24156 BitNo = DAG.getNode(ISD::AND, DL, Src.getValueType(),
24157 DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(),
24158 BitNo.getOperand(0)),
24159 DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(),
24160 BitNo.getOperand(1)));
24161 else
24162 BitNo = DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(), BitNo);
24163 }
24164
24165 return DAG.getNode(X86ISD::BT, DL, MVT::i32, Src, BitNo);
24166}
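// Semantic model of the BT node produced above (illustrative): for the 32-bit
// form, CF is set to bit (BitNo mod 32) of Src.
static inline bool BTModel32(unsigned Src, unsigned BitNo) {
  return (Src >> (BitNo & 31u)) & 1u;  // matches BT's implicit modulo
}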
24167
24168/// Helper for creating a X86ISD::SETCC node.
24169static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
24170 SelectionDAG &DAG) {
24171 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
24172 DAG.getTargetConstant(Cond, dl, MVT::i8), EFLAGS);
24173}
24174
24175/// Recursive helper for combineVectorSizedSetCCEquality() to see if we have a
24176/// recognizable memcmp expansion.
24177static bool isOrXorXorTree(SDValue X, bool Root = true) {
24178 if (X.getOpcode() == ISD::OR)
24179 return isOrXorXorTree(X.getOperand(0), false) &&
24180 isOrXorXorTree(X.getOperand(1), false);
24181 if (Root)
24182 return false;
24183 return X.getOpcode() == ISD::XOR;
24184}
24185
24186/// Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp
24187/// expansion.
24188template <typename F>
24189static SDValue emitOrXorXorTree(SDValue X, const SDLoc &DL, SelectionDAG &DAG,
24190 EVT VecVT, EVT CmpVT, bool HasPT, F SToV) {
24191 SDValue Op0 = X.getOperand(0);
24192 SDValue Op1 = X.getOperand(1);
24193 if (X.getOpcode() == ISD::OR) {
24194 SDValue A = emitOrXorXorTree(Op0, DL, DAG, VecVT, CmpVT, HasPT, SToV);
24195 SDValue B = emitOrXorXorTree(Op1, DL, DAG, VecVT, CmpVT, HasPT, SToV);
24196 if (VecVT != CmpVT)
24197 return DAG.getNode(ISD::OR, DL, CmpVT, A, B);
24198 if (HasPT)
24199 return DAG.getNode(ISD::OR, DL, VecVT, A, B);
24200 return DAG.getNode(ISD::AND, DL, CmpVT, A, B);
24201 }
24202 if (X.getOpcode() == ISD::XOR) {
24203 SDValue A = SToV(Op0);
24204 SDValue B = SToV(Op1);
24205 if (VecVT != CmpVT)
24206 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETNE);
24207 if (HasPT)
24208 return DAG.getNode(ISD::XOR, DL, VecVT, A, B);
24209 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
24210 }
24211 llvm_unreachable("Impossible")::llvm::llvm_unreachable_internal("Impossible", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 24211)
;
24212}
24213
24214/// Try to map a 128-bit or larger integer comparison to vector instructions
24215/// before type legalization splits it up into chunks.
24216static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y,
24217 ISD::CondCode CC,
24218 const SDLoc &DL,
24219 SelectionDAG &DAG,
24220 const X86Subtarget &Subtarget) {
24221 assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
24222
24223 // We're looking for an oversized integer equality comparison.
24224 EVT OpVT = X.getValueType();
24225 unsigned OpSize = OpVT.getSizeInBits();
24226 if (!OpVT.isScalarInteger() || OpSize < 128)
24227 return SDValue();
24228
24229 // Ignore a comparison with zero because that gets special treatment in
24230 // EmitTest(). But make an exception for the special case of a pair of
24231 // logically-combined vector-sized operands compared to zero. This pattern may
24232 // be generated by the memcmp expansion pass with oversized integer compares
24233 // (see PR33325).
24234 bool IsOrXorXorTreeCCZero = isNullConstant(Y) && isOrXorXorTree(X);
24235 if (isNullConstant(Y) && !IsOrXorXorTreeCCZero)
24236 return SDValue();
24237
24238 // Don't perform this combine if constructing the vector will be expensive.
24239 auto IsVectorBitCastCheap = [](SDValue X) {
24240 X = peekThroughBitcasts(X);
24241 return isa<ConstantSDNode>(X) || X.getValueType().isVector() ||
24242 X.getOpcode() == ISD::LOAD;
24243 };
24244 if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) &&
24245 !IsOrXorXorTreeCCZero)
24246 return SDValue();
24247
24248 // Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands.
24249 // Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands.
24250 // Otherwise use PCMPEQ (plus AND) and mask testing.
24251 bool NoImplicitFloatOps =
24252 DAG.getMachineFunction().getFunction().hasFnAttribute(
24253 Attribute::NoImplicitFloat);
24254 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
24255 ((OpSize == 128 && Subtarget.hasSSE2()) ||
24256 (OpSize == 256 && Subtarget.hasAVX()) ||
24257 (OpSize == 512 && Subtarget.useAVX512Regs()))) {
24258 bool HasPT = Subtarget.hasSSE41();
24259
24260 // PTEST and MOVMSK are slow on Knights Landing and Knights Mill and widened
24261 // vector registers are essentially free. (Technically, widening registers
24262 // prevents load folding, but the tradeoff is worth it.)
24263 bool PreferKOT = Subtarget.preferMaskRegisters();
24264 bool NeedZExt = PreferKOT && !Subtarget.hasVLX() && OpSize != 512;
24265
24266 EVT VecVT = MVT::v16i8;
24267 EVT CmpVT = PreferKOT ? MVT::v16i1 : VecVT;
24268 if (OpSize == 256) {
24269 VecVT = MVT::v32i8;
24270 CmpVT = PreferKOT ? MVT::v32i1 : VecVT;
24271 }
24272 EVT CastVT = VecVT;
24273 bool NeedsAVX512FCast = false;
24274 if (OpSize == 512 || NeedZExt) {
24275 if (Subtarget.hasBWI()) {
24276 VecVT = MVT::v64i8;
24277 CmpVT = MVT::v64i1;
24278 if (OpSize == 512)
24279 CastVT = VecVT;
24280 } else {
24281 VecVT = MVT::v16i32;
24282 CmpVT = MVT::v16i1;
24283 CastVT = OpSize == 512 ? VecVT
24284 : OpSize == 256 ? MVT::v8i32
24285 : MVT::v4i32;
24286 NeedsAVX512FCast = true;
24287 }
24288 }
24289
24290 auto ScalarToVector = [&](SDValue X) -> SDValue {
24291 bool TmpZext = false;
24292 EVT TmpCastVT = CastVT;
24293 if (X.getOpcode() == ISD::ZERO_EXTEND) {
24294 SDValue OrigX = X.getOperand(0);
24295 unsigned OrigSize = OrigX.getScalarValueSizeInBits();
24296 if (OrigSize < OpSize) {
24297 if (OrigSize == 128) {
24298 TmpCastVT = NeedsAVX512FCast ? MVT::v4i32 : MVT::v16i8;
24299 X = OrigX;
24300 TmpZext = true;
24301 } else if (OrigSize == 256) {
24302 TmpCastVT = NeedsAVX512FCast ? MVT::v8i32 : MVT::v32i8;
24303 X = OrigX;
24304 TmpZext = true;
24305 }
24306 }
24307 }
24308 X = DAG.getBitcast(TmpCastVT, X);
24309 if (!NeedZExt && !TmpZext)
24310 return X;
24311 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT,
24312 DAG.getConstant(0, DL, VecVT), X,
24313 DAG.getVectorIdxConstant(0, DL));
24314 };
24315
24316 SDValue Cmp;
24317 if (IsOrXorXorTreeCCZero) {
24318 // This is a bitwise-combined equality comparison of 2 pairs of vectors:
24319 // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
24320 // Use 2 vector equality compares and 'and' the results before doing a
24321 // MOVMSK.
24322 Cmp = emitOrXorXorTree(X, DL, DAG, VecVT, CmpVT, HasPT, ScalarToVector);
24323 } else {
24324 SDValue VecX = ScalarToVector(X);
24325 SDValue VecY = ScalarToVector(Y);
24326 if (VecVT != CmpVT) {
24327 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETNE);
24328 } else if (HasPT) {
24329 Cmp = DAG.getNode(ISD::XOR, DL, VecVT, VecX, VecY);
24330 } else {
24331 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);
24332 }
24333 }
24334 // AVX512 should emit a setcc that will lower to kortest.
24335 if (VecVT != CmpVT) {
24336 EVT KRegVT = CmpVT == MVT::v64i1 ? MVT::i64
24337 : CmpVT == MVT::v32i1 ? MVT::i32
24338 : MVT::i16;
24339 return DAG.getSetCC(DL, VT, DAG.getBitcast(KRegVT, Cmp),
24340 DAG.getConstant(0, DL, KRegVT), CC);
24341 }
24342 if (HasPT) {
24343 SDValue BCCmp =
24344 DAG.getBitcast(OpSize == 256 ? MVT::v4i64 : MVT::v2i64, Cmp);
24345 SDValue PT = DAG.getNode(X86ISD::PTEST, DL, MVT::i32, BCCmp, BCCmp);
24346 X86::CondCode X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
24347 SDValue X86SetCC = getSETCC(X86CC, PT, DL, DAG);
24348 return DAG.getNode(ISD::TRUNCATE, DL, VT, X86SetCC.getValue(0));
24349 }
24350 // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
24351 // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
24352 // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
24353 assert(Cmp.getValueType() == MVT::v16i8 &&
24354 "Non 128-bit vector on pre-SSE41 target");
24355 SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
24356 SDValue FFFFs = DAG.getConstant(0xFFFF, DL, MVT::i32);
24357 return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
24358 }
24359
24360 return SDValue();
24361}
24362
24363/// Helper for matching BINOP(EXTRACTELT(X,0),BINOP(EXTRACTELT(X,1),...))
24364/// style scalarized (associative) reduction patterns. Partial reductions
24365/// are supported when the pointer SrcMask is non-null.
24366/// TODO - move this to SelectionDAG?
24367static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp,
24368 SmallVectorImpl<SDValue> &SrcOps,
24369 SmallVectorImpl<APInt> *SrcMask = nullptr) {
24370 SmallVector<SDValue, 8> Opnds;
24371 DenseMap<SDValue, APInt> SrcOpMap;
24372 EVT VT = MVT::Other;
24373
24374 // Recognize a special case where a vector is cast into a wide integer to
24375 // test all 0s.
24376 assert(Op.getOpcode() == unsigned(BinOp) &&
24377 "Unexpected bit reduction opcode");
24378 Opnds.push_back(Op.getOperand(0));
24379 Opnds.push_back(Op.getOperand(1));
24380
24381 for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
24382 SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
24383 // BFS traverse all BinOp operands.
24384 if (I->getOpcode() == unsigned(BinOp)) {
24385 Opnds.push_back(I->getOperand(0));
24386 Opnds.push_back(I->getOperand(1));
24387 // Re-evaluate the number of nodes to be traversed.
24388 e += 2; // 2 more nodes (LHS and RHS) are pushed.
24389 continue;
24390 }
24391
24392 // Quit if this is not an EXTRACT_VECTOR_ELT.
24393 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
24394 return false;
24395
24396 // Quit if the index is not a constant.
24397 auto *Idx = dyn_cast<ConstantSDNode>(I->getOperand(1));
24398 if (!Idx)
24399 return false;
24400
24401 SDValue Src = I->getOperand(0);
24402 DenseMap<SDValue, APInt>::iterator M = SrcOpMap.find(Src);
24403 if (M == SrcOpMap.end()) {
24404 VT = Src.getValueType();
24405 // Quit if not the same type.
24406 if (!SrcOpMap.empty() && VT != SrcOpMap.begin()->first.getValueType())
24407 return false;
24408 unsigned NumElts = VT.getVectorNumElements();
24409 APInt EltCount = APInt::getZero(NumElts);
24410 M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first;
24411 SrcOps.push_back(Src);
24412 }
24413
24414 // Quit if element already used.
24415 unsigned CIdx = Idx->getZExtValue();
24416 if (M->second[CIdx])
24417 return false;
24418 M->second.setBit(CIdx);
24419 }
24420
24421 if (SrcMask) {
24422 // Collect the source partial masks.
24423 for (SDValue &SrcOp : SrcOps)
24424 SrcMask->push_back(SrcOpMap[SrcOp]);
24425 } else {
24426 // Quit if not all elements are used.
24427 for (const auto &I : SrcOpMap)
24428 if (!I.second.isAllOnes())
24429 return false;
24430 }
24431
24432 return true;
24433}
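// Example of a reduction this matcher accepts (illustrative, DAG-level): with
// X of type v4i32, the scalarized any-of test
//   or (or (extractelt X, 0), (extractelt X, 1)),
//      (or (extractelt X, 2), (extractelt X, 3))
// fills SrcOps with {X}; since every lane is used exactly once, it also
// succeeds when SrcMask is null.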
24434
24435// Helper function for comparing all bits of two vectors.
24436static SDValue LowerVectorAllEqual(const SDLoc &DL, SDValue LHS, SDValue RHS,
24437 ISD::CondCode CC, const APInt &OriginalMask,
24438 const X86Subtarget &Subtarget,
24439 SelectionDAG &DAG, X86::CondCode &X86CC) {
24440 EVT VT = LHS.getValueType();
24441 unsigned ScalarSize = VT.getScalarSizeInBits();
24442 if (OriginalMask.getBitWidth() != ScalarSize) {
24443 assert(ScalarSize == 1 && "Element Mask vs Vector bitwidth mismatch");
24444 return SDValue();
24445 }
24446
24447 // Quit if not convertible to a legal scalar or 128/256-bit vector.
24448 if (!llvm::has_single_bit<uint32_t>(VT.getSizeInBits()))
24449 return SDValue();
24450
24451 // FCMP may use ISD::SETNE when nnan - early out if we manage to get here.
24452 if (VT.isFloatingPoint())
24453 return SDValue();
24454
24455 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
24456 X86CC = (CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE);
24457
24458 APInt Mask = OriginalMask;
24459
24460 auto MaskBits = [&](SDValue Src) {
24461 if (Mask.isAllOnes())
24462 return Src;
24463 EVT SrcVT = Src.getValueType();
24464 SDValue MaskValue = DAG.getConstant(Mask, DL, SrcVT);
24465 return DAG.getNode(ISD::AND, DL, SrcVT, Src, MaskValue);
24466 };
24467
24468 // For sub-128-bit vector, cast to (legal) integer and compare with zero.
24469 if (VT.getSizeInBits() < 128) {
24470 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
24471 if (!DAG.getTargetLoweringInfo().isTypeLegal(IntVT)) {
24472 if (IntVT != MVT::i64)
24473 return SDValue();
24474 auto SplitLHS = DAG.SplitScalar(DAG.getBitcast(IntVT, MaskBits(LHS)), DL,
24475 MVT::i32, MVT::i32);
24476 auto SplitRHS = DAG.SplitScalar(DAG.getBitcast(IntVT, MaskBits(RHS)), DL,
24477 MVT::i32, MVT::i32);
24478 SDValue Lo =
24479 DAG.getNode(ISD::XOR, DL, MVT::i32, SplitLHS.first, SplitRHS.first);
24480 SDValue Hi =
24481 DAG.getNode(ISD::XOR, DL, MVT::i32, SplitLHS.second, SplitRHS.second);
24482 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
24483 DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi),
24484 DAG.getConstant(0, DL, MVT::i32));
24485 }
24486 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
24487 DAG.getBitcast(IntVT, MaskBits(LHS)),
24488 DAG.getBitcast(IntVT, MaskBits(RHS)));
24489 }
24490
24491 // Without PTEST, a masked v2i64 or-reduction is not faster than
24492 // scalarization.
24493 bool UseKORTEST = Subtarget.useAVX512Regs();
24494 bool UsePTEST = Subtarget.hasSSE41();
24495 if (!UsePTEST && !Mask.isAllOnes() && ScalarSize > 32)
24496 return SDValue();
24497
24498 // Split down to 128/256/512-bit vector.
24499 unsigned TestSize = UseKORTEST ? 512 : (Subtarget.hasAVX() ? 256 : 128);
24500
24501 // If the input vector has vector elements wider than the target test size,
24502 // then cast to <X x i64> so it will safely split.
24503 if (ScalarSize > TestSize) {
24504 if (!Mask.isAllOnes())
24505 return SDValue();
24506 VT = EVT::getVectorVT(*DAG.getContext(), MVT::i64, VT.getSizeInBits() / 64);
24507 LHS = DAG.getBitcast(VT, LHS);
24508 RHS = DAG.getBitcast(VT, RHS);
24509 Mask = APInt::getAllOnes(64);
24510 }
24511
24512 if (VT.getSizeInBits() > TestSize) {
24513 KnownBits KnownRHS = DAG.computeKnownBits(RHS);
24514 if (KnownRHS.isConstant() && KnownRHS.getConstant() == Mask) {
24515 // If ICMP(AND(LHS,MASK),MASK) - reduce using AND splits.
24516 while (VT.getSizeInBits() > TestSize) {
24517 auto Split = DAG.SplitVector(LHS, DL);
24518 VT = Split.first.getValueType();
24519 LHS = DAG.getNode(ISD::AND, DL, VT, Split.first, Split.second);
24520 }
24521 RHS = DAG.getAllOnesConstant(DL, VT);
24522 } else if (!UsePTEST && !KnownRHS.isZero()) {
24523 // MOVMSK Special Case:
24524 // ALLOF(CMPEQ(X,Y)) -> AND(CMPEQ(X[0],Y[0]),CMPEQ(X[1],Y[1]),....)
24525 MVT SVT = ScalarSize >= 32 ? MVT::i32 : MVT::i8;
24526 VT = MVT::getVectorVT(SVT, VT.getSizeInBits() / SVT.getSizeInBits());
24527 LHS = DAG.getBitcast(VT, MaskBits(LHS));
24528 RHS = DAG.getBitcast(VT, MaskBits(RHS));
24529 EVT BoolVT = VT.changeVectorElementType(MVT::i1);
24530 SDValue V = DAG.getSetCC(DL, BoolVT, LHS, RHS, ISD::SETEQ);
24531 V = DAG.getSExtOrTrunc(V, DL, VT);
24532 while (VT.getSizeInBits() > TestSize) {
24533 auto Split = DAG.SplitVector(V, DL);
24534 VT = Split.first.getValueType();
24535 V = DAG.getNode(ISD::AND, DL, VT, Split.first, Split.second);
24536 }
24537 V = DAG.getNOT(DL, V, VT);
24538 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
24539 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
24540 DAG.getConstant(0, DL, MVT::i32));
24541 } else {
24542 // Convert to a ICMP_EQ(XOR(LHS,RHS),0) pattern.
24543 SDValue V = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
24544 while (VT.getSizeInBits() > TestSize) {
24545 auto Split = DAG.SplitVector(V, DL);
24546 VT = Split.first.getValueType();
24547 V = DAG.getNode(ISD::OR, DL, VT, Split.first, Split.second);
24548 }
24549 LHS = V;
24550 RHS = DAG.getConstant(0, DL, VT);
24551 }
24552 }
24553
24554 if (UseKORTEST && VT.is512BitVector()) {
24555 MVT TestVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
24556 MVT BoolVT = TestVT.changeVectorElementType(MVT::i1);
24557 LHS = DAG.getBitcast(TestVT, MaskBits(LHS));
24558 RHS = DAG.getBitcast(TestVT, MaskBits(RHS));
24559 SDValue V = DAG.getSetCC(DL, BoolVT, LHS, RHS, ISD::SETNE);
24560 return DAG.getNode(X86ISD::KORTEST, DL, MVT::i32, V, V);
24561 }
24562
24563 if (UsePTEST) {
24564 MVT TestVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
24565 LHS = DAG.getBitcast(TestVT, MaskBits(LHS));
24566 RHS = DAG.getBitcast(TestVT, MaskBits(RHS));
24567 SDValue V = DAG.getNode(ISD::XOR, DL, TestVT, LHS, RHS);
24568 return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, V, V);
24569 }
24570
24571 assert(VT.getSizeInBits() == 128 && "Failure to split to 128-bits");
24572 MVT MaskVT = ScalarSize >= 32 ? MVT::v4i32 : MVT::v16i8;
24573 LHS = DAG.getBitcast(MaskVT, MaskBits(LHS));
24574 RHS = DAG.getBitcast(MaskVT, MaskBits(RHS));
24575 SDValue V = DAG.getNode(X86ISD::PCMPEQ, DL, MaskVT, LHS, RHS);
24576 V = DAG.getNOT(DL, V, MaskVT);
24577 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
24578 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
24579 DAG.getConstant(0, DL, MVT::i32));
24580}
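// Intrinsic-level sketch of the final pre-SSE4.1 fallback above for two v16i8
// values (illustrative only, assuming <emmintrin.h> semantics):
//   __m128i Eq  = _mm_cmpeq_epi8(LHS, RHS);   // X86ISD::PCMPEQ
//   int     Msk = _mm_movemask_epi8(Eq);      // X86ISD::MOVMSK
//   bool AllEq  = (Msk == 0xFFFF);
// The DAG code instead NOTs the compare and tests MOVMSK against zero, which
// is equivalent and feeds the flags directly via X86ISD::CMP.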
24581
24582// Check whether an AND/OR'd reduction tree is PTEST-able, or if we can fall
24583// back to CMP(MOVMSK(PCMPEQB(X,Y))).
24584static SDValue MatchVectorAllEqualTest(SDValue LHS, SDValue RHS,
24585 ISD::CondCode CC, const SDLoc &DL,
24586 const X86Subtarget &Subtarget,
24587 SelectionDAG &DAG,
24588 X86::CondCode &X86CC) {
24589 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
24590
24591 bool CmpNull = isNullConstant(RHS);
24592 bool CmpAllOnes = isAllOnesConstant(RHS);
24593 if (!CmpNull && !CmpAllOnes)
24594 return SDValue();
24595
24596 SDValue Op = LHS;
24597 if (!Subtarget.hasSSE2() || !Op->hasOneUse())
24598 return SDValue();
24599
24600 // Check whether we're masking/truncating an OR-reduction result, in which
24601 // case track the masked bits.
24602 // TODO: Add CmpAllOnes support.
24603 APInt Mask = APInt::getAllOnes(Op.getScalarValueSizeInBits());
24604 if (CmpNull) {
24605 switch (Op.getOpcode()) {
24606 case ISD::TRUNCATE: {
24607 SDValue Src = Op.getOperand(0);
24608 Mask = APInt::getLowBitsSet(Src.getScalarValueSizeInBits(),
24609 Op.getScalarValueSizeInBits());
24610 Op = Src;
24611 break;
24612 }
24613 case ISD::AND: {
24614 if (auto *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
24615 Mask = Cst->getAPIntValue();
24616 Op = Op.getOperand(0);
24617 }
24618 break;
24619 }
24620 }
24621 }
24622
24623 ISD::NodeType LogicOp = CmpNull ? ISD::OR : ISD::AND;
24624
24625 // Match icmp(or(extract(X,0),extract(X,1)),0) anyof reduction patterns.
24626 // Match icmp(and(extract(X,0),extract(X,1)),-1) allof reduction patterns.
24627 SmallVector<SDValue, 8> VecIns;
24628 if (Op.getOpcode() == LogicOp && matchScalarReduction(Op, LogicOp, VecIns)) {
24629 EVT VT = VecIns[0].getValueType();
24630 assert(llvm::all_of(VecIns,
24631 [VT](SDValue V) { return VT == V.getValueType(); }) &&
24632 "Reduction source vector mismatch");
24633
24634 // Quit if not splittable to scalar/128/256/512-bit vector.
24635 if (!llvm::has_single_bit<uint32_t>(VT.getSizeInBits()))
24636 return SDValue();
24637
24638 // If more than one full vector is evaluated, AND/OR them first before
24639 // PTEST.
24640 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1;
24641 Slot += 2, e += 1) {
24642 // Each iteration will AND/OR 2 nodes and append the result until there is
24643 // only 1 node left, i.e. the final value of all vectors.
24644 SDValue LHS = VecIns[Slot];
24645 SDValue RHS = VecIns[Slot + 1];
24646 VecIns.push_back(DAG.getNode(LogicOp, DL, VT, LHS, RHS));
24647 }
24648
24649 return LowerVectorAllEqual(DL, VecIns.back(),
24650 CmpNull ? DAG.getConstant(0, DL, VT)
24651 : DAG.getAllOnesConstant(DL, VT),
24652 CC, Mask, Subtarget, DAG, X86CC);
24653 }
24654
24655 // Match icmp(reduce_or(X),0) anyof reduction patterns.
24656 // Match icmp(reduce_and(X),-1) allof reduction patterns.
24657 if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
24658 ISD::NodeType BinOp;
24659 if (SDValue Match =
24660 DAG.matchBinOpReduction(Op.getNode(), BinOp, {LogicOp})) {
24661 EVT MatchVT = Match.getValueType();
24662 return LowerVectorAllEqual(DL, Match,
24663 CmpNull ? DAG.getConstant(0, DL, MatchVT)
24664 : DAG.getAllOnesConstant(DL, MatchVT),
24665 CC, Mask, Subtarget, DAG, X86CC);
24666 }
24667 }
24668
24669 if (Mask.isAllOnes()) {
24670 assert(!Op.getValueType().isVector() &&
24671 "Illegal vector type for reduction pattern");
24672 SDValue Src = peekThroughBitcasts(Op);
24673 if (Src.getValueType().isFixedLengthVector() &&
24674 Src.getValueType().getScalarType() == MVT::i1) {
24675 // Match icmp(bitcast(icmp_ne(X,Y)),0) reduction patterns.
24676 // Match icmp(bitcast(icmp_eq(X,Y)),-1) reduction patterns.
24677 if (Src.getOpcode() == ISD::SETCC) {
24678 SDValue LHS = Src.getOperand(0);
24679 SDValue RHS = Src.getOperand(1);
24680 EVT LHSVT = LHS.getValueType();
24681 ISD::CondCode SrcCC = cast<CondCodeSDNode>(Src.getOperand(2))->get();
24682 if (SrcCC == (CmpNull ? ISD::SETNE : ISD::SETEQ) &&
24683 llvm::has_single_bit<uint32_t>(LHSVT.getSizeInBits())) {
24684 APInt SrcMask = APInt::getAllOnes(LHSVT.getScalarSizeInBits());
24685 return LowerVectorAllEqual(DL, LHS, RHS, CC, SrcMask, Subtarget, DAG,
24686 X86CC);
24687 }
24688 }
24689 // Match icmp(bitcast(vXi1 trunc(Y)),0) reduction patterns.
24690 // Match icmp(bitcast(vXi1 trunc(Y)),-1) reduction patterns.
24691 // Peek through truncation, mask the LSB and compare against zero/LSB.
24692 if (Src.getOpcode() == ISD::TRUNCATE) {
24693 SDValue Inner = Src.getOperand(0);
24694 EVT InnerVT = Inner.getValueType();
24695 if (llvm::has_single_bit<uint32_t>(InnerVT.getSizeInBits())) {
24696 unsigned BW = InnerVT.getScalarSizeInBits();
24697 APInt SrcMask = APInt(BW, 1);
24698 APInt Cmp = CmpNull ? APInt::getZero(BW) : SrcMask;
24699 return LowerVectorAllEqual(DL, Inner,
24700 DAG.getConstant(Cmp, DL, InnerVT), CC,
24701 SrcMask, Subtarget, DAG, X86CC);
24702 }
24703 }
24704 }
24705 }
24706
24707 return SDValue();
24708}
24709
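The reduction matching above rests on a simple scalar identity: an OR-reduction is zero only when every element is zero (the "anyof" compare against 0), and an AND-reduction is all-ones only when every element is all-ones (the "allof" compare against -1). A minimal standalone sketch of that equivalence in plain C++ (illustration only, not LLVM DAG code):

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <numeric>
#include <vector>

int main() {
  std::vector<uint32_t> X = {0, 0, 0, 0};
  // "anyof" form: icmp eq (or-reduce X), 0  <=>  all elements are zero.
  uint32_t OrRed = std::accumulate(X.begin(), X.end(), uint32_t{0},
                                   [](uint32_t A, uint32_t B) { return A | B; });
  assert((OrRed == 0) ==
         std::all_of(X.begin(), X.end(), [](uint32_t V) { return V == 0; }));

  std::vector<uint32_t> Y = {~0u, ~0u, ~0u, ~0u};
  // "allof" form: icmp eq (and-reduce Y), -1  <=>  all elements are all-ones.
  uint32_t AndRed = std::accumulate(Y.begin(), Y.end(), ~uint32_t{0},
                                    [](uint32_t A, uint32_t B) { return A & B; });
  assert((AndRed == ~0u) ==
         std::all_of(Y.begin(), Y.end(), [](uint32_t V) { return V == ~0u; }));
  return 0;
}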
24710/// Return true if \c Op has a use that doesn't just read flags.
24711static bool hasNonFlagsUse(SDValue Op) {
24712 for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
24713 ++UI) {
24714 SDNode *User = *UI;
24715 unsigned UOpNo = UI.getOperandNo();
24716 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
24717 // Look past the truncate.
24718 UOpNo = User->use_begin().getOperandNo();
24719 User = *User->use_begin();
24720 }
24721
24722 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
24723 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
24724 return true;
24725 }
24726 return false;
24727}
24728
24729// Transform to an x86-specific ALU node with flags if there is a chance of
24730// using an RMW op or only the flags are used. Otherwise, leave
24731// the node alone and emit a 'cmp' or 'test' instruction.
24732static bool isProfitableToUseFlagOp(SDValue Op) {
24733 for (SDNode *U : Op->uses())
24734 if (U->getOpcode() != ISD::CopyToReg &&
24735 U->getOpcode() != ISD::SETCC &&
24736 U->getOpcode() != ISD::STORE)
24737 return false;
24738
24739 return true;
24740}
24741
24742/// Emit nodes that will be selected as "test Op0,Op0", or something
24743/// equivalent.
24744static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
24745 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
24746 // CF and OF aren't always set the way we want. Determine which
24747 // of these we need.
24748 bool NeedCF = false;
24749 bool NeedOF = false;
24750 switch (X86CC) {
24751 default: break;
24752 case X86::COND_A: case X86::COND_AE:
24753 case X86::COND_B: case X86::COND_BE:
24754 NeedCF = true;
24755 break;
24756 case X86::COND_G: case X86::COND_GE:
24757 case X86::COND_L: case X86::COND_LE:
24758 case X86::COND_O: case X86::COND_NO: {
24759 // Check if we really need to set the
24760 // Overflow flag. If NoSignedWrap is present
24761 // that is not actually needed.
24762 switch (Op->getOpcode()) {
24763 case ISD::ADD:
24764 case ISD::SUB:
24765 case ISD::MUL:
24766 case ISD::SHL:
24767 if (Op.getNode()->getFlags().hasNoSignedWrap())
24768 break;
24769 [[fallthrough]];
24770 default:
24771 NeedOF = true;
24772 break;
24773 }
24774 break;
24775 }
24776 }
24777 // See if we can use the EFLAGS value from the operand instead of
24778 // doing a separate TEST. TEST always sets OF and CF to 0, so unless
24779 // we prove that the arithmetic won't overflow, we can't use OF or CF.
24780 if (Op.getResNo() != 0 || NeedOF || NeedCF) {
24781 // Emit a CMP with 0, which is the TEST pattern.
24782 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
24783 DAG.getConstant(0, dl, Op.getValueType()));
24784 }
24785 unsigned Opcode = 0;
24786 unsigned NumOperands = 0;
24787
24788 SDValue ArithOp = Op;
24789
24790 // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
24791 // which may be the result of a CAST. We use the variable 'Op', which is the
24792 // non-casted variable when we check for possible users.
24793 switch (ArithOp.getOpcode()) {
24794 case ISD::AND:
24795 // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
24796 // because a TEST instruction will be better.
24797 if (!hasNonFlagsUse(Op))
24798 break;
24799
24800 [[fallthrough]];
24801 case ISD::ADD:
24802 case ISD::SUB:
24803 case ISD::OR:
24804 case ISD::XOR:
24805 if (!isProfitableToUseFlagOp(Op))
24806 break;
24807
24808 // Otherwise use a regular EFLAGS-setting instruction.
24809 switch (ArithOp.getOpcode()) {
24810 default: llvm_unreachable("unexpected operator!");
24811 case ISD::ADD: Opcode = X86ISD::ADD; break;
24812 case ISD::SUB: Opcode = X86ISD::SUB; break;
24813 case ISD::XOR: Opcode = X86ISD::XOR; break;
24814 case ISD::AND: Opcode = X86ISD::AND; break;
24815 case ISD::OR: Opcode = X86ISD::OR; break;
24816 }
24817
24818 NumOperands = 2;
24819 break;
24820 case X86ISD::ADD:
24821 case X86ISD::SUB:
24822 case X86ISD::OR:
24823 case X86ISD::XOR:
24824 case X86ISD::AND:
24825 return SDValue(Op.getNode(), 1);
24826 case ISD::SSUBO:
24827 case ISD::USUBO: {
24828 // USUBO/SSUBO will become an X86ISD::SUB and we can use its Z flag.
24829 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
24830 return DAG.getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0),
24831 Op->getOperand(1)).getValue(1);
24832 }
24833 default:
24834 break;
24835 }
24836
24837 if (Opcode == 0) {
24838 // Emit a CMP with 0, which is the TEST pattern.
24839 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
24840 DAG.getConstant(0, dl, Op.getValueType()));
24841 }
24842 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
24843 SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
24844
24845 SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
24846 DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New);
24847 return SDValue(New.getNode(), 1);
24848}
24849
24850/// Emit nodes that will be selected as "cmp Op0,Op1", or something
24851/// equivalent.
24852static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
24853 const SDLoc &dl, SelectionDAG &DAG,
24854 const X86Subtarget &Subtarget) {
24855 if (isNullConstant(Op1))
24856 return EmitTest(Op0, X86CC, dl, DAG, Subtarget);
24857
24858 EVT CmpVT = Op0.getValueType();
24859
24860 assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 ||
24861 CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!");
24862
24863 // Only promote the compare up to I32 if it is a 16 bit operation
24864 // with an immediate. 16 bit immediates are to be avoided.
24865 if (CmpVT == MVT::i16 && !Subtarget.isAtom() &&
24866 !DAG.getMachineFunction().getFunction().hasMinSize()) {
24867 ConstantSDNode *COp0 = dyn_cast<ConstantSDNode>(Op0);
24868 ConstantSDNode *COp1 = dyn_cast<ConstantSDNode>(Op1);
24869 // Don't do this if the immediate can fit in 8-bits.
24870 if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) ||
24871 (COp1 && !COp1->getAPIntValue().isSignedIntN(8))) {
24872 unsigned ExtendOp =
24873 isX86CCSigned(X86CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
24874 if (X86CC == X86::COND_E || X86CC == X86::COND_NE) {
24875 // For equality comparisons try to use SIGN_EXTEND if the input was
24876 // truncate from something with enough sign bits.
24877 if (Op0.getOpcode() == ISD::TRUNCATE) {
24878 if (DAG.ComputeMaxSignificantBits(Op0.getOperand(0)) <= 16)
24879 ExtendOp = ISD::SIGN_EXTEND;
24880 } else if (Op1.getOpcode() == ISD::TRUNCATE) {
24881 if (DAG.ComputeMaxSignificantBits(Op1.getOperand(0)) <= 16)
24882 ExtendOp = ISD::SIGN_EXTEND;
24883 }
24884 }
24885
24886 CmpVT = MVT::i32;
24887 Op0 = DAG.getNode(ExtendOp, dl, CmpVT, Op0);
24888 Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1);
24889 }
24890 }
24891
24892 // Try to shrink i64 compares if the input has enough zero bits.
24893 // FIXME: Do this for non-constant compares for constant on LHS?
24894 if (CmpVT == MVT::i64 && isa<ConstantSDNode>(Op1) && !isX86CCSigned(X86CC) &&
24895 Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
24896 cast<ConstantSDNode>(Op1)->getAPIntValue().getActiveBits() <= 32 &&
24897 DAG.MaskedValueIsZero(Op0, APInt::getHighBitsSet(64, 32))) {
24898 CmpVT = MVT::i32;
24899 Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
24900 Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
24901 }
24902
24903 // 0-x == y --> x+y == 0
24904 // 0-x != y --> x+y != 0
24905 if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op0.getOperand(0)) &&
24906 Op0.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
24907 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
24908 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(1), Op1);
24909 return Add.getValue(1);
24910 }
24911
24912 // x == 0-y --> x+y == 0
24913 // x != 0-y --> x+y != 0
24914 if (Op1.getOpcode() == ISD::SUB && isNullConstant(Op1.getOperand(0)) &&
24915 Op1.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
24916 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
24917 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0, Op1.getOperand(1));
24918 return Add.getValue(1);
24919 }
24920
24921 // Use SUB instead of CMP to enable CSE between SUB and CMP.
24922 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
24923 SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
24924 return Sub.getValue(1);
24925}
24926
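The two negation folds near the end of EmitCmp rely on two's-complement wraparound: in N-bit modular arithmetic, 0 - x equals y exactly when x + y wraps to zero, so the equality test can be rewritten onto the flags of an ADD. A small standalone check of the identity (unsigned types so the wraparound is well defined; illustration only):

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t x = 0; x < 1000; ++x)
    for (uint32_t y = 0; y < 1000; ++y)
      // (0 - x) == y  <=>  (x + y) == 0   (mod 2^32)
      assert(((0u - x) == y) == (x + y == 0u));
  return 0;
}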
24927/// Check if replacement of SQRT with RSQRT should be disabled.
24928bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
24929 EVT VT = Op.getValueType();
24930
24931 // We don't need to replace SQRT with RSQRT for half type.
24932 if (VT.getScalarType() == MVT::f16)
24933 return true;
24934
24935 // We never want to use both SQRT and RSQRT instructions for the same input.
24936 if (DAG.doesNodeExist(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
24937 return false;
24938
24939 if (VT.isVector())
24940 return Subtarget.hasFastVectorFSQRT();
24941 return Subtarget.hasFastScalarFSQRT();
24942}
24943
24944/// The minimum architected relative accuracy is 2^-12. We need one
24945/// Newton-Raphson step to have a good float result (24 bits of precision).
24946SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
24947 SelectionDAG &DAG, int Enabled,
24948 int &RefinementSteps,
24949 bool &UseOneConstNR,
24950 bool Reciprocal) const {
24951 SDLoc DL(Op);
24952 EVT VT = Op.getValueType();
24953
24954 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
24955 // It is likely not profitable to do this for f64 because a double-precision
24956 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
24957 // instructions: convert to single, rsqrtss, convert back to double, refine
24958 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
24959 // along with FMA, this could be a throughput win.
24960 // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
24961 // after legalize types.
24962 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
24963 (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
24964 (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
24965 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
24966 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
24967 if (RefinementSteps == ReciprocalEstimate::Unspecified)
24968 RefinementSteps = 1;
24969
24970 UseOneConstNR = false;
24971 // There is no FSQRT for 512-bits, but there is RSQRT14.
24972 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT;
24973 SDValue Estimate = DAG.getNode(Opcode, DL, VT, Op);
24974 if (RefinementSteps == 0 && !Reciprocal)
24975 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Op, Estimate);
24976 return Estimate;
24977 }
24978
24979 if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
24980 Subtarget.hasFP16()) {
24981 assert(Reciprocal && "Don't replace SQRT with RSQRT for half type");
24982 if (RefinementSteps == ReciprocalEstimate::Unspecified)
24983 RefinementSteps = 0;
24984
24985 if (VT == MVT::f16) {
24986 SDValue Zero = DAG.getIntPtrConstant(0, DL);
24987 SDValue Undef = DAG.getUNDEF(MVT::v8f16);
24988 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
24989 Op = DAG.getNode(X86ISD::RSQRT14S, DL, MVT::v8f16, Undef, Op);
24990 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
24991 }
24992
24993 return DAG.getNode(X86ISD::RSQRT14, DL, VT, Op);
24994 }
24995 return SDValue();
24996}
24997
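The "one Newton-Raphson step" mentioned in the comment is the standard refinement for 1/sqrt(a): given an estimate e, the refined value is e * (1.5 - 0.5 * a * e * e), which roughly doubles the number of accurate bits and so lifts the ~2^-12 hardware estimate to full float precision. A hedged scalar sketch of that step (the seed value here is made up and merely stands in for an rsqrtss result):

#include <cmath>
#include <cstdio>

// One Newton-Raphson step for the reciprocal square root of 'a',
// starting from an approximate estimate 'e'.
static float RefineRsqrt(float a, float e) {
  return e * (1.5f - 0.5f * a * e * e);
}

int main() {
  float a = 2.0f;
  float e = 0.70f;                      // crude seed, stands in for rsqrtss
  float r = RefineRsqrt(a, e);          // one refinement step
  std::printf("exact=%.8f refined=%.8f\n", 1.0f / std::sqrt(a), r);
  return 0;
}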
24998/// The minimum architected relative accuracy is 2^-12. We need one
24999/// Newton-Raphson step to have a good float result (24 bits of precision).
25000SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
25001 int Enabled,
25002 int &RefinementSteps) const {
25003 SDLoc DL(Op);
25004 EVT VT = Op.getValueType();
25005
25006 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
25007 // It is likely not profitable to do this for f64 because a double-precision
25008 // reciprocal estimate with refinement on x86 prior to FMA requires
25009 // 15 instructions: convert to single, rcpss, convert back to double, refine
25010 // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
25011 // along with FMA, this could be a throughput win.
25012
25013 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
25014 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
25015 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
25016 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
25017 // Enable estimate codegen with 1 refinement step for vector division.
25018 // Scalar division estimates are disabled because they break too much
25019 // real-world code. These defaults are intended to match GCC behavior.
25020 if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
25021 return SDValue();
25022
25023 if (RefinementSteps == ReciprocalEstimate::Unspecified)
25024 RefinementSteps = 1;
25025
25026 // There is no FRCP for 512-bits, but there is RCP14.
25027 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP;
25028 return DAG.getNode(Opcode, DL, VT, Op);
25029 }
25030
25031 if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
25032 Subtarget.hasFP16()) {
25033 if (RefinementSteps == ReciprocalEstimate::Unspecified)
25034 RefinementSteps = 0;
25035
25036 if (VT == MVT::f16) {
25037 SDValue Zero = DAG.getIntPtrConstant(0, DL);
25038 SDValue Undef = DAG.getUNDEF(MVT::v8f16);
25039 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
25040 Op = DAG.getNode(X86ISD::RCP14S, DL, MVT::v8f16, Undef, Op);
25041 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
25042 }
25043
25044 return DAG.getNode(X86ISD::RCP14, DL, VT, Op);
25045 }
25046 return SDValue();
25047}
25048
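The reciprocal path uses the analogous Newton-Raphson step for 1/a: a refined estimate is e * (2 - a * e), again roughly doubling the accurate bits per step. A scalar illustration under that assumption (the seed is invented and stands in for an rcpss result):

#include <cstdio>

// One Newton-Raphson step for 1/a starting from estimate 'e'.
static float RefineRecip(float a, float e) {
  return e * (2.0f - a * e);
}

int main() {
  float a = 3.0f;
  float e = 0.33f;                      // crude seed, stands in for rcpss
  std::printf("exact=%.8f refined=%.8f\n", 1.0f / a, RefineRecip(a, e));
  return 0;
}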
25049/// If we have at least two divisions that use the same divisor, convert to
25050/// multiplication by a reciprocal. This may need to be adjusted for a given
25051/// CPU if a division's cost is not at least twice the cost of a multiplication.
25052/// This is because we still need one division to calculate the reciprocal and
25053/// then we need two multiplies by that reciprocal as replacements for the
25054/// original divisions.
25055unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
25056 return 2;
25057}
25058
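The returned threshold of 2 matches the rewrite described in the comment: once at least two divisions share a divisor, one division computes the reciprocal and each original division becomes a multiply. A trivial scalar form of the transform (only valid under reassociation/fast-math style assumptions, shown purely for illustration):

#include <cstdio>

int main() {
  double a = 10.0, b = 20.0, d = 4.0;
  // Before: two divisions sharing the divisor d.
  double r0 = a / d, r1 = b / d;
  // After: one division to form the reciprocal, then two multiplies.
  double recip = 1.0 / d;
  std::printf("%g %g vs %g %g\n", r0, r1, a * recip, b * recip);
  return 0;
}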
25059SDValue
25060X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
25061 SelectionDAG &DAG,
25062 SmallVectorImpl<SDNode *> &Created) const {
25063 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
25064 if (isIntDivCheap(N->getValueType(0), Attr))
25065 return SDValue(N,0); // Lower SDIV as SDIV
25066
25067 assert((Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()) &&
25068 "Unexpected divisor!");
25069
25070 // Only perform this transform if CMOV is supported otherwise the select
25071 // below will become a branch.
25072 if (!Subtarget.canUseCMOV())
25073 return SDValue();
25074
25075 // fold (sdiv X, pow2)
25076 EVT VT = N->getValueType(0);
25077 // FIXME: Support i8.
25078 if (VT != MVT::i16 && VT != MVT::i32 &&
25079 !(Subtarget.is64Bit() && VT == MVT::i64))
25080 return SDValue();
25081
25082 unsigned Lg2 = Divisor.countr_zero();
25083
25084 // If the divisor is 2 or -2, the default expansion is better.
25085 if (Lg2 == 1)
25086 return SDValue();
25087
25088 SDLoc DL(N);
25089 SDValue N0 = N->getOperand(0);
25090 SDValue Zero = DAG.getConstant(0, DL, VT);
25091 APInt Lg2Mask = APInt::getLowBitsSet(VT.getSizeInBits(), Lg2);
25092 SDValue Pow2MinusOne = DAG.getConstant(Lg2Mask, DL, VT);
25093
25094 // If N0 is negative, we need to add (Pow2 - 1) to it before shifting right.
25095 SDValue Cmp = DAG.getSetCC(DL, MVT::i8, N0, Zero, ISD::SETLT);
25096 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
25097 SDValue CMov = DAG.getNode(ISD::SELECT, DL, VT, Cmp, Add, N0);
25098
25099 Created.push_back(Cmp.getNode());
25100 Created.push_back(Add.getNode());
25101 Created.push_back(CMov.getNode());
25102
25103 // Divide by pow2.
25104 SDValue SRA =
25105 DAG.getNode(ISD::SRA, DL, VT, CMov, DAG.getConstant(Lg2, DL, MVT::i8));
25106
25107 // If we're dividing by a positive value, we're done. Otherwise, we must
25108 // negate the result.
25109 if (Divisor.isNonNegative())
25110 return SRA;
25111
25112 Created.push_back(SRA.getNode());
25113 return DAG.getNode(ISD::SUB, DL, VT, Zero, SRA);
25114}
25115
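The node sequence built above is the classic branch-free expansion of a signed divide by +/-2^k that rounds toward zero: if the dividend is negative, add 2^k - 1 before the arithmetic shift (the CMOV), then negate the quotient when the divisor is negative. A scalar sketch of the same arithmetic with made-up names, checked against the native operator (illustration only, not the DAG code):

#include <cassert>
#include <cstdint>

// Signed division by +/-2^k, rounding toward zero, without a divide.
static int32_t SDivPow2(int32_t X, unsigned Lg2, bool NegDivisor) {
  int32_t Biased = X < 0 ? X + ((int32_t{1} << Lg2) - 1) : X;  // the CMOV
  int32_t Q = Biased >> Lg2;                                   // the SRA
  return NegDivisor ? -Q : Q;                                  // the SUB 0, Q
}

int main() {
  for (int32_t X = -100; X <= 100; ++X) {
    assert(SDivPow2(X, 3, false) == X / 8);
    assert(SDivPow2(X, 3, true) == X / -8);
  }
  return 0;
}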
25116/// Result of 'and' is compared against zero. Change to a BT node if possible.
25117/// Returns the BT node and the condition code needed to use it.
25118static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl,
25119 SelectionDAG &DAG, X86::CondCode &X86CC) {
25120 assert(And.getOpcode() == ISD::AND && "Expected AND node!");
25121 SDValue Op0 = And.getOperand(0);
25122 SDValue Op1 = And.getOperand(1);
25123 if (Op0.getOpcode() == ISD::TRUNCATE)
25124 Op0 = Op0.getOperand(0);
25125 if (Op1.getOpcode() == ISD::TRUNCATE)
25126 Op1 = Op1.getOperand(0);
25127
25128 SDValue Src, BitNo;
25129 if (Op1.getOpcode() == ISD::SHL)
25130 std::swap(Op0, Op1);
25131 if (Op0.getOpcode() == ISD::SHL) {
25132 if (isOneConstant(Op0.getOperand(0))) {
25133 // If we looked past a truncate, check that it's only truncating away
25134 // known zeros.
25135 unsigned BitWidth = Op0.getValueSizeInBits();
25136 unsigned AndBitWidth = And.getValueSizeInBits();
25137 if (BitWidth > AndBitWidth) {
25138 KnownBits Known = DAG.computeKnownBits(Op0);
25139 if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
25140 return SDValue();
25141 }
25142 Src = Op1;
25143 BitNo = Op0.getOperand(1);
25144 }
25145 } else if (Op1.getOpcode() == ISD::Constant) {
25146 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
25147 uint64_t AndRHSVal = AndRHS->getZExtValue();
25148 SDValue AndLHS = Op0;
25149
25150 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
25151 Src = AndLHS.getOperand(0);
25152 BitNo = AndLHS.getOperand(1);
25153 } else {
25154 // Use BT if the immediate can't be encoded in a TEST instruction or we
25155 // are optimizing for size and the immediate won't fit in a byte.
25156 bool OptForSize = DAG.shouldOptForSize();
25157 if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
25158 isPowerOf2_64(AndRHSVal)) {
25159 Src = AndLHS;
25160 BitNo = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl,
25161 Src.getValueType());
25162 }
25163 }
25164 }
25165
25166 // No patterns found, give up.
25167 if (!Src.getNode())
25168 return SDValue();
25169
25170 // Remove any bit flip.
25171 if (isBitwiseNot(Src)) {
25172 Src = Src.getOperand(0);
25173 CC = CC == ISD::SETEQ ? ISD::SETNE : ISD::SETEQ;
25174 }
25175
25176 // Attempt to create the X86ISD::BT node.
25177 if (SDValue BT = getBT(Src, BitNo, dl, DAG)) {
25178 X86CC = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
25179 return BT;
25180 }
25181
25182 return SDValue();
25183}
25184
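Both shapes this routine looks for reduce to the same single-bit test, which is what BT followed by SETAE/SETB computes. A quick standalone check of that equivalence (unsigned arithmetic, hypothetical variable names):

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t X = 0; X < 4096; ++X)
    for (unsigned N = 0; N < 12; ++N) {
      bool AndForm = (X & (uint32_t{1} << N)) == 0;   // (X & (1 << N)) == 0
      bool ShrForm = ((X >> N) & 1u) == 0;            // ((X >> N) & 1) == 0
      bool BtForm  = !((X >> N) & 1u);                // bit N clear, as BT tests
      assert(AndForm == ShrForm && ShrForm == BtForm);
    }
  return 0;
}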
25185// Check if pre-AVX condcode can be performed by a single FCMP op.
25186static bool cheapX86FSETCC_SSE(ISD::CondCode SetCCOpcode) {
25187 return (SetCCOpcode != ISD::SETONE) && (SetCCOpcode != ISD::SETUEQ);
25188}
25189
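SETONE and SETUEQ are singled out because neither has a single pre-AVX compare immediate: 'one' is ordered-and-not-equal (ORD && NEQ) and 'ueq' is unordered-or-equal (UNORD || EQ), so the lowering later emits two CMPP results and combines them with FAND/FOR. A scalar illustration of those identities, including the NaN cases (sketch only; helper names are invented):

#include <cassert>
#include <cmath>
#include <limits>

static bool One(double A, double B) {  // SETONE: ordered and not equal
  return !std::isnan(A) && !std::isnan(B) && A != B;
}
static bool Ueq(double A, double B) {  // SETUEQ: unordered or equal
  return std::isnan(A) || std::isnan(B) || A == B;
}

int main() {
  double NaN = std::numeric_limits<double>::quiet_NaN();
  assert(One(1.0, 2.0) && !One(1.0, 1.0) && !One(NaN, 1.0));
  assert(Ueq(1.0, 1.0) && Ueq(NaN, 1.0) && !Ueq(1.0, 2.0));
  return 0;
}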
25190/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
25191/// CMPs.
25192static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
25193 SDValue &Op1, bool &IsAlwaysSignaling) {
25194 unsigned SSECC;
25195 bool Swap = false;
25196
25197 // SSE Condition code mapping:
25198 // 0 - EQ
25199 // 1 - LT
25200 // 2 - LE
25201 // 3 - UNORD
25202 // 4 - NEQ
25203 // 5 - NLT
25204 // 6 - NLE
25205 // 7 - ORD
25206 switch (SetCCOpcode) {
25207 default: llvm_unreachable("Unexpected SETCC condition");
25208 case ISD::SETOEQ:
25209 case ISD::SETEQ: SSECC = 0; break;
25210 case ISD::SETOGT:
25211 case ISD::SETGT: Swap = true; [[fallthrough]];
25212 case ISD::SETLT:
25213 case ISD::SETOLT: SSECC = 1; break;
25214 case ISD::SETOGE:
25215 case ISD::SETGE: Swap = true; [[fallthrough]];
25216 case ISD::SETLE:
25217 case ISD::SETOLE: SSECC = 2; break;
25218 case ISD::SETUO: SSECC = 3; break;
25219 case ISD::SETUNE:
25220 case ISD::SETNE: SSECC = 4; break;
25221 case ISD::SETULE: Swap = true; [[fallthrough]];
25222 case ISD::SETUGE: SSECC = 5; break;
25223 case ISD::SETULT: Swap = true; [[fallthrough]];
25224 case ISD::SETUGT: SSECC = 6; break;
25225 case ISD::SETO: SSECC = 7; break;
25226 case ISD::SETUEQ: SSECC = 8; break;
25227 case ISD::SETONE: SSECC = 12; break;
25228 }
25229 if (Swap)
25230 std::swap(Op0, Op1);
25231
25232 switch (SetCCOpcode) {
25233 default:
25234 IsAlwaysSignaling = true;
25235 break;
25236 case ISD::SETEQ:
25237 case ISD::SETOEQ:
25238 case ISD::SETUEQ:
25239 case ISD::SETNE:
25240 case ISD::SETONE:
25241 case ISD::SETUNE:
25242 case ISD::SETO:
25243 case ISD::SETUO:
25244 IsAlwaysSignaling = false;
25245 break;
25246 }
25247
25248 return SSECC;
25249}
25250
25251/// Break a 256-bit integer VSETCC into two new 128-bit ones and then
25252/// concatenate the result back.
25253static SDValue splitIntVSETCC(EVT VT, SDValue LHS, SDValue RHS,
25254 ISD::CondCode Cond, SelectionDAG &DAG,
25255 const SDLoc &dl) {
25256 assert(VT.isInteger() && VT == LHS.getValueType() &&
25257 VT == RHS.getValueType() && "Unsupported VTs!");
25258
25259 SDValue CC = DAG.getCondCode(Cond);
25260
25261 // Extract the LHS Lo/Hi vectors
25262 SDValue LHS1, LHS2;
25263 std::tie(LHS1, LHS2) = splitVector(LHS, DAG, dl);
25264
25265 // Extract the RHS Lo/Hi vectors
25266 SDValue RHS1, RHS2;
25267 std::tie(RHS1, RHS2) = splitVector(RHS, DAG, dl);
25268
25269 // Issue the operation on the smaller types and concatenate the result back
25270 EVT LoVT, HiVT;
25271 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
25272 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
25273 DAG.getNode(ISD::SETCC, dl, LoVT, LHS1, RHS1, CC),
25274 DAG.getNode(ISD::SETCC, dl, HiVT, LHS2, RHS2, CC));
25275}
25276
25277static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
25278
25279 SDValue Op0 = Op.getOperand(0);
25280 SDValue Op1 = Op.getOperand(1);
25281 SDValue CC = Op.getOperand(2);
25282 MVT VT = Op.getSimpleValueType();
25283 SDLoc dl(Op);
25284
25285 assert(VT.getVectorElementType() == MVT::i1 &&
25286 "Cannot set masked compare for this operation");
25287
25288 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
25289
25290 // Prefer SETGT over SETLT.
25291 if (SetCCOpcode == ISD::SETLT) {
25292 SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode);
25293 std::swap(Op0, Op1);
25294 }
25295
25296 return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
25297}
25298
25299/// Given a buildvector constant, return a new vector constant with each element
25300/// incremented or decremented. If incrementing or decrementing would result in
25301/// unsigned overflow or underflow or this is not a simple vector constant,
25302/// return an empty value.
25303static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc,
25304 bool NSW) {
25305 auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());
25306 if (!BV || !V.getValueType().isSimple())
25307 return SDValue();
25308
25309 MVT VT = V.getSimpleValueType();
25310 MVT EltVT = VT.getVectorElementType();
25311 unsigned NumElts = VT.getVectorNumElements();
25312 SmallVector<SDValue, 8> NewVecC;
25313 SDLoc DL(V);
25314 for (unsigned i = 0; i < NumElts; ++i) {
25315 auto *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
25316 if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT)
25317 return SDValue();
25318
25319 // Avoid overflow/underflow.
25320 const APInt &EltC = Elt->getAPIntValue();
25321 if ((IsInc && EltC.isMaxValue()) || (!IsInc && EltC.isZero()))
25322 return SDValue();
25323 if (NSW && ((IsInc && EltC.isMaxSignedValue()) ||
25324 (!IsInc && EltC.isMinSignedValue())))
25325 return SDValue();
25326
25327 NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 1 : -1), DL, EltVT));
25328 }
25329
25330 return DAG.getBuildVector(VT, DL, NewVecC);
25331}
25332
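This helper backs the condition-code rewrites used below: for unsigned integers, X > C is equivalent to X >= C+1 and X < C is equivalent to X <= C-1, as long as the increment or decrement does not wrap, which is exactly what the overflow checks above guard against. A standalone check of both identities (illustration only):

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t X = 0; X < 300; ++X)
    for (uint32_t C = 1; C < 254; ++C) {   // keep C+1 and C-1 in range
      assert((X > C) == (X >= C + 1));     // SETUGT -> SETUGE against C+1
      assert((X < C) == (X <= C - 1));     // SETULT -> SETULE against C-1
    }
  return 0;
}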
25333/// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
25334/// Op0 u<= Op1:
25335/// t = psubus Op0, Op1
25336/// pcmpeq t, <0..0>
25337static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
25338 ISD::CondCode Cond, const SDLoc &dl,
25339 const X86Subtarget &Subtarget,
25340 SelectionDAG &DAG) {
25341 if (!Subtarget.hasSSE2())
25342 return SDValue();
25343
25344 MVT VET = VT.getVectorElementType();
25345 if (VET != MVT::i8 && VET != MVT::i16)
25346 return SDValue();
25347
25348 switch (Cond) {
25349 default:
25350 return SDValue();
25351 case ISD::SETULT: {
25352 // If the comparison is against a constant we can turn this into a
25353 // setule. With psubus, setule does not require a swap. This is
25354 // beneficial because the constant in the register is no longer
25355 // clobbered as the destination, so it can be hoisted out of a loop.
25356 // Only do this pre-AVX since vpcmp* is no longer destructive.
25357 if (Subtarget.hasAVX())
25358 return SDValue();
25359 SDValue ULEOp1 =
25360 incDecVectorConstant(Op1, DAG, /*IsInc*/ false, /*NSW*/ false);
25361 if (!ULEOp1)
25362 return SDValue();
25363 Op1 = ULEOp1;
25364 break;
25365 }
25366 case ISD::SETUGT: {
25367 // If the comparison is against a constant, we can turn this into a setuge.
25368 // This is beneficial because materializing a constant 0 for the PCMPEQ is
25369 // probably cheaper than XOR+PCMPGT using 2 different vector constants:
25370 // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0
25371 SDValue UGEOp1 =
25372 incDecVectorConstant(Op1, DAG, /*IsInc*/ true, /*NSW*/ false);
25373 if (!UGEOp1)
25374 return SDValue();
25375 Op1 = Op0;
25376 Op0 = UGEOp1;
25377 break;
25378 }
25379 // Psubus is better than flip-sign because it requires no inversion.
25380 case ISD::SETUGE:
25381 std::swap(Op0, Op1);
25382 break;
25383 case ISD::SETULE:
25384 break;
25385 }
25386
25387 SDValue Result = DAG.getNode(ISD::USUBSAT, dl, VT, Op0, Op1);
25388 return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
25389 DAG.getConstant(0, dl, VT));
25390}
25391
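The PSUBUS/USUBSAT trick works because an unsigned saturating subtraction is zero exactly when the first operand is less than or equal to the second, which the trailing PCMPEQ against zero then tests. A scalar sketch of that identity over all i8 pairs (illustrative, not the vector lowering itself):

#include <cassert>
#include <cstdint>

// Unsigned saturating subtraction, the scalar analogue of PSUBUSB/PSUBUSW.
static uint8_t Usubsat(uint8_t A, uint8_t B) { return A > B ? A - B : 0; }

int main() {
  for (unsigned A = 0; A < 256; ++A)
    for (unsigned B = 0; B < 256; ++B)
      // Op0 u<= Op1  <=>  usubsat(Op0, Op1) == 0
      assert((A <= B) == (Usubsat(uint8_t(A), uint8_t(B)) == 0));
  return 0;
}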
25392static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
25393 SelectionDAG &DAG) {
25394 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
25395 Op.getOpcode() == ISD::STRICT_FSETCCS;
25396 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
25397 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
25398 SDValue CC = Op.getOperand(IsStrict ? 3 : 2);
25399 MVT VT = Op->getSimpleValueType(0);
25400 ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
25401 bool isFP = Op1.getSimpleValueType().isFloatingPoint();
25402 SDLoc dl(Op);
25403
25404 if (isFP) {
25405 MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
25406 assert(EltVT == MVT::f16 || EltVT == MVT::f32 || EltVT == MVT::f64);
25407 if (isSoftFP16(EltVT, Subtarget))
25408 return SDValue();
25409
25410 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
25411 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
25412
25413 // If we have a strict compare with a vXi1 result and the input is 128/256
25414 // bits we can't use a masked compare unless we have VLX. If we use a wider
25415 // compare like we do for non-strict, we might trigger spurious exceptions
25416 // from the upper elements. Instead emit an AVX compare and convert to mask.
25417 unsigned Opc;
25418 if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1 &&
25419 (!IsStrict || Subtarget.hasVLX() ||
25420 Op0.getSimpleValueType().is512BitVector())) {
25421#ifndef NDEBUG
25422 unsigned Num = VT.getVectorNumElements();
25423 assert(Num <= 16 || (Num == 32 && EltVT == MVT::f16));
25424#endif
25425 Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM;
25426 } else {
25427 Opc = IsStrict ? X86ISD::STRICT_CMPP : X86ISD::CMPP;
25428 // The SSE/AVX packed FP comparison nodes are defined with a
25429 // floating-point vector result that matches the operand type. This allows
25430 // them to work with an SSE1 target (integer vector types are not legal).
25431 VT = Op0.getSimpleValueType();
25432 }
25433
25434 SDValue Cmp;
25435 bool IsAlwaysSignaling;
25436 unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1, IsAlwaysSignaling);
25437 if (!Subtarget.hasAVX()) {
25438 // TODO: We could use following steps to handle a quiet compare with
25439 // signaling encodings.
25440 // 1. Get ordered masks from a quiet ISD::SETO
25441 // 2. Use the masks to mask potential unordered elements in operand A, B
25442 // 3. Get the compare results of masked A, B
25443 // 4. Calculating final result using the mask and result from 3
25444 // But currently, we just fall back to scalar operations.
25445 if (IsStrict && IsAlwaysSignaling && !IsSignaling)
25446 return SDValue();
25447
25448 // Insert an extra signaling instruction to raise exception.
25449 if (IsStrict && !IsAlwaysSignaling && IsSignaling) {
25450 SDValue SignalCmp = DAG.getNode(
25451 Opc, dl, {VT, MVT::Other},
25452 {Chain, Op0, Op1, DAG.getTargetConstant(1, dl, MVT::i8)}); // LT_OS
25453 // FIXME: It seems we need to update the flags of all new strict nodes.
25454 // Otherwise, mayRaiseFPException in MI will return false due to
25455 // NoFPExcept = false by default. However, I didn't find it in other
25456 // patches.
25457 SignalCmp->setFlags(Op->getFlags());
25458 Chain = SignalCmp.getValue(1);
25459 }
25460
25461 // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
25462 // emit two comparisons and a logic op to tie them together.
25463 if (!cheapX86FSETCC_SSE(Cond)) {
25464 // LLVM predicate is SETUEQ or SETONE.
25465 unsigned CC0, CC1;
25466 unsigned CombineOpc;
25467 if (Cond == ISD::SETUEQ) {
25468 CC0 = 3; // UNORD
25469 CC1 = 0; // EQ
25470 CombineOpc = X86ISD::FOR;
25471 } else {
25472 assert(Cond == ISD::SETONE);
25473 CC0 = 7; // ORD
25474 CC1 = 4; // NEQ
25475 CombineOpc = X86ISD::FAND;
25476 }
25477
25478 SDValue Cmp0, Cmp1;
25479 if (IsStrict) {
25480 Cmp0 = DAG.getNode(
25481 Opc, dl, {VT, MVT::Other},
25482 {Chain, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8)});
25483 Cmp1 = DAG.getNode(
25484 Opc, dl, {VT, MVT::Other},
25485 {Chain, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8)});
25486 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Cmp0.getValue(1),
25487 Cmp1.getValue(1));
25488 } else {
25489 Cmp0 = DAG.getNode(
25490 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8));
25491 Cmp1 = DAG.getNode(
25492 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8));
25493 }
25494 Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
25495 } else {
25496 if (IsStrict) {
25497 Cmp = DAG.getNode(
25498 Opc, dl, {VT, MVT::Other},
25499 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
25500 Chain = Cmp.getValue(1);
25501 } else
25502 Cmp = DAG.getNode(
25503 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
25504 }
25505 } else {
25506 // Handle all other FP comparisons here.
25507 if (IsStrict) {
25508 // Make a flip on already signaling CCs before setting bit 4 of AVX CC.
25509 SSECC |= (IsAlwaysSignaling ^ IsSignaling) << 4;
25510 Cmp = DAG.getNode(
25511 Opc, dl, {VT, MVT::Other},
25512 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
25513 Chain = Cmp.getValue(1);
25514 } else
25515 Cmp = DAG.getNode(
25516 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
25517 }
25518
25519 if (VT.getFixedSizeInBits() >
25520 Op.getSimpleValueType().getFixedSizeInBits()) {
25521 // We emitted a compare with an XMM/YMM result. Finish converting to a
25522 // mask register using a vptestm.
25523 EVT CastVT = EVT(VT).changeVectorElementTypeToInteger();
25524 Cmp = DAG.getBitcast(CastVT, Cmp);
25525 Cmp = DAG.getSetCC(dl, Op.getSimpleValueType(), Cmp,
25526 DAG.getConstant(0, dl, CastVT), ISD::SETNE);
25527 } else {
25528 // If this is SSE/AVX CMPP, bitcast the result back to integer to match
25529 // the result type of SETCC. The bitcast is expected to be optimized
25530 // away during combining/isel.
25531 Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
25532 }
25533
25534 if (IsStrict)
25535 return DAG.getMergeValues({Cmp, Chain}, dl);
25536
25537 return Cmp;
25538 }
25539
25540 assert(!IsStrict && "Strict SETCC only handles FP operands.");
25541
25542 MVT VTOp0 = Op0.getSimpleValueType();
25543 (void)VTOp0;
25544 assert(VTOp0 == Op1.getSimpleValueType() &&
25545 "Expected operands with same type!");
25546 assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
25547 "Invalid number of packed elements for source and destination!");
25548
25549 // The non-AVX512 code below works under the assumption that source and
25550 // destination types are the same.
25551 assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
25552 "Value types for source and destination must be the same!");
25553
25554 // The result is boolean, but operands are int/float
25555 if (VT.getVectorElementType() == MVT::i1) {
25556 // In AVX-512 architecture setcc returns mask with i1 elements,
25557 // But there is no compare instruction for i8 and i16 elements in KNL.
25558 assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
25559 "Unexpected operand type");
25560 return LowerIntVSETCC_AVX512(Op, DAG);
25561 }
25562
25563 // Lower using XOP integer comparisons.
25564 if (VT.is128BitVector() && Subtarget.hasXOP()) {
25565 // Translate compare code to XOP PCOM compare mode.
25566 unsigned CmpMode = 0;
25567 switch (Cond) {
25568 default: llvm_unreachable("Unexpected SETCC condition");
25569 case ISD::SETULT:
25570 case ISD::SETLT: CmpMode = 0x00; break;
25571 case ISD::SETULE:
25572 case ISD::SETLE: CmpMode = 0x01; break;
25573 case ISD::SETUGT:
25574 case ISD::SETGT: CmpMode = 0x02; break;
25575 case ISD::SETUGE:
25576 case ISD::SETGE: CmpMode = 0x03; break;
25577 case ISD::SETEQ: CmpMode = 0x04; break;
25578 case ISD::SETNE: CmpMode = 0x05; break;
25579 }
25580
25581 // Are we comparing unsigned or signed integers?
25582 unsigned Opc =
25583 ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
25584
25585 return DAG.getNode(Opc, dl, VT, Op0, Op1,
25586 DAG.getTargetConstant(CmpMode, dl, MVT::i8));
25587 }
25588
25589 // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
25590 // Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
25591 if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
25592 SDValue BC0 = peekThroughBitcasts(Op0);
25593 if (BC0.getOpcode() == ISD::AND) {
25594 APInt UndefElts;
25595 SmallVector<APInt, 64> EltBits;
25596 if (getTargetConstantBitsFromNode(BC0.getOperand(1),
25597 VT.getScalarSizeInBits(), UndefElts,
25598 EltBits, false, false)) {
25599 if (llvm::all_of(EltBits, [](APInt &V) { return V.isPowerOf2(); })) {
25600 Cond = ISD::SETEQ;
25601 Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
25602 }
25603 }
25604 }
25605 }
25606
25607 // ICMP_EQ(AND(X,C),C) -> SRA(SHL(X,LOG2(C)),BW-1) iff C is power-of-2.
25608 if (Cond == ISD::SETEQ && Op0.getOpcode() == ISD::AND &&
25609 Op0.getOperand(1) == Op1 && Op0.hasOneUse()) {
25610 ConstantSDNode *C1 = isConstOrConstSplat(Op1);
25611 if (C1 && C1->getAPIntValue().isPowerOf2()) {
25612 unsigned BitWidth = VT.getScalarSizeInBits();
25613 unsigned ShiftAmt = BitWidth - C1->getAPIntValue().logBase2() - 1;
25614
25615 SDValue Result = Op0.getOperand(0);
25616 Result = DAG.getNode(ISD::SHL, dl, VT, Result,
25617 DAG.getConstant(ShiftAmt, dl, VT));
25618 Result = DAG.getNode(ISD::SRA, dl, VT, Result,
25619 DAG.getConstant(BitWidth - 1, dl, VT));
25620 return Result;
25621 }
25622 }
25623
25624 // Break 256-bit integer vector compare into smaller ones.
25625 if (VT.is256BitVector() && !Subtarget.hasInt256())
25626 return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl);
25627
25628 // Break 512-bit integer vector compare into smaller ones.
25629 // TODO: Try harder to use VPCMPx + VPMOV2x?
25630 if (VT.is512BitVector())
25631 return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl);
25632
25633 // If we have a limit constant, try to form PCMPGT (signed cmp) to avoid
25634 // not-of-PCMPEQ:
25635 // X != INT_MIN --> X >s INT_MIN
25636 // X != INT_MAX --> X <s INT_MAX --> INT_MAX >s X
25637 // +X != 0 --> +X >s 0
25638 APInt ConstValue;
25639 if (Cond == ISD::SETNE &&
25640 ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) {
25641 if (ConstValue.isMinSignedValue())
25642 Cond = ISD::SETGT;
25643 else if (ConstValue.isMaxSignedValue())
25644 Cond = ISD::SETLT;
25645 else if (ConstValue.isZero() && DAG.SignBitIsZero(Op0))
25646 Cond = ISD::SETGT;
25647 }
25648
25649 // If both operands are known non-negative, then an unsigned compare is the
25650 // same as a signed compare and there's no need to flip signbits.
25651 // TODO: We could check for more general simplifications here since we're
25652 // computing known bits.
25653 bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
25654 !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
25655
25656 // Special case: Use min/max operations for unsigned compares.
25657 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25658 if (ISD::isUnsignedIntSetCC(Cond) &&
25659 (FlipSigns || ISD::isTrueWhenEqual(Cond)) &&
25660 TLI.isOperationLegal(ISD::UMIN, VT)) {
25661 // If we have a constant operand, increment/decrement it and change the
25662 // condition to avoid an invert.
25663 if (Cond == ISD::SETUGT) {
25664 // X > C --> X >= (C+1) --> X == umax(X, C+1)
25665 if (SDValue UGTOp1 =
25666 incDecVectorConstant(Op1, DAG, /*IsInc*/ true, /*NSW*/ false)) {
25667 Op1 = UGTOp1;
25668 Cond = ISD::SETUGE;
25669 }
25670 }
25671 if (Cond == ISD::SETULT) {
25672 // X < C --> X <= (C-1) --> X == umin(X, C-1)
25673 if (SDValue ULTOp1 =
25674 incDecVectorConstant(Op1, DAG, /*IsInc*/ false, /*NSW*/ false)) {
25675 Op1 = ULTOp1;
25676 Cond = ISD::SETULE;
25677 }
25678 }
25679 bool Invert = false;
25680 unsigned Opc;
25681 switch (Cond) {
25682 default: llvm_unreachable("Unexpected condition code");
25683 case ISD::SETUGT: Invert = true; [[fallthrough]];
25684 case ISD::SETULE: Opc = ISD::UMIN; break;
25685 case ISD::SETULT: Invert = true; [[fallthrough]];
25686 case ISD::SETUGE: Opc = ISD::UMAX; break;
25687 }
25688
25689 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
25690 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
25691
25692 // If the logical-not of the result is required, perform that now.
25693 if (Invert)
25694 Result = DAG.getNOT(dl, Result, VT);
25695
25696 return Result;
25697 }
25698
25699 // Try to use SUBUS and PCMPEQ.
25700 if (FlipSigns)
25701 if (SDValue V =
25702 LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
25703 return V;
25704
25705 // We are handling one of the integer comparisons here. Since SSE only has
25706 // GT and EQ comparisons for integer, swapping operands and multiple
25707 // operations may be required for some comparisons.
25708 unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
25709 : X86ISD::PCMPGT;
25710 bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
25711 Cond == ISD::SETGE || Cond == ISD::SETUGE;
25712 bool Invert = Cond == ISD::SETNE ||
25713 (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
25714
25715 if (Swap)
25716 std::swap(Op0, Op1);
25717
25718 // Check that the operation in question is available (most are plain SSE2,
25719 // but PCMPGTQ and PCMPEQQ have different requirements).
25720 if (VT == MVT::v2i64) {
25721 if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
25722 assert(Subtarget.hasSSE2() && "Don't know how to lower!");
25723
25724 // Special case for sign bit test. We can use a v4i32 PCMPGT and shuffle
25725 // the odd elements over the even elements.
25726 if (!FlipSigns && !Invert && ISD::isBuildVectorAllZeros(Op0.getNode())) {
25727 Op0 = DAG.getConstant(0, dl, MVT::v4i32);
25728 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
25729
25730 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
25731 static const int MaskHi[] = { 1, 1, 3, 3 };
25732 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
25733
25734 return DAG.getBitcast(VT, Result);
25735 }
25736
25737 if (!FlipSigns && !Invert && ISD::isBuildVectorAllOnes(Op1.getNode())) {
25738 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
25739 Op1 = DAG.getConstant(-1, dl, MVT::v4i32);
25740
25741 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
25742 static const int MaskHi[] = { 1, 1, 3, 3 };
25743 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
25744
25745 return DAG.getBitcast(VT, Result);
25746 }
25747
25748 // Since SSE has no unsigned integer comparisons, we need to flip the sign
25749 // bits of the inputs before performing those operations. The lower
25750 // compare is always unsigned.
25751 SDValue SB = DAG.getConstant(FlipSigns ? 0x8000000080000000ULL
25752 : 0x0000000080000000ULL,
25753 dl, MVT::v2i64);
25754
25755 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op0, SB);
25756 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op1, SB);
25757
25758 // Cast everything to the right type.
25759 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
25760 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
25761
25762 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
25763 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
25764 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
25765
25766 // Create masks for only the low parts/high parts of the 64 bit integers.
25767 static const int MaskHi[] = { 1, 1, 3, 3 };
25768 static const int MaskLo[] = { 0, 0, 2, 2 };
25769 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
25770 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
25771 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
25772
25773 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
25774 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
25775
25776 if (Invert)
25777 Result = DAG.getNOT(dl, Result, MVT::v4i32);
25778
25779 return DAG.getBitcast(VT, Result);
25780 }
25781
25782 if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
25783 // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
25784 // pcmpeqd + pshufd + pand.
25785 assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
25786
25787 // First cast everything to the right type.
25788 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
25789 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
25790
25791 // Do the compare.
25792 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
25793
25794 // Make sure the lower and upper halves are both all-ones.
25795 static const int Mask[] = { 1, 0, 3, 2 };
25796 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
25797 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
25798
25799 if (Invert)
25800 Result = DAG.getNOT(dl, Result, MVT::v4i32);
25801
25802 return DAG.getBitcast(VT, Result);
25803 }
25804 }
25805
25806 // Since SSE has no unsigned integer comparisons, we need to flip the sign
25807 // bits of the inputs before performing those operations.
25808 if (FlipSigns) {
25809 MVT EltVT = VT.getVectorElementType();
25810 SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
25811 VT);
25812 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
25813 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
25814 }
25815
25816 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
25817
25818 // If the logical-not of the result is required, perform that now.
25819 if (Invert)
25820 Result = DAG.getNOT(dl, Result, VT);
25821
25822 return Result;
25823}
25824
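The v2i64 fallbacks in LowerVSETCC rely on two half-word identities: two 64-bit values are equal only if both 32-bit halves are equal (the pcmpeqd+pshufd+pand pattern), and signed 64-bit greater-than holds when the high halves compare greater, or compare equal and the low halves compare greater unsigned (the sign-bit XOR in the code turns the signed 32-bit PCMPGT on the low dwords into that unsigned compare). A scalar check of both identities over a handful of values (illustration only):

#include <cassert>
#include <cstdint>
#include <vector>

static bool Sgt64ViaHalves(int64_t A, int64_t B) {
  int32_t AHi = int32_t(A >> 32), BHi = int32_t(B >> 32);
  uint32_t ALo = uint32_t(A), BLo = uint32_t(B);
  // (hi1 > hi2) | ((hi1 == hi2) & (lo1 >u lo2))
  return AHi > BHi || (AHi == BHi && ALo > BLo);
}

int main() {
  std::vector<int64_t> Vals = {0, 1, -1, 42, -42, INT64_MIN, INT64_MAX,
                               0x100000000LL, -0x100000000LL};
  for (int64_t A : Vals)
    for (int64_t B : Vals) {
      assert(Sgt64ViaHalves(A, B) == (A > B));
      // Equality needs both halves equal.
      assert((A == B) == (uint32_t(A) == uint32_t(B) &&
                          int32_t(A >> 32) == int32_t(B >> 32)));
    }
  return 0;
}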
25825// Try to select this as a KORTEST+SETCC or KTEST+SETCC if possible.
25826static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC,
25827 const SDLoc &dl, SelectionDAG &DAG,
25828 const X86Subtarget &Subtarget,
25829 SDValue &X86CC) {
25830 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
25831
25832 // Must be a bitcast from vXi1.
25833 if (Op0.getOpcode() != ISD::BITCAST)
25834 return SDValue();
25835
25836 Op0 = Op0.getOperand(0);
25837 MVT VT = Op0.getSimpleValueType();
25838 if (!(Subtarget.hasAVX512() && VT == MVT::v16i1) &&
25839 !(Subtarget.hasDQI() && VT == MVT::v8i1) &&
25840 !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
25841 return SDValue();
25842
25843 X86::CondCode X86Cond;
25844 if (isNullConstant(Op1)) {
25845 X86Cond = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
25846 } else if (isAllOnesConstant(Op1)) {
25847 // C flag is set for all ones.
25848 X86Cond = CC == ISD::SETEQ ? X86::COND_B : X86::COND_AE;
25849 } else
25850 return SDValue();
25851
25852 // If the input is an AND, we can combine its operands into the KTEST.
25853 bool KTestable = false;
25854 if (Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1))
25855 KTestable = true;
25856 if (Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1))
25857 KTestable = true;
25858 if (!isNullConstant(Op1))
25859 KTestable = false;
25860 if (KTestable && Op0.getOpcode() == ISD::AND && Op0.hasOneUse()) {
25861 SDValue LHS = Op0.getOperand(0);
25862 SDValue RHS = Op0.getOperand(1);
25863 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25864 return DAG.getNode(X86ISD::KTEST, dl, MVT::i32, LHS, RHS);
25865 }
25866
25867 // If the input is an OR, we can combine its operands into the KORTEST.
25868 SDValue LHS = Op0;
25869 SDValue RHS = Op0;
25870 if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) {
25871 LHS = Op0.getOperand(0);
25872 RHS = Op0.getOperand(1);
25873 }
25874
25875 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25876 return DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
25877}
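// Illustrative sketch (not from the listing): a scalar model of the KORTEST
// flag behaviour the code above relies on - ZF is set when the OR of the two
// masks is all zeros and CF when it is all ones, hence the COND_E/COND_NE
// mapping for a 0 comparison and COND_B/COND_AE for a -1 comparison.
#include <cstdint>
struct MaskTestFlags { bool ZF, CF; };
static inline MaskTestFlags Kortest16(uint16_t LHS, uint16_t RHS) {
  uint16_t Or = LHS | RHS;
  return {Or == 0, Or == 0xFFFF};
}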
25878
25879/// Emit flags for the given setcc condition and operands. Also returns the
25880/// corresponding X86 condition code constant in X86CC.
25881SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
25882 ISD::CondCode CC, const SDLoc &dl,
25883 SelectionDAG &DAG,
25884 SDValue &X86CC) const {
25885 // Equality Combines.
25886 if (CC == ISD::SETEQ || CC == ISD::SETNE) {
25887 X86::CondCode X86CondCode;
25888
25889 // Optimize to BT if possible.
25890 // Lower (X & (1 << N)) == 0 to BT(X, N).
25891 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
25892 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
25893 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1)) {
25894 if (SDValue BT = LowerAndToBT(Op0, CC, dl, DAG, X86CondCode)) {
25895 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
25896 return BT;
25897 }
25898 }
25899
25900 // Try to use PTEST/PMOVMSKB for a tree of ANDs/ORs equality-compared with -1/0.
25901 if (SDValue CmpZ = MatchVectorAllEqualTest(Op0, Op1, CC, dl, Subtarget, DAG,
25902 X86CondCode)) {
25903 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
25904 return CmpZ;
25905 }
25906
25907 // Try to lower using KORTEST or KTEST.
25908 if (SDValue Test = EmitAVX512Test(Op0, Op1, CC, dl, DAG, Subtarget, X86CC))
25909 return Test;
25910
25911 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms
25912 // of these.
25913 if (isOneConstant(Op1) || isNullConstant(Op1)) {
25914 // If the input is a setcc, then reuse the input setcc or use a new one
25915 // with the inverted condition.
25916 if (Op0.getOpcode() == X86ISD::SETCC) {
25917 bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
25918
25919 X86CC = Op0.getOperand(0);
25920 if (Invert) {
25921 X86CondCode = (X86::CondCode)Op0.getConstantOperandVal(0);
25922 X86CondCode = X86::GetOppositeBranchCondition(X86CondCode);
25923 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
25924 }
25925
25926 return Op0.getOperand(1);
25927 }
25928 }
25929
25930 // Try to use the carry flag from the add in place of a separate CMP for:
25931 // (seteq (add X, -1), -1). Similar for setne.
25932 if (isAllOnesConstant(Op1) && Op0.getOpcode() == ISD::ADD &&
25933 Op0.getOperand(1) == Op1) {
25934 if (isProfitableToUseFlagOp(Op0)) {
25935 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
25936
25937 SDValue New = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(0),
25938 Op0.getOperand(1));
25939 DAG.ReplaceAllUsesOfValueWith(SDValue(Op0.getNode(), 0), New);
25940 X86CondCode = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
25941 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
25942 return SDValue(New.getNode(), 1);
25943 }
25944 }
25945 }
25946
25947 X86::CondCode CondCode =
25948 TranslateX86CC(CC, dl, /*IsFP*/ false, Op0, Op1, DAG);
25949 assert(CondCode != X86::COND_INVALID && "Unexpected condition code!");
25950
25951 SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget);
25952 X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
25953 return EFLAGS;
25954}
25955
25956SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
25957
25958 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
25959 Op.getOpcode() == ISD::STRICT_FSETCCS;
25960 MVT VT = Op->getSimpleValueType(0);
25961
25962 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
25963
25964 assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
25965 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
25966 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
25967 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
25968 SDLoc dl(Op);
25969 ISD::CondCode CC =
25970 cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get();
25971
25972 if (isSoftFP16(Op0.getValueType()))
25973 return SDValue();
25974
25975 // Handle f128 first, since one possible outcome is a normal integer
25976 // comparison which gets handled by emitFlagsForSetcc.
25977 if (Op0.getValueType() == MVT::f128) {
25978 softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1, Chain,
25979 Op.getOpcode() == ISD::STRICT_FSETCCS);
25980
25981 // If softenSetCCOperands returned a scalar, use it.
25982 if (!Op1.getNode()) {
25983 assert(Op0.getValueType() == Op.getValueType() &&
25984 "Unexpected setcc expansion!");
25985 if (IsStrict)
25986 return DAG.getMergeValues({Op0, Chain}, dl);
25987 return Op0;
25988 }
25989 }
25990
25991 if (Op0.getSimpleValueType().isInteger()) {
25992 // Attempt to canonicalize SGT/UGT -> SGE/UGE compares with a constant, which
25993 // reduces the number of EFLAGS bit reads (the GE conditions don't read ZF);
25994 // this may translate to fewer uops depending on the uarch implementation. The
25995 // equivalent for SLE/ULE -> SLT/ULT isn't likely to happen as we already
25996 // canonicalize to that CondCode.
25997 // NOTE: Only do this if incrementing the constant doesn't increase the bit
25998 // encoding size - so it must either already be an i8 or i32 immediate, or it
25999 // shrinks down to that. We don't do this for any i64's to avoid additional
26000 // constant materializations.
26001 // TODO: Can we move this to TranslateX86CC to handle jumps/branches too?
26002 if (auto *Op1C = dyn_cast<ConstantSDNode>(Op1)) {
26003 const APInt &Op1Val = Op1C->getAPIntValue();
26004 if (!Op1Val.isZero()) {
26005 // Ensure the constant+1 doesn't overflow.
26006 if ((CC == ISD::CondCode::SETGT && !Op1Val.isMaxSignedValue()) ||
26007 (CC == ISD::CondCode::SETUGT && !Op1Val.isMaxValue())) {
26008 APInt Op1ValPlusOne = Op1Val + 1;
26009 if (Op1ValPlusOne.isSignedIntN(32) &&
26010 (!Op1Val.isSignedIntN(8) || Op1ValPlusOne.isSignedIntN(8))) {
26011 Op1 = DAG.getConstant(Op1ValPlusOne, dl, Op0.getValueType());
26012 CC = CC == ISD::CondCode::SETGT ? ISD::CondCode::SETGE
26013 : ISD::CondCode::SETUGE;
26014 }
26015 }
26016 }
26017 }
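// Illustrative sketch (not from the listing): the canonicalization above uses
// X >s C <=> X >=s (C + 1) (and the unsigned analogue), valid because the
// isMaxSignedValue()/isMaxValue() guards rule out overflow of C + 1, and it is
// only applied when C + 1 keeps the same immediate encoding size.
static inline bool SgtAsSge(int X, int C) {
  // Assumes C != INT_MAX, mirroring the guard above.
  return X >= C + 1; // same truth value as X > C
}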
26018
26019 SDValue X86CC;
26020 SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC);
26021 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
26022 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
26023 }
26024
26025 // Handle floating point.
26026 X86::CondCode CondCode = TranslateX86CC(CC, dl, /*IsFP*/ true, Op0, Op1, DAG);
26027 if (CondCode == X86::COND_INVALID)
26028 return SDValue();
26029
26030 SDValue EFLAGS;
26031 if (IsStrict) {
26032 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
26033 EFLAGS =
26034 DAG.getNode(IsSignaling ? X86ISD::STRICT_FCMPS : X86ISD::STRICT_FCMP,
26035 dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1});
26036 Chain = EFLAGS.getValue(1);
26037 } else {
26038 EFLAGS = DAG.getNode(X86ISD::FCMP, dl, MVT::i32, Op0, Op1);
26039 }
26040
26041 SDValue X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
26042 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
26043 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
26044}
26045
26046SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
26047 SDValue LHS = Op.getOperand(0);
26048 SDValue RHS = Op.getOperand(1);
26049 SDValue Carry = Op.getOperand(2);
26050 SDValue Cond = Op.getOperand(3);
26051 SDLoc DL(Op);
26052
26053 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
26054 X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
26055
26056 // Recreate the carry if needed.
26057 EVT CarryVT = Carry.getValueType();
26058 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
26059 Carry, DAG.getAllOnesConstant(DL, CarryVT));
26060
26061 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
26062 SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
26063 return getSETCC(CC, Cmp.getValue(1), DL, DAG);
26064}
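// Illustrative sketch (not from the listing): SETCCCARRY evaluates
// LHS - RHS - CF and takes its condition from the resulting flags. The
// X86ISD::ADD above recreates CF from the incoming carry value (adding
// all-ones carries out exactly when that value is nonzero); the SBB then
// produces the borrow modelled here.
#include <cstdint>
static inline bool BorrowOutOfSbb(uint32_t LHS, uint32_t RHS, bool CarryIn) {
  return (uint64_t)LHS < (uint64_t)RHS + (CarryIn ? 1u : 0u); // CF after SBB
}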
26065
26066// This function returns three things: the arithmetic computation itself
26067// (Value), an EFLAGS result (Overflow), and a condition code (Cond). The
26068// flag and the condition code define the case in which the arithmetic
26069// computation overflows.
26070static std::pair<SDValue, SDValue>
26071getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG) {
26072 assert(Op.getResNo() == 0 && "Unexpected result number!");
26073 SDValue Value, Overflow;
26074 SDValue LHS = Op.getOperand(0);
26075 SDValue RHS = Op.getOperand(1);
26076 unsigned BaseOp = 0;
26077 SDLoc DL(Op);
26078 switch (Op.getOpcode()) {
26079 default: llvm_unreachable("Unknown ovf instruction!");
26080 case ISD::SADDO:
26081 BaseOp = X86ISD::ADD;
26082 Cond = X86::COND_O;
26083 break;
26084 case ISD::UADDO:
26085 BaseOp = X86ISD::ADD;
26086 Cond = isOneConstant(RHS) ? X86::COND_E : X86::COND_B;
26087 break;
26088 case ISD::SSUBO:
26089 BaseOp = X86ISD::SUB;
26090 Cond = X86::COND_O;
26091 break;
26092 case ISD::USUBO:
26093 BaseOp = X86ISD::SUB;
26094 Cond = X86::COND_B;
26095 break;
26096 case ISD::SMULO:
26097 BaseOp = X86ISD::SMUL;
26098 Cond = X86::COND_O;
26099 break;
26100 case ISD::UMULO:
26101 BaseOp = X86ISD::UMUL;
26102 Cond = X86::COND_O;
26103 break;
26104 }
26105
26106 if (BaseOp) {
26107 // Also sets EFLAGS.
26108 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
26109 Value = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
26110 Overflow = Value.getValue(1);
26111 }
26112
26113 return std::make_pair(Value, Overflow);
26114}
26115
26116static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
26117 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
26118 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
26119 // looks for this combo and may remove the "setcc" instruction if the "setcc"
26120 // has only one use.
26121 SDLoc DL(Op);
26122 X86::CondCode Cond;
26123 SDValue Value, Overflow;
26124 std::tie(Value, Overflow) = getX86XALUOOp(Cond, Op, DAG);
26125
26126 SDValue SetCC = getSETCC(Cond, Overflow, DL, DAG);
26127 assert(Op->getValueType(1) == MVT::i8 && "Unexpected VT!");
26128 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Value, SetCC);
26129}
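// Illustrative sketch (not from the listing): for the plain unsigned-add case
// above, the "overflow" result is just the carry out of the addition, which is
// what X86::COND_B reads (the RHS == 1 case is special-cased to COND_E because
// x + 1 wraps to zero exactly when it overflows).
#include <cstdint>
static inline bool UAddOverflow32(uint32_t A, uint32_t B, uint32_t &Sum) {
  Sum = A + B;    // the arithmetic Value result
  return Sum < A; // carry out == unsigned overflow (the EFLAGS result)
}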
26130
26131 /// Return true if opcode is an X86 logical comparison.
26132static bool isX86LogicalCmp(SDValue Op) {
26133 unsigned Opc = Op.getOpcode();
26134 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
26135 Opc == X86ISD::FCMP)
26136 return true;
26137 if (Op.getResNo() == 1 &&
26138 (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
26139 Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
26140 Opc == X86ISD::OR || Opc == X86ISD::XOR || Opc == X86ISD::AND))
26141 return true;
26142
26143 return false;
26144}
26145
26146static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
26147 if (V.getOpcode() != ISD::TRUNCATE)
26148 return false;
26149
26150 SDValue VOp0 = V.getOperand(0);
26151 unsigned InBits = VOp0.getValueSizeInBits();
26152 unsigned Bits = V.getValueSizeInBits();
26153 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
26154}
26155
26156SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
26157 bool AddTest = true;
26158 SDValue Cond = Op.getOperand(0);
26159 SDValue Op1 = Op.getOperand(1);
26160 SDValue Op2 = Op.getOperand(2);
26161 SDLoc DL(Op);
26162 MVT VT = Op1.getSimpleValueType();
26163 SDValue CC;
26164
26165 if (isSoftFP16(VT)) {
26166 MVT NVT = VT.changeTypeToInteger();
26167 return DAG.getBitcast(VT, DAG.getNode(ISD::SELECT, DL, NVT, Cond,
26168 DAG.getBitcast(NVT, Op1),
26169 DAG.getBitcast(NVT, Op2)));
26170 }
26171
26172 // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
26173 // are available or VBLENDV if AVX is available.
26174 // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
26175 if (Cond.getOpcode() == ISD::SETCC && isScalarFPTypeInSSEReg(VT) &&
26176 VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
26177 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
26178 bool IsAlwaysSignaling;
26179 unsigned SSECC =
26180 translateX86FSETCC(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
26181 CondOp0, CondOp1, IsAlwaysSignaling);
26182
26183 if (Subtarget.hasAVX512()) {
26184 SDValue Cmp =
26185 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, CondOp1,
26186 DAG.getTargetConstant(SSECC, DL, MVT::i8));
26187 assert(!VT.isVector() && "Not a scalar type?");
26188 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
26189 }
26190
26191 if (SSECC < 8 || Subtarget.hasAVX()) {
26192 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
26193 DAG.getTargetConstant(SSECC, DL, MVT::i8));
26194
26195 // If we have AVX, we can use a variable vector select (VBLENDV) instead
26196 // of 3 logic instructions for size savings and potentially speed.
26197 // Unfortunately, there is no scalar form of VBLENDV.
26198
26199 // If either operand is a +0.0 constant, don't try this. We can expect to
26200 // optimize away at least one of the logic instructions later in that
26201 // case, so that sequence would be faster than a variable blend.
26202
26203 // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
26204 // uses XMM0 as the selection register. That may need just as many
26205 // instructions as the AND/ANDN/OR sequence due to register moves, so
26206 // don't bother.
26207 if (Subtarget.hasAVX() && !isNullFPConstant(Op1) &&
26208 !isNullFPConstant(Op2)) {
26209 // Convert to vectors, do a VSELECT, and convert back to scalar.
26210 // All of the conversions should be optimized away.
26211 MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
26212 SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
26213 SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
26214 SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
26215
26216 MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
26217 VCmp = DAG.getBitcast(VCmpVT, VCmp);
26218
26219 SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
26220
26221 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
26222 VSel, DAG.getIntPtrConstant(0, DL));
26223 }
26224 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
26225 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
26226 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
26227 }
26228 }
26229
26230 // AVX512 fallback is to lower selects of scalar floats to masked moves.
26231 if (isScalarFPTypeInSSEReg(VT) && Subtarget.hasAVX512()) {
26232 SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
26233 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
26234 }
26235
26236 if (Cond.getOpcode() == ISD::SETCC &&
26237 !isSoftFP16(Cond.getOperand(0).getSimpleValueType())) {
26238 if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
26239 Cond = NewCond;
26240 // If the condition was updated, it's possible that the operands of the
26241 // select were also updated (for example, EmitTest has a RAUW). Refresh
26242 // the local references to the select operands in case they got stale.
26243 Op1 = Op.getOperand(1);
26244 Op2 = Op.getOperand(2);
26245 }
26246 }
26247
26248 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
26249 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
26250 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
26251 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
26252 // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
26253 // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
26254 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
26255 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
26256 if (Cond.getOpcode() == X86ISD::SETCC &&
26257 Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
26258 isNullConstant(Cond.getOperand(1).getOperand(1))) {
26259 SDValue Cmp = Cond.getOperand(1);
26260 SDValue CmpOp0 = Cmp.getOperand(0);
26261 unsigned CondCode = Cond.getConstantOperandVal(0);
26262
26263 // Special handling for the __builtin_ffs(X) - 1 pattern, which looks like
26264 // (select (seteq X, 0), -1, (cttz_zero_undef X)). Disable the special
26265 // handling to keep the CMP with 0. This should be removed by
26266 // optimizeCompareInst by using the flags from the BSR/TZCNT used for the
26267 // cttz_zero_undef.
26268 auto MatchFFSMinus1 = [&](SDValue Op1, SDValue Op2) {
26269 return (Op1.getOpcode() == ISD::CTTZ_ZERO_UNDEF && Op1.hasOneUse() &&
26270 Op1.getOperand(0) == CmpOp0 && isAllOnesConstant(Op2));
26271 };
26272 if (Subtarget.canUseCMOV() && (VT == MVT::i32 || VT == MVT::i64) &&
26273 ((CondCode == X86::COND_NE && MatchFFSMinus1(Op1, Op2)) ||
26274 (CondCode == X86::COND_E && MatchFFSMinus1(Op2, Op1)))) {
26275 // Keep Cmp.
26276 } else if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
26277 (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
26278 SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
26279 SDVTList CmpVTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
26280
26281 // 'X - 1' sets the carry flag if X == 0.
26282 // '0 - X' sets the carry flag if X != 0.
26283 // Convert the carry flag to a -1/0 mask with sbb:
26284 // select (X != 0), -1, Y --> 0 - X; or (sbb), Y
26285 // select (X == 0), Y, -1 --> 0 - X; or (sbb), Y
26286 // select (X != 0), Y, -1 --> X - 1; or (sbb), Y
26287 // select (X == 0), -1, Y --> X - 1; or (sbb), Y
26288 SDValue Sub;
26289 if (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE)) {
26290 SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
26291 Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, Zero, CmpOp0);
26292 } else {
26293 SDValue One = DAG.getConstant(1, DL, CmpOp0.getValueType());
26294 Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, CmpOp0, One);
26295 }
26296 SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
26297 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
26298 Sub.getValue(1));
26299 return DAG.getNode(ISD::OR, DL, VT, SBB, Y);
26300 } else if (!Subtarget.canUseCMOV() && CondCode == X86::COND_E &&
26301 CmpOp0.getOpcode() == ISD::AND &&
26302 isOneConstant(CmpOp0.getOperand(1))) {
26303 SDValue Src1, Src2;
26304 // True if Op2 is an XOR or OR operator and one of its operands
26305 // is equal to Op1, i.e. the operand pair is
26306 // (a, a op b) or (b, a op b).
26307 auto isOrXorPattern = [&]() {
26308 if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) &&
26309 (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
26310 Src1 =
26311 Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
26312 Src2 = Op1;
26313 return true;
26314 }
26315 return false;
26316 };
26317
26318 if (isOrXorPattern()) {
26319 SDValue Neg;
26320 unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
26321 // We need a mask of all zeros or all ones with the same size as the
26322 // other operands.
26323 if (CmpSz > VT.getSizeInBits())
26324 Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
26325 else if (CmpSz < VT.getSizeInBits())
26326 Neg = DAG.getNode(ISD::AND, DL, VT,
26327 DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
26328 DAG.getConstant(1, DL, VT));
26329 else
26330 Neg = CmpOp0;
26331 SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
26332 Neg); // -(and (x, 0x1))
26333 SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
26334 return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y
26335 }
26336 } else if ((VT == MVT::i32 || VT == MVT::i64) && isNullConstant(Op2) &&
26337 Cmp.getNode()->hasOneUse() && (CmpOp0 == Op1) &&
26338 ((CondCode == X86::COND_S) || // smin(x, 0)
26339 (CondCode == X86::COND_G && hasAndNot(Op1)))) { // smax(x, 0)
26340 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
26341 //
26342 // If the comparison is testing for a positive value, we have to invert
26343 // the sign bit mask, so only do that transform if the target has a
26344 // bitwise 'and not' instruction (the invert is free).
26345 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
26346 unsigned ShCt = VT.getSizeInBits() - 1;
26347 SDValue ShiftAmt = DAG.getConstant(ShCt, DL, VT);
26348 SDValue Shift = DAG.getNode(ISD::SRA, DL, VT, Op1, ShiftAmt);
26349 if (CondCode == X86::COND_G)
26350 Shift = DAG.getNOT(DL, Shift, VT);
26351 return DAG.getNode(ISD::AND, DL, VT, Shift, Op1);
26352 }
26353 }
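// Illustrative sketch (not from the listing): a scalar model of the carry/SBB
// mask trick in the block above for select(X == 0, -1, Y) and its variants.
// 'X - 1' borrows exactly when X == 0; SETCC_CARRY materialises that borrow as
// an all-ones/all-zeros value which is then OR'd with Y.
#include <cstdint>
static inline uint32_t SelectEqZeroAllOnesOrY(uint32_t X, uint32_t Y) {
  uint32_t Mask = (X == 0) ? 0xFFFFFFFFu : 0u; // models sbb r, r after X - 1
  return Mask | Y;                             // == (X == 0) ? 0xFFFFFFFF : Y
}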
26354
26355 // Look past (and (setcc_carry (cmp ...)), 1).
26356 if (Cond.getOpcode() == ISD::AND &&
26357 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
26358 isOneConstant(Cond.getOperand(1)))
26359 Cond = Cond.getOperand(0);
26360
26361 // If condition flag is set by a X86ISD::CMP, then use it as the condition
26362 // setting operand in place of the X86ISD::SETCC.
26363 unsigned CondOpcode = Cond.getOpcode();
26364 if (CondOpcode == X86ISD::SETCC ||
26365 CondOpcode == X86ISD::SETCC_CARRY) {
26366 CC = Cond.getOperand(0);
26367
26368 SDValue Cmp = Cond.getOperand(1);
26369 bool IllegalFPCMov = false;
26370 if (VT.isFloatingPoint() && !VT.isVector() &&
26371 !isScalarFPTypeInSSEReg(VT) && Subtarget.canUseCMOV()) // FPStack?
26372 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
26373
26374 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
26375 Cmp.getOpcode() == X86ISD::BT) { // FIXME
26376 Cond = Cmp;
26377 AddTest = false;
26378 }
26379 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
26380 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
26381 CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
26382 SDValue Value;
26383 X86::CondCode X86Cond;
26384 std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
26385
26386 CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8);
26387 AddTest = false;
26388 }
26389
26390 if (AddTest) {
26391 // Look past the truncate if the high bits are known zero.
26392 if (isTruncWithZeroHighBitsInput(Cond, DAG))
26393 Cond = Cond.getOperand(0);
26394
26395 // We know the result of AND is compared against zero. Try to match
26396 // it to BT.
26397 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
26398 X86::CondCode X86CondCode;
26399 if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, X86CondCode)) {
26400 CC = DAG.getTargetConstant(X86CondCode, DL, MVT::i8);
26401 Cond = BT;
26402 AddTest = false;
26403 }
26404 }
26405 }
26406
26407 if (AddTest) {
26408 CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
26409 Cond = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget);
26410 }
26411
26412 // a < b ? -1 : 0 -> RES = ~setcc_carry
26413 // a < b ? 0 : -1 -> RES = setcc_carry
26414 // a >= b ? -1 : 0 -> RES = setcc_carry
26415 // a >= b ? 0 : -1 -> RES = ~setcc_carry
26416 if (Cond.getOpcode() == X86ISD::SUB) {
26417 unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
26418
26419 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
26420 (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
26421 (isNullConstant(Op1) || isNullConstant(Op2))) {
26422 SDValue Res =
26423 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
26424 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cond);
26425 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
26426 return DAG.getNOT(DL, Res, Res.getValueType());
26427 return Res;
26428 }
26429 }
26430
26431 // X86 doesn't have an i8 cmov. If both operands are the result of a truncate
26432 // widen the cmov and push the truncate through. This avoids introducing a new
26433 // branch during isel and doesn't add any extensions.
26434 if (Op.getValueType() == MVT::i8 &&
26435 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
26436 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
26437 if (T1.getValueType() == T2.getValueType() &&
26438 // Exclude CopyFromReg to avoid partial register stalls.
26439 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
26440 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
26441 CC, Cond);
26442 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
26443 }
26444 }
26445
26446 // Or finally, promote i8 cmovs if we have CMOV,
26447 // or i16 cmovs if it won't prevent folding a load.
26448 // FIXME: we should not limit promotion of i8 case to only when the CMOV is
26449 // legal, but EmitLoweredSelect() can not deal with these extensions
26450 // being inserted between two CMOV's. (in i16 case too TBN)
26451 // https://bugs.llvm.org/show_bug.cgi?id=40974
26452 if ((Op.getValueType() == MVT::i8 && Subtarget.canUseCMOV()) ||
26453 (Op.getValueType() == MVT::i16 && !X86::mayFoldLoad(Op1, Subtarget) &&
26454 !X86::mayFoldLoad(Op2, Subtarget))) {
26455 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
26456 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
26457 SDValue Ops[] = { Op2, Op1, CC, Cond };
26458 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops);
26459 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
26460 }
26461
26462 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
26463 // condition is true.
26464 SDValue Ops[] = { Op2, Op1, CC, Cond };
26465 return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops);
26466}
26467
26468static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
26469 const X86Subtarget &Subtarget,
26470 SelectionDAG &DAG) {
26471 MVT VT = Op->getSimpleValueType(0);
26472 SDValue In = Op->getOperand(0);
26473 MVT InVT = In.getSimpleValueType();
26474 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
26475 MVT VTElt = VT.getVectorElementType();
26476 SDLoc dl(Op);
26477
26478 unsigned NumElts = VT.getVectorNumElements();
26479
26480 // Extend VT if the scalar type is i8/i16 and BWI is not supported.
26481 MVT ExtVT = VT;
26482 if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) {
26483 // If v16i32 is to be avoided, we'll need to split and concatenate.
26484 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
26485 return SplitAndExtendv16i1(Op.getOpcode(), VT, In, dl, DAG);
26486
26487 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
26488 }
26489
26490 // Widen to 512-bits if VLX is not supported.
26491 MVT WideVT = ExtVT;
26492 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
26493 NumElts *= 512 / ExtVT.getSizeInBits();
26494 InVT = MVT::getVectorVT(MVT::i1, NumElts);
26495 In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT),
26496 In, DAG.getIntPtrConstant(0, dl));
26497 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
26498 }
26499
26500 SDValue V;
26501 MVT WideEltVT = WideVT.getVectorElementType();
26502 if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
26503 (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
26504 V = DAG.getNode(Op.getOpcode(), dl, WideVT, In);
26505 } else {
26506 SDValue NegOne = DAG.getConstant(-1, dl, WideVT);
26507 SDValue Zero = DAG.getConstant(0, dl, WideVT);
26508 V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
26509 }
26510
26511 // Truncate if we had to extend i16/i8 above.
26512 if (VT != ExtVT) {
26513 WideVT = MVT::getVectorVT(VTElt, NumElts);
26514 V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V);
26515 }
26516
26517 // Extract back to 128/256-bit if we widened.
26518 if (WideVT != VT)
26519 V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
26520 DAG.getIntPtrConstant(0, dl));
26521
26522 return V;
26523}
26524
26525static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
26526 SelectionDAG &DAG) {
26527 SDValue In = Op->getOperand(0);
26528 MVT InVT = In.getSimpleValueType();
26529
26530 if (InVT.getVectorElementType() == MVT::i1)
26531 return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
26532
26533 assert(Subtarget.hasAVX() && "Expected AVX support");
26534 return LowerAVXExtend(Op, DAG, Subtarget);
26535}
26536
26537// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
26538// For sign extend this needs to handle all vector sizes and SSE4.1 and
26539// non-SSE4.1 targets. For zero extend this should only handle inputs of
26540// MVT::v64i8 when BWI is not supported, but AVX512 is.
26541static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
26542 const X86Subtarget &Subtarget,
26543 SelectionDAG &DAG) {
26544 SDValue In = Op->getOperand(0);
26545 MVT VT = Op->getSimpleValueType(0);
26546 MVT InVT = In.getSimpleValueType();
26547
26548 MVT SVT = VT.getVectorElementType();
26549 MVT InSVT = InVT.getVectorElementType();
26550 assert(SVT.getFixedSizeInBits() > InSVT.getFixedSizeInBits());
26551
26552 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
26553 return SDValue();
26554 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
26555 return SDValue();
26556 if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
26557 !(VT.is256BitVector() && Subtarget.hasAVX()) &&
26558 !(VT.is512BitVector() && Subtarget.hasAVX512()))
26559 return SDValue();
26560
26561 SDLoc dl(Op);
26562 unsigned Opc = Op.getOpcode();
26563 unsigned NumElts = VT.getVectorNumElements();
26564
26565 // For 256-bit vectors, we only need the lower (128-bit) half of the input.
26566 // For 512-bit vectors, we need 128-bits or 256-bits.
26567 if (InVT.getSizeInBits() > 128) {
26568 // Input needs to be at least the same number of elements as output, and
26569 // at least 128-bits.
26570 int InSize = InSVT.getSizeInBits() * NumElts;
26571 In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
26572 InVT = In.getSimpleValueType();
26573 }
26574
26575 // SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit results,
26576 // so those are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions
26577 // still need to be handled here for 256/512-bit results.
26578 if (Subtarget.hasInt256()) {
26579 assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
26580
26581 if (InVT.getVectorNumElements() != NumElts)
26582 return DAG.getNode(Op.getOpcode(), dl, VT, In);
26583
26584 // FIXME: Apparently we create inreg operations that could be regular
26585 // extends.
26586 unsigned ExtOpc =
26587 Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SIGN_EXTEND
26588 : ISD::ZERO_EXTEND;
26589 return DAG.getNode(ExtOpc, dl, VT, In);
26590 }
26591
26592 // pre-AVX2 256-bit extensions need to be split into 128-bit instructions.
26593 if (Subtarget.hasAVX()) {
26594 assert(VT.is256BitVector() && "256-bit vector expected");
26595 MVT HalfVT = VT.getHalfNumVectorElementsVT();
26596 int HalfNumElts = HalfVT.getVectorNumElements();
26597
26598 unsigned NumSrcElts = InVT.getVectorNumElements();
26599 SmallVector<int, 16> HiMask(NumSrcElts, SM_SentinelUndef);
26600 for (int i = 0; i != HalfNumElts; ++i)
26601 HiMask[i] = HalfNumElts + i;
26602
26603 SDValue Lo = DAG.getNode(Opc, dl, HalfVT, In);
26604 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, DAG.getUNDEF(InVT), HiMask);
26605 Hi = DAG.getNode(Opc, dl, HalfVT, Hi);
26606 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
26607 }
26608
26609 // We should only get here for sign extend.
26610 assert(Opc == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!");
26611 assert(VT.is128BitVector() && InVT.is128BitVector() && "Unexpected VTs");
26612
26613 // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
26614 SDValue Curr = In;
26615 SDValue SignExt = Curr;
26616
26617 // As SRAI is only available on i16/i32 types, we expand only up to i32
26618 // and handle i64 separately.
26619 if (InVT != MVT::v4i32) {
26620 MVT DestVT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
26621
26622 unsigned DestWidth = DestVT.getScalarSizeInBits();
26623 unsigned Scale = DestWidth / InSVT.getSizeInBits();
26624
26625 unsigned InNumElts = InVT.getVectorNumElements();
26626 unsigned DestElts = DestVT.getVectorNumElements();
26627
26628 // Build a shuffle mask that takes each input element and places it in the
26629 // MSBs of the new element size.
26630 SmallVector<int, 16> Mask(InNumElts, SM_SentinelUndef);
26631 for (unsigned i = 0; i != DestElts; ++i)
26632 Mask[i * Scale + (Scale - 1)] = i;
26633
26634 Curr = DAG.getVectorShuffle(InVT, dl, In, In, Mask);
26635 Curr = DAG.getBitcast(DestVT, Curr);
26636
26637 unsigned SignExtShift = DestWidth - InSVT.getSizeInBits();
26638 SignExt = DAG.getNode(X86ISD::VSRAI, dl, DestVT, Curr,
26639 DAG.getTargetConstant(SignExtShift, dl, MVT::i8));
26640 }
26641
26642 if (VT == MVT::v2i64) {
26643 assert(Curr.getValueType() == MVT::v4i32 && "Unexpected input VT");
26644 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
26645 SDValue Sign = DAG.getSetCC(dl, MVT::v4i32, Zero, Curr, ISD::SETGT);
26646 SignExt = DAG.getVectorShuffle(MVT::v4i32, dl, SignExt, Sign, {0, 4, 1, 5});
26647 SignExt = DAG.getBitcast(VT, SignExt);
26648 }
26649
26650 return SignExt;
26651}
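// Illustrative sketch (not from the listing): the pre-SSE4.1 path above places
// each narrow element into the most significant bits of the wider element and
// arithmetic-shifts it back down, i.e. ordinary scalar sign extension. Assumes
// a two's-complement arithmetic right shift (guaranteed since C++20).
#include <cstdint>
static inline int32_t Sext8To32ViaShift(uint8_t In) {
  int32_t Widened = (int32_t)((uint32_t)In << 24); // element moved into the MSBs
  return Widened >> 24;                            // VSRAI-style shift back down
}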
26652
26653static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
26654 SelectionDAG &DAG) {
26655 MVT VT = Op->getSimpleValueType(0);
26656 SDValue In = Op->getOperand(0);
26657 MVT InVT = In.getSimpleValueType();
26658 SDLoc dl(Op);
26659
26660 if (InVT.getVectorElementType() == MVT::i1)
26661 return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
26662
26663 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
26664 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
26665 "Expected same number of elements");
26666 assert((VT.getVectorElementType() == MVT::i16 ||
26667 VT.getVectorElementType() == MVT::i32 ||
26668 VT.getVectorElementType() == MVT::i64) &&
26669 "Unexpected element type");
26670 assert((InVT.getVectorElementType() == MVT::i8 ||
26671 InVT.getVectorElementType() == MVT::i16 ||
26672 InVT.getVectorElementType() == MVT::i32) &&
26673 "Unexpected element type");
26674
26675 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
26676 assert(InVT == MVT::v32i8 && "Unexpected VT!");
26677 return splitVectorIntUnary(Op, DAG);
26678 }
26679
26680 if (Subtarget.hasInt256())
26681 return Op;
26682
26683 // Optimize vectors in AVX mode
26684 // Sign extend v8i16 to v8i32 and
26685 // v4i32 to v4i64
26686 //
26687 // Divide input vector into two parts
26688 // for v4i32 the high shuffle mask will be {2, 3, -1, -1}
26689 // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
26690 // concat the vectors to original VT
26691 MVT HalfVT = VT.getHalfNumVectorElementsVT();
26692 SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In);
26693
26694 unsigned NumElems = InVT.getVectorNumElements();
26695 SmallVector<int,8> ShufMask(NumElems, -1);
26696 for (unsigned i = 0; i != NumElems/2; ++i)
26697 ShufMask[i] = i + NumElems/2;
26698
26699 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
26700 OpHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, OpHi);
26701
26702 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
26703}
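// Illustrative sketch (hypothetical helper, not from the listing): building the
// high-half shuffle mask used above, e.g. {2, 3, -1, -1} for a v4i32 input,
// where -1 marks an undef lane; the upper elements are then extended separately
// and re-concatenated.
#include <vector>
static std::vector<int> HighHalfShuffleMask(unsigned NumElems) {
  std::vector<int> Mask(NumElems, -1);
  for (unsigned i = 0; i != NumElems / 2; ++i)
    Mask[i] = (int)(i + NumElems / 2);
  return Mask;
}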
26704
26705/// Change a vector store into a pair of half-size vector stores.
26706static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) {
26707 SDValue StoredVal = Store->getValue();
26708 assert((StoredVal.getValueType().is256BitVector() ||
26709 StoredVal.getValueType().is512BitVector()) &&
26710 "Expecting 256/512-bit op");
26711
26712 // Splitting volatile memory ops is not allowed unless the operation was not
26713 // legal to begin with. Assume the input store is legal (this transform is
26714 // only used for targets with AVX). Note: It is possible that we have an
26715 // illegal type like v2i128, and so we could allow splitting a volatile store
26716 // in that case if that is important.
26717 if (!Store->isSimple())
26718 return SDValue();
26719
26720 SDLoc DL(Store);
26721 SDValue Value0, Value1;
26722 std::tie(Value0, Value1) = splitVector(StoredVal, DAG, DL);
26723 unsigned HalfOffset = Value0.getValueType().getStoreSize();
26724 SDValue Ptr0 = Store->getBasePtr();
26725 SDValue Ptr1 =
26726 DAG.getMemBasePlusOffset(Ptr0, TypeSize::Fixed(HalfOffset), DL);
26727 SDValue Ch0 =
26728 DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(),
26729 Store->getOriginalAlign(),
26730 Store->getMemOperand()->getFlags());
26731 SDValue Ch1 = DAG.getStore(Store->getChain(), DL, Value1, Ptr1,
26732 Store->getPointerInfo().getWithOffset(HalfOffset),
26733 Store->getOriginalAlign(),
26734 Store->getMemOperand()->getFlags());
26735 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1);
26736}
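// Illustrative sketch (plain-memory model, not from the listing): for the
// 256-bit case the split above becomes two 128-bit stores at offsets 0 and
// HalfOffset (16 bytes), each reusing the original pointer info, alignment and
// memory-operand flags.
#include <cstdint>
#include <cstring>
static inline void StoreSplit256(uint8_t *Dst, const uint8_t (&Src)[32]) {
  std::memcpy(Dst, Src, 16);           // low half (Value0), offset 0
  std::memcpy(Dst + 16, Src + 16, 16); // high half (Value1), offset HalfOffset
}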
26737
26738 /// Scalarize a vector store, bitcasting to StoreVT to determine the scalar
26739 /// type.
26740static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,
26741 SelectionDAG &DAG) {
26742 SDValue StoredVal = Store->getValue();
26743 assert(StoreVT.is128BitVector() &&
26744 StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op");
26745 StoredVal = DAG.getBitcast(StoreVT, StoredVal);
26746
26747 // Splitting volatile memory ops is not allowed unless the operation was not
26748 // legal to begin with. We are assuming the input op is legal (this transform
26749 // is only used for targets with AVX).
26750 if (!Store->isSimple())
26751 return SDValue();
26752
26753 MVT StoreSVT = StoreVT.getScalarType();
26754 unsigned NumElems = StoreVT.getVectorNumElements();
26755 unsigned ScalarSize = StoreSVT.getStoreSize();
26756
26757 SDLoc DL(Store);
26758 SmallVector<SDValue, 4> Stores;
26759 for (unsigned i = 0; i != NumElems; ++i) {
26760 unsigned Offset = i * ScalarSize;
26761 SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(),
26762 TypeSize::Fixed(Offset), DL);
26763 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal,
26764 DAG.getIntPtrConstant(i, DL));
26765 SDValue Ch = DAG.getStore(Store->getChain(), DL, Scl, Ptr,
26766 Store->getPointerInfo().getWithOffset(Offset),
26767 Store->getOriginalAlign(),
26768 Store->getMemOperand()->getFlags());
26769 Stores.push_back(Ch);
26770 }
26771 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
26772}
26773
26774static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
26775 SelectionDAG &DAG) {
26776 StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
26777 SDLoc dl(St);
26778 SDValue StoredVal = St->getValue();
26779
26780 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
26781 if (StoredVal.getValueType().isVector() &&
26782 StoredVal.getValueType().getVectorElementType() == MVT::i1) {
26783 unsigned NumElts = StoredVal.getValueType().getVectorNumElements();
26784 assert(NumElts <= 8 && "Unexpected VT");
26785 assert(!St->isTruncatingStore() && "Expected non-truncating store");
26786 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
26787 "Expected AVX512F without AVX512DQI");
26788
26789 // We must pad with zeros to ensure the unused bits are stored as zero.
26790 StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
26791 DAG.getUNDEF(MVT::v16i1), StoredVal,
26792 DAG.getIntPtrConstant(0, dl));
26793 StoredVal = DAG.getBitcast(MVT::i16, StoredVal);
26794 StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal);
26795 // Make sure we store zeros in the extra bits.
26796 if (NumElts < 8)
26797 StoredVal = DAG.getZeroExtendInReg(
26798 StoredVal, dl, EVT::getIntegerVT(*DAG.getContext(), NumElts));
26799
26800 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
26801 St->getPointerInfo(), St->getOriginalAlign(),
26802 St->getMemOperand()->getFlags());
26803 }
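// Illustrative sketch (scalar model, not from the listing): the widen + bitcast
// + zero-extend-in-reg above amounts to packing the mask bits into the low bits
// of a byte and forcing the unused high bits to zero before the store.
#include <cstdint>
static inline uint8_t PackMaskByte(const bool (&Bits)[4]) {
  uint8_t Out = 0;
  for (unsigned i = 0; i != 4; ++i)
    Out |= (uint8_t)(Bits[i] ? 1u : 0u) << i; // bits 4..7 stay zero
  return Out;
}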
26804
26805 if (St->isTruncatingStore())
26806 return SDValue();
26807
26808 // If this is a 256-bit store of concatenated ops, we are better off splitting
26809 // that store into two 128-bit stores. This avoids spurious use of 256-bit ops
26810 // and each half can execute independently. Some cores would split the op into
26811 // halves anyway, so the concat (vinsertf128) is purely an extra op.
26812 MVT StoreVT = StoredVal.getSimpleValueType();
26813 if (StoreVT.is256BitVector() ||
26814 ((StoreVT == MVT::v32i16 || StoreVT == MVT::v64i8) &&
26815 !Subtarget.hasBWI())) {
26816 SmallVector<SDValue, 4> CatOps;
26817 if (StoredVal.hasOneUse() &&
26818 collectConcatOps(StoredVal.getNode(), CatOps, DAG))
26819 return splitVectorStore(St, DAG);
26820 return SDValue();
26821 }
26822
26823 if (StoreVT.is32BitVector())
26824 return SDValue();
26825
26826 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26827 assert(StoreVT.is64BitVector() && "Unexpected VT");
26828 assert(TLI.getTypeAction(*DAG.getContext(), StoreVT) ==
26829 TargetLowering::TypeWidenVector &&
26830 "Unexpected type action!");
26831
26832 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StoreVT);
26833 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal,
26834 DAG.getUNDEF(StoreVT));
26835
26836 if (Subtarget.hasSSE2()) {
26837 // Widen the vector, cast to a v2x64 type, extract the single 64-bit element
26838 // and store it.
26839 MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
26840 MVT CastVT = MVT::getVectorVT(StVT, 2);
26841 StoredVal = DAG.getBitcast(CastVT, StoredVal);
26842 StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
26843 DAG.getIntPtrConstant(0, dl));
26844
26845 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
26846 St->getPointerInfo(), St->getOriginalAlign(),
26847 St->getMemOperand()->getFlags());
26848 }
26849 assert(Subtarget.hasSSE1() && "Expected SSE");
26850 SDVTList Tys = DAG.getVTList(MVT::Other);
26851 SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()};
26852 return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, MVT::i64,
26853 St->getMemOperand());
26854}
26855
26856// Lower vector extended loads using a shuffle. If SSSE3 is not available we
26857// may emit an illegal shuffle but the expansion is still better than scalar
26858// code. We generate sext/sext_invec for SEXTLOADs if it's available, otherwise
26859 // we'll emit a shuffle and an arithmetic shift.
26860// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
26861// TODO: It is possible to support ZExt by zeroing the undef values during
26862// the shuffle phase or after the shuffle.
26863static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
26864 SelectionDAG &DAG) {
26865 MVT RegVT = Op.getSimpleValueType();
26866 assert(RegVT.isVector() && "We only custom lower vector loads.");
26867 assert(RegVT.isInteger() &&
26868 "We only custom lower integer vector loads.");
26869
26870 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
26871 SDLoc dl(Ld);
26872
26873 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
26874 if (RegVT.getVectorElementType() == MVT::i1) {
26875 assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending load");
26876 assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
26877 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
26878 "Expected AVX512F without AVX512DQI");
26879
26880 SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
26881 Ld->getPointerInfo(), Ld->getOriginalAlign(),
26882 Ld->getMemOperand()->getFlags());
26883
26884 // Replace chain users with the new chain.
26885 assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!");
26886
26887 SDValue Val = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, NewLd);
26888 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT,
26889 DAG.getBitcast(MVT::v16i1, Val),
26890 DAG.getIntPtrConstant(0, dl));
26891 return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl);
26892 }
26893
26894 return SDValue();
26895}
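Editor's sketch of the "shuffle and an arithmetic shift" SEXTLOAD expansion mentioned in the comment before LowerLoad, modeled on a single lane rather than a vector; the function name is hypothetical and the scalar form is only an illustration of the bit trick, not the DAG lowering itself.

#include <cstdint>

// Place the loaded narrow value in the high half of the wide lane (the
// "shuffle"), then arithmetic-shift it back down so the sign bit replicates.
// Relies on two's-complement arithmetic right shifts, as C++20 guarantees.
int32_t sextLaneViaShift(int16_t Loaded) {
  uint32_t Placed = static_cast<uint32_t>(static_cast<uint16_t>(Loaded)) << 16;
  return static_cast<int32_t>(Placed) >> 16;
}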
26896
26897/// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
26898/// each of which has no other use apart from the AND / OR.
26899static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
26900 Opc = Op.getOpcode();
26901 if (Opc != ISD::OR && Opc != ISD::AND)
26902 return false;
26903 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
26904 Op.getOperand(0).hasOneUse() &&
26905 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
26906 Op.getOperand(1).hasOneUse());
26907}
26908
26909SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
26910 SDValue Chain = Op.getOperand(0);
26911 SDValue Cond = Op.getOperand(1);
26912 SDValue Dest = Op.getOperand(2);
26913 SDLoc dl(Op);
26914
26915 // Bail out when we don't have native compare instructions.
26916 if (Cond.getOpcode() == ISD::SETCC &&
26917 Cond.getOperand(0).getValueType() != MVT::f128 &&
26918 !isSoftFP16(Cond.getOperand(0).getValueType())) {
26919 SDValue LHS = Cond.getOperand(0);
26920 SDValue RHS = Cond.getOperand(1);
26921 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
26922
26923 // Special case for
26924 // setcc([su]{add,sub,mul}o == 0)
26925 // setcc([su]{add,sub,mul}o != 1)
26926 if (ISD::isOverflowIntrOpRes(LHS) &&
26927 (CC == ISD::SETEQ || CC == ISD::SETNE) &&
26928 (isNullConstant(RHS) || isOneConstant(RHS))) {
26929 SDValue Value, Overflow;
26930 X86::CondCode X86Cond;
26931 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, LHS.getValue(0), DAG);
26932
26933 if ((CC == ISD::SETEQ) == isNullConstant(RHS))
26934 X86Cond = X86::GetOppositeBranchCondition(X86Cond);
26935
26936 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
26937 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
26938 Overflow);
26939 }
26940
26941 if (LHS.getSimpleValueType().isInteger()) {
26942 SDValue CCVal;
26943 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, CC, SDLoc(Cond), DAG, CCVal);
26944 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
26945 EFLAGS);
26946 }
26947
26948 if (CC == ISD::SETOEQ) {
26949 // For FCMP_OEQ, we can emit
26950 // two branches instead of an explicit AND instruction with a
26951 // separate test. However, we only do this if this block doesn't
26952 // have a fall-through edge, because this requires an explicit
26953 // jmp when the condition is false.
26954 if (Op.getNode()->hasOneUse()) {
26955 SDNode *User = *Op.getNode()->use_begin();
26956 // Look for an unconditional branch following this conditional branch.
26957 // We need this because we need to reverse the successors in order
26958 // to implement FCMP_OEQ.
26959 if (User->getOpcode() == ISD::BR) {
26960 SDValue FalseBB = User->getOperand(1);
26961 SDNode *NewBR =
26962 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
26963 assert(NewBR == User);
26964 (void)NewBR;
26965 Dest = FalseBB;
26966
26967 SDValue Cmp =
26968 DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
26969 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
26970 Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest,
26971 CCVal, Cmp);
26972 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
26973 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
26974 Cmp);
26975 }
26976 }
26977 } else if (CC == ISD::SETUNE) {
26978 // For FCMP_UNE, we can emit
26979 // two branches instead of an explicit OR instruction with a
26980 // separate test.
26981 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
26982 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
26983 Chain =
26984 DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, Cmp);
26985 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
26986 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
26987 Cmp);
26988 } else {
26989 X86::CondCode X86Cond =
26990 TranslateX86CC(CC, dl, /*IsFP*/ true, LHS, RHS, DAG);
26991 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
26992 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
26993 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
26994 Cmp);
26995 }
26996 }
26997
26998 if (ISD::isOverflowIntrOpRes(Cond)) {
26999 SDValue Value, Overflow;
27000 X86::CondCode X86Cond;
27001 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
27002
27003 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
27004 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
27005 Overflow);
27006 }
27007
27008 // Look past the truncate if the high bits are known zero.
27009 if (isTruncWithZeroHighBitsInput(Cond, DAG))
27010 Cond = Cond.getOperand(0);
27011
27012 EVT CondVT = Cond.getValueType();
27013
27014 // Add an AND with 1 if we don't already have one.
27015 if (!(Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1))))
27016 Cond =
27017 DAG.getNode(ISD::AND, dl, CondVT, Cond, DAG.getConstant(1, dl, CondVT));
27018
27019 SDValue LHS = Cond;
27020 SDValue RHS = DAG.getConstant(0, dl, CondVT);
27021
27022 SDValue CCVal;
27023 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, ISD::SETNE, dl, DAG, CCVal);
27024 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
27025 EFLAGS);
27026}
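Editor's note on the SETOEQ/SETUNE paths above: an ordered-equal or unordered-not-equal float compare needs both the zero/carry style condition and the parity flag, which is why a single X86ISD::FCMP feeds two X86ISD::BRCOND nodes (COND_NE plus COND_P) instead of an explicit AND/OR of setccs. A scalar model of the SETUNE case, purely for illustration (the function name is an assumption, not from the listing):

#include <cmath>

// FCMP_UNE is "unordered or not equal": the COND_P branch covers NaN inputs
// and the COND_NE branch covers ordinary inequality; both jump to the same
// destination block in the lowering above.
bool fcmpUNE(float A, float B) {
  if (std::isnan(A) || std::isnan(B))   // parity set -> unordered
    return true;
  return A != B;                        // not equal
}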
27027
27028// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
27029// Calls to _alloca are needed to probe the stack when allocating more than 4k
27030// bytes in one go. Touching the stack at 4K increments is necessary to ensure
27031// that the guard pages used by the OS virtual memory manager are allocated in
27032// correct sequence.
27033SDValue
27034X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
27035 SelectionDAG &DAG) const {
27036 MachineFunction &MF = DAG.getMachineFunction();
27037 bool SplitStack = MF.shouldSplitStack();
27038 bool EmitStackProbeCall = hasStackProbeSymbol(MF);
27039 bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
27040 SplitStack || EmitStackProbeCall;
27041 SDLoc dl(Op);
27042
27043 // Get the inputs.
27044 SDNode *Node = Op.getNode();
27045 SDValue Chain = Op.getOperand(0);
27046 SDValue Size = Op.getOperand(1);
27047 MaybeAlign Alignment(Op.getConstantOperandVal(2));
27048 EVT VT = Node->getValueType(0);
27049
27050 // Chain the dynamic stack allocation so that it doesn't modify the stack
27051 // pointer when other instructions are using the stack.
27052 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
27053
27054 bool Is64Bit = Subtarget.is64Bit();
27055 MVT SPTy = getPointerTy(DAG.getDataLayout());
27056
27057 SDValue Result;
27058 if (!Lower) {
27059 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27060 Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
27061 assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
27062 " not tell us which reg is the stack pointer!");
27063
27064 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
27065 const Align StackAlign = TFI.getStackAlign();
27066 if (hasInlineStackProbe(MF)) {
27067 MachineRegisterInfo &MRI = MF.getRegInfo();
27068
27069 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
27070 Register Vreg = MRI.createVirtualRegister(AddrRegClass);
27071 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
27072 Result = DAG.getNode(X86ISD::PROBED_ALLOCA, dl, SPTy, Chain,
27073 DAG.getRegister(Vreg, SPTy));
27074 } else {
27075 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
27076 Chain = SP.getValue(1);
27077 Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
27078 }
27079 if (Alignment && *Alignment > StackAlign)
27080 Result =
27081 DAG.getNode(ISD::AND, dl, VT, Result,
27082 DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
27083 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
27084 } else if (SplitStack) {
27085 MachineRegisterInfo &MRI = MF.getRegInfo();
27086
27087 if (Is64Bit) {
27088 // The 64-bit implementation of segmented stacks needs to clobber both r10
27089 // and r11. This makes it impossible to use it along with nested parameters.
27090 const Function &F = MF.getFunction();
27091 for (const auto &A : F.args()) {
27092 if (A.hasNestAttr())
27093 report_fatal_error("Cannot use segmented stacks with functions that "
27094 "have nested arguments.");
27095 }
27096 }
27097
27098 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
27099 Register Vreg = MRI.createVirtualRegister(AddrRegClass);
27100 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
27101 Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
27102 DAG.getRegister(Vreg, SPTy));
27103 } else {
27104 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
27105 Chain = DAG.getNode(X86ISD::DYN_ALLOCA, dl, NodeTys, Chain, Size);
27106 MF.getInfo<X86MachineFunctionInfo>()->setHasDynAlloca(true);
27107
27108 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27109 Register SPReg = RegInfo->getStackRegister();
27110 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
27111 Chain = SP.getValue(1);
27112
27113 if (Alignment) {
27114 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
27115 DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
27116 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
27117 }
27118
27119 Result = SP;
27120 }
27121
27122 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
27123
27124 SDValue Ops[2] = {Result, Chain};
27125 return DAG.getMergeValues(Ops, dl);
27126}
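A minimal sketch of the probing idea described in the comment before LowerDYNAMIC_STACKALLOC; this is only an illustration of why the stack is touched at 4 KiB increments, not the lowering itself (the _alloca/probe symbol does the real work), and the function name is hypothetical.

#include <cstddef>

// Touch each 4 KiB page of a freshly extended stack region in order, so that
// every access faults in at most one OS guard page at a time.
void touchEveryPage(volatile unsigned char *NewRegion, std::size_t Bytes) {
  const std::size_t PageSize = 4096;
  for (std::size_t Off = 0; Off < Bytes; Off += PageSize)
    NewRegion[Off] = 0;
}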
27127
27128SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
27129 MachineFunction &MF = DAG.getMachineFunction();
27130 auto PtrVT = getPointerTy(MF.getDataLayout());
27131 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
27132
27133 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
27134 SDLoc DL(Op);
27135
27136 if (!Subtarget.is64Bit() ||
27137 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
27138 // vastart just stores the address of the VarArgsFrameIndex slot into the
27139 // memory location argument.
27140 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
27141 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
27142 MachinePointerInfo(SV));
27143 }
27144
27145 // __va_list_tag:
27146 // gp_offset (0 - 6 * 8)
27147 // fp_offset (48 - 48 + 8 * 16)
27148 // overflow_arg_area (point to parameters coming in memory).
27149 // reg_save_area
27150 SmallVector<SDValue, 8> MemOps;
27151 SDValue FIN = Op.getOperand(1);
27152 // Store gp_offset
27153 SDValue Store = DAG.getStore(
27154 Op.getOperand(0), DL,
27155 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
27156 MachinePointerInfo(SV));
27157 MemOps.push_back(Store);
27158
27159 // Store fp_offset
27160 FIN = DAG.getMemBasePlusOffset(FIN, TypeSize::Fixed(4), DL);
27161 Store = DAG.getStore(
27162 Op.getOperand(0), DL,
27163 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
27164 MachinePointerInfo(SV, 4));
27165 MemOps.push_back(Store);
27166
27167 // Store ptr to overflow_arg_area
27168 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
27169 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
27170 Store =
27171 DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
27172 MemOps.push_back(Store);
27173
27174 // Store ptr to reg_save_area.
27175 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
27176 Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
27177 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
27178 Store = DAG.getStore(
27179 Op.getOperand(0), DL, RSFIN, FIN,
27180 MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
27181 MemOps.push_back(Store);
27182 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
27183}
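For readers unfamiliar with the record sketched in the __va_list_tag comment above, here is a plain C++ rendering of what LowerVASTART fills in. Field names follow the comment; the +16 placement of reg_save_area assumes the LP64 case (the X32 path in the listing uses +12), and the struct itself is an editor's illustration rather than a type from the source.

#include <cstdint>

// SysV AMD64 va_list record as populated above: gp_offset at +0, fp_offset at
// +4, overflow_arg_area at +8, reg_save_area at +16 (LP64).
struct VAListTag {
  std::uint32_t gp_offset;   // next GP register slot (0..48, step 8)
  std::uint32_t fp_offset;   // next XMM register slot (48..176, step 16)
  void *overflow_arg_area;   // arguments passed on the stack
  void *reg_save_area;       // spilled register arguments
};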
27184
27185SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
27186 assert(Subtarget.is64Bit() &&
27187 "LowerVAARG only handles 64-bit va_arg!");
27188 assert(Op.getNumOperands() == 4);
27189
27190 MachineFunction &MF = DAG.getMachineFunction();
27191 if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
27192 // The Win64 ABI uses char* instead of a structure.
27193 return DAG.expandVAArg(Op.getNode());
27194
27195 SDValue Chain = Op.getOperand(0);
27196 SDValue SrcPtr = Op.getOperand(1);
27197 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
27198 unsigned Align = Op.getConstantOperandVal(3);
27199 SDLoc dl(Op);
27200
27201 EVT ArgVT = Op.getNode()->getValueType(0);
27202 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
27203 uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
27204 uint8_t ArgMode;
27205
27206 // Decide which area this value should be read from.
27207 // TODO: Implement the AMD64 ABI in its entirety. This simple
27208 // selection mechanism works only for the basic types.
27209 assert(ArgVT != MVT::f80 && "va_arg for f80 not yet implemented");
27210 if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
27211 ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
27212 } else {
27213 assert(ArgVT.isInteger() && ArgSize <= 32 /*bytes*/ &&
27214 "Unhandled argument type in LowerVAARG");
27215 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
27216 }
27217
27218 if (ArgMode == 2) {
27219 // Make sure using fp_offset makes sense.
27220 assert(!Subtarget.useSoftFloat() &&
27221 !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
27222 Subtarget.hasSSE1());
27223 }
27224
27225 // Insert VAARG node into the DAG
27226 // VAARG returns two values: Variable Argument Address, Chain
27227 SDValue InstOps[] = {Chain, SrcPtr,
27228 DAG.getTargetConstant(ArgSize, dl, MVT::i32),
27229 DAG.getTargetConstant(ArgMode, dl, MVT::i8),
27230 DAG.getTargetConstant(Align, dl, MVT::i32)};
27231 SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
27232 SDValue VAARG = DAG.getMemIntrinsicNode(
27233 Subtarget.isTarget64BitLP64() ? X86ISD::VAARG_64 : X86ISD::VAARG_X32, dl,
27234 VTs, InstOps, MVT::i64, MachinePointerInfo(SV),
27235 /*Alignment=*/std::nullopt,
27236 MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
27237 Chain = VAARG.getValue(1);
27238
27239 // Load the next argument and return it
27240 return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
27241}
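Editor's summary of the ArgMode selection above, written out as a tiny helper. The helper name is hypothetical and exists only to restate the rule the listing encodes inline with asserts.

// ArgMode 2 reads from the XMM save area via fp_offset; ArgMode 1 reads from
// the GPR save area via gp_offset. Sizes are in bytes, as in the listing.
unsigned selectVAArgMode(bool IsFloatingPoint, unsigned ArgSize) {
  if (IsFloatingPoint && ArgSize <= 16)
    return 2;                 // fp_offset path
  return 1;                   // gp_offset path (integers up to 32 bytes)
}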
27242
27243static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
27244 SelectionDAG &DAG) {
27245 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
27246 // where a va_list is still an i8*.
27247 assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
27248 if (Subtarget.isCallingConvWin64(
27249 DAG.getMachineFunction().getFunction().getCallingConv()))
27250 // Probably a Win64 va_copy.
27251 return DAG.expandVACopy(Op.getNode());
27252
27253 SDValue Chain = Op.getOperand(0);
27254 SDValue DstPtr = Op.getOperand(1);
27255 SDValue SrcPtr = Op.getOperand(2);
27256 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
27257 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
27258 SDLoc DL(Op);
27259
27260 return DAG.getMemcpy(
27261 Chain, DL, DstPtr, SrcPtr,
27262 DAG.getIntPtrConstant(Subtarget.isTarget64BitLP64() ? 24 : 16, DL),
27263 Align(Subtarget.isTarget64BitLP64() ? 8 : 4), /*isVolatile*/ false, false,
27264 false, MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
27265}
27266
27267// Helper to get immediate/variable SSE shift opcode from other shift opcodes.
27268static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable) {
27269 switch (Opc) {
27270 case ISD::SHL:
27271 case X86ISD::VSHL:
27272 case X86ISD::VSHLI:
27273 return IsVariable ? X86ISD::VSHL : X86ISD::VSHLI;
27274 case ISD::SRL:
27275 case X86ISD::VSRL:
27276 case X86ISD::VSRLI:
27277 return IsVariable ? X86ISD::VSRL : X86ISD::VSRLI;
27278 case ISD::SRA:
27279 case X86ISD::VSRA:
27280 case X86ISD::VSRAI:
27281 return IsVariable ? X86ISD::VSRA : X86ISD::VSRAI;
27282 }
27283 llvm_unreachable("Unknown target vector shift node");
27284}
27285
27286/// Handle vector element shifts where the shift amount is a constant.
27287/// Takes immediate version of shift as input.
27288static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
27289 SDValue SrcOp, uint64_t ShiftAmt,
27290 SelectionDAG &DAG) {
27291 MVT ElementType = VT.getVectorElementType();
27292
27293 // Bitcast the source vector to the output type, this is mainly necessary for
27294 // vXi8/vXi64 shifts.
27295 if (VT != SrcOp.getSimpleValueType())
27296 SrcOp = DAG.getBitcast(VT, SrcOp);
27297
27298 // Fold this packed shift into its first operand if ShiftAmt is 0.
27299 if (ShiftAmt == 0)
27300 return SrcOp;
27301
27302 // Check for ShiftAmt >= element width
27303 if (ShiftAmt >= ElementType.getSizeInBits()) {
27304 if (Opc == X86ISD::VSRAI)
27305 ShiftAmt = ElementType.getSizeInBits() - 1;
27306 else
27307 return DAG.getConstant(0, dl, VT);
27308 }
27309
27310 assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
27311 && "Unknown target vector shift-by-constant node");
27312
27313 // Fold this packed vector shift into a build vector if SrcOp is a
27314 // vector of Constants or UNDEFs.
27315 if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
27316 unsigned ShiftOpc;
27317 switch (Opc) {
27318 default: llvm_unreachable("Unknown opcode!");
27319 case X86ISD::VSHLI:
27320 ShiftOpc = ISD::SHL;
27321 break;
27322 case X86ISD::VSRLI:
27323 ShiftOpc = ISD::SRL;
27324 break;
27325 case X86ISD::VSRAI:
27326 ShiftOpc = ISD::SRA;
27327 break;
27328 }
27329
27330 SDValue Amt = DAG.getConstant(ShiftAmt, dl, VT);
27331 if (SDValue C = DAG.FoldConstantArithmetic(ShiftOpc, dl, VT, {SrcOp, Amt}))
27332 return C;
27333 }
27334
27335 return DAG.getNode(Opc, dl, VT, SrcOp,
27336 DAG.getTargetConstant(ShiftAmt, dl, MVT::i8));
27337}
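The out-of-range handling above has a simple scalar analogue (editor's sketch with a 32-bit element assumed for brevity; function names are illustrative): shifts by at least the element width produce zero, except arithmetic right shifts, which are clamped so every result bit is a copy of the sign bit.

#include <cstdint>

// Mirrors the ShiftAmt >= ElementType.getSizeInBits() branch: VSRAI clamps,
// the other shift opcodes fold to zero. Relies on arithmetic right shift of
// signed values, as C++20 guarantees.
int32_t vsraiModel(int32_t X, std::uint64_t ShiftAmt) {
  if (ShiftAmt >= 32)
    ShiftAmt = 31;            // clamp: result becomes all sign bits
  return X >> ShiftAmt;
}

std::uint32_t vsrliModel(std::uint32_t X, std::uint64_t ShiftAmt) {
  return ShiftAmt >= 32 ? 0u : X >> ShiftAmt;
}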
27338
27339/// Handle vector element shifts by a splat shift amount
27340static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
27341 SDValue SrcOp, SDValue ShAmt, int ShAmtIdx,
27342 const X86Subtarget &Subtarget,
27343 SelectionDAG &DAG) {
27344 MVT AmtVT = ShAmt.getSimpleValueType();
27345 assert(AmtVT.isVector() && "Vector shift type mismatch");
27346 assert(0 <= ShAmtIdx && ShAmtIdx < (int)AmtVT.getVectorNumElements() &&
27347 "Illegal vector splat index");
27348
27349 // Move the splat element to the bottom element.
27350 if (ShAmtIdx != 0) {
27351 SmallVector<int> Mask(AmtVT.getVectorNumElements(), -1);
27352 Mask[0] = ShAmtIdx;
27353 ShAmt = DAG.getVectorShuffle(AmtVT, dl, ShAmt, DAG.getUNDEF(AmtVT), Mask);
27354 }
27355
27356 // Peek through any zext node if we can get back to a 128-bit source.
27357 if (AmtVT.getScalarSizeInBits() == 64 &&
27358 (ShAmt.getOpcode() == ISD::ZERO_EXTEND ||
27359 ShAmt.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG) &&
27360 ShAmt.getOperand(0).getValueType().isSimple() &&
27361 ShAmt.getOperand(0).getValueType().is128BitVector()) {
27362 ShAmt = ShAmt.getOperand(0);
27363 AmtVT = ShAmt.getSimpleValueType();
27364 }
27365
27366 // See if we can mask off the upper elements using the existing source node.
27367 // The shift uses the entire lower 64-bits of the amount vector, so no need to
27368 // do this for vXi64 types.
27369 bool IsMasked = false;
27370 if (AmtVT.getScalarSizeInBits() < 64) {
27371 if (ShAmt.getOpcode() == ISD::BUILD_VECTOR ||
27372 ShAmt.getOpcode() == ISD::SCALAR_TO_VECTOR) {
27373 // If the shift amount has come from a scalar, then zero-extend the scalar
27374 // before moving to the vector.
27375 ShAmt = DAG.getZExtOrTrunc(ShAmt.getOperand(0), dl, MVT::i32);
27376 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
27377 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, ShAmt);
27378 AmtVT = MVT::v4i32;
27379 IsMasked = true;
27380 } else if (ShAmt.getOpcode() == ISD::AND) {
27381 // See if the shift amount is already masked (e.g. for rotation modulo),
27382 // then we can zero-extend it by setting all the other mask elements to
27383 // zero.
27384 SmallVector<SDValue> MaskElts(
27385 AmtVT.getVectorNumElements(),
27386 DAG.getConstant(0, dl, AmtVT.getScalarType()));
27387 MaskElts[0] = DAG.getAllOnesConstant(dl, AmtVT.getScalarType());
27388 SDValue Mask = DAG.getBuildVector(AmtVT, dl, MaskElts);
27389 if ((Mask = DAG.FoldConstantArithmetic(ISD::AND, dl, AmtVT,
27390 {ShAmt.getOperand(1), Mask}))) {
27391 ShAmt = DAG.getNode(ISD::AND, dl, AmtVT, ShAmt.getOperand(0), Mask);
27392 IsMasked = true;
27393 }
27394 }
27395 }
27396
27397 // Extract if the shift amount vector is larger than 128-bits.
27398 if (AmtVT.getSizeInBits() > 128) {
27399 ShAmt = extract128BitVector(ShAmt, 0, DAG, dl);
27400 AmtVT = ShAmt.getSimpleValueType();
27401 }
27402
27403 // Zero-extend bottom element to v2i64 vector type, either by extension or
27404 // shuffle masking.
27405 if (!IsMasked && AmtVT.getScalarSizeInBits() < 64) {
27406 if (AmtVT == MVT::v4i32 && (ShAmt.getOpcode() == X86ISD::VBROADCAST ||
27407 ShAmt.getOpcode() == X86ISD::VBROADCAST_LOAD)) {
27408 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, SDLoc(ShAmt), MVT::v4i32, ShAmt);
27409 } else if (Subtarget.hasSSE41()) {
27410 ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
27411 MVT::v2i64, ShAmt);
27412 } else {
27413 SDValue ByteShift = DAG.getTargetConstant(
27414 (128 - AmtVT.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
27415 ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt);
27416 ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
27417 ByteShift);
27418 ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
27419 ByteShift);
27420 }
27421 }
27422
27423 // Change opcode to non-immediate version.
27424 Opc = getTargetVShiftUniformOpcode(Opc, true);
27425
27426 // The return type has to be a 128-bit type with the same element
27427 // type as the input type.
27428 MVT EltVT = VT.getVectorElementType();
27429 MVT ShVT = MVT::getVectorVT(EltVT, 128 / EltVT.getSizeInBits());
27430
27431 ShAmt = DAG.getBitcast(ShVT, ShAmt);
27432 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
27433}
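The VSHLDQ/VSRLDQ fallback above zero-extends the bottom shift-amount element by shifting the whole register left and then logically right by 128 - EltBits bits. A 64-bit scalar model of that trick (editor's sketch; the narrower register width and the function name are assumptions made for brevity):

#include <cstdint>

// Keep only the low EltBits bits, zero-extended, via a shift-up/shift-down
// pair instead of a mask. Valid for 1 <= EltBits <= 64.
std::uint64_t zextLowElement(std::uint64_t Value, unsigned EltBits) {
  const unsigned Shift = 64 - EltBits;   // 64-bit stand-in for the 128-bit register
  return (Value << Shift) >> Shift;      // logical shifts clear the upper bits
}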
27434
27435/// Return Mask with the necessary casting or extending
27436/// for \p Mask according to \p MaskVT when lowering masking intrinsics
27437static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
27438 const X86Subtarget &Subtarget, SelectionDAG &DAG,
27439 const SDLoc &dl) {
27440
27441 if (isAllOnesConstant(Mask))
27442 return DAG.getConstant(1, dl, MaskVT);
27443 if (X86::isZeroNode(Mask))
27444 return DAG.getConstant(0, dl, MaskVT);
27445
27446 assert(MaskVT.bitsLE(Mask.getSimpleValueType()) && "Unexpected mask size!");
27447
27448 if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
27449 assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!");
27450 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
27451 // In 32-bit mode, bitcasting i64 is illegal; extend/split it.
27452 SDValue Lo, Hi;
27453 std::tie(Lo, Hi) = DAG.SplitScalar(Mask, dl, MVT::i32, MVT::i32);
27454 Lo = DAG.getBitcast(MVT::v32i1, Lo);
27455 Hi = DAG.getBitcast(MVT::v32i1, Hi);
27456 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
27457 } else {
27458 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
27459 Mask.getSimpleValueType().getSizeInBits());
27460 // When MaskVT equals v2i1 or v4i1, the low 2 or 4 elements
27461 // are extracted by EXTRACT_SUBVECTOR.
27462 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
27463 DAG.getBitcast(BitcastVT, Mask),
27464 DAG.getIntPtrConstant(0, dl));
27465 }
27466}
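getMaskNode ultimately turns an integer mask into a vector of i1 lanes, where bit i of the scalar controls lane i. A small stand-alone model of that mapping (editor's sketch with plain booleans; the template and its name are not from the listing):

#include <array>
#include <cstdint>

// Expand an integer mask into per-lane booleans: lane I is enabled iff bit I
// of the mask is set, matching the bitcast-to-vXi1 view used above.
// Valid for up to 64 lanes.
template <unsigned NumLanes>
std::array<bool, NumLanes> maskToLanes(std::uint64_t Mask) {
  std::array<bool, NumLanes> Lanes{};
  for (unsigned I = 0; I < NumLanes; ++I)
    Lanes[I] = (Mask >> I) & 1;
  return Lanes;
}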
27467
27468/// Return (and \p Op, \p Mask) for compare instructions or
27469/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
27470/// necessary casting or extending for \p Mask when lowering masking intrinsics
27471static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
27472 SDValue PreservedSrc,
27473 const X86Subtarget &Subtarget,
27474 SelectionDAG &DAG) {
27475 MVT VT = Op.getSimpleValueType();
27476 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
27477 unsigned OpcodeSelect = ISD::VSELECT;
27478 SDLoc dl(Op);
27479
27480 if (isAllOnesConstant(Mask))
27481 return Op;
27482
27483 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27484
27485 if (PreservedSrc.isUndef())
27486 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
27487 return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
27488}
27489
27490/// Creates an SDNode for a predicated scalar operation.
27491/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
27492/// The mask is coming as MVT::i8 and it should be transformed
27493/// to MVT::v1i1 while lowering masking intrinsics.
27494/// The main difference between ScalarMaskingNode and VectorMaskingNode is using
27495/// "X86select" instead of "vselect". We just can't create the "vselect" node
27496/// for a scalar instruction.
27497static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
27498 SDValue PreservedSrc,
27499 const X86Subtarget &Subtarget,
27500 SelectionDAG &DAG) {
27501
27502 if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
    7.1: 'MaskConst' is non-null
    8: Taking true branch
27503 if (MaskConst->getZExtValue() & 0x1)
    9: Assuming the condition is false
    10: Taking false branch
27504 return Op;
27505
27506 MVT VT = Op.getSimpleValueType();
27507 SDLoc dl(Op);
27508
27509 assert(Mask.getValueType() == MVT::i8 && "Unexpect type");
    11: '?' condition is true
27510 SDValue IMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i1,
27511 DAG.getBitcast(MVT::v8i1, Mask),
27512 DAG.getIntPtrConstant(0, dl));
27513 if (Op.getOpcode() == X86ISD::FSETCCM ||
    12: Assuming the condition is false
    15: Taking false branch
27514 Op.getOpcode() == X86ISD::FSETCCM_SAE ||
    13: Assuming the condition is false
27515 Op.getOpcode() == X86ISD::VFPCLASSS)
    14: Assuming the condition is false
27516 return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
27517
27518 if (PreservedSrc.isUndef())
    16: Calling 'SDValue::isUndef'
27519 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
27520 return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
27521}
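Editor's note on the analyzer path through getScalarMaskingNode: the FPCLASSS caller at line 27988 passes a default-constructed SDValue() as PreservedSrc (steps 3-6), and step 16 then calls isUndef() on that value. The sketch below shows why such a call is worth flagging, using hypothetical stand-in types and assuming (as the path suggests) that the default constructor leaves the wrapped node pointer null; it is meant to be read, not executed.

// Hypothetical stand-ins, not the real LLVM classes.
struct FakeNode {
  bool isUndef() const { return false; }
};

struct FakeSDValue {
  FakeNode *Node = nullptr;                        // default construction: no node
  bool isUndef() const { return Node->isUndef(); } // null dereference when Node is null
};

bool demo() {
  FakeSDValue PreservedSrc;        // analogous to the SDValue() argument at 27988
  return PreservedSrc.isUndef();   // the member call the analyzer stops at
}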
27522
27523static int getSEHRegistrationNodeSize(const Function *Fn) {
27524 if (!Fn->hasPersonalityFn())
27525 report_fatal_error(
27526 "querying registration node size for function without personality");
27527 // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
27528 // WinEHStatePass for the full struct definition.
27529 switch (classifyEHPersonality(Fn->getPersonalityFn())) {
27530 case EHPersonality::MSVC_X86SEH: return 24;
27531 case EHPersonality::MSVC_CXX: return 16;
27532 default: break;
27533 }
27534 report_fatal_error(
27535 "can only recover FP for 32-bit MSVC EH personality functions");
27536}
27537
27538/// When the MSVC runtime transfers control to us, either to an outlined
27539/// function or when returning to a parent frame after catching an exception, we
27540/// recover the parent frame pointer by doing arithmetic on the incoming EBP.
27541/// Here's the math:
27542/// RegNodeBase = EntryEBP - RegNodeSize
27543/// ParentFP = RegNodeBase - ParentFrameOffset
27544/// Subtracting RegNodeSize takes us to the offset of the registration node, and
27545/// subtracting the offset (negative on x86) takes us back to the parent FP.
27546static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
27547 SDValue EntryEBP) {
27548 MachineFunction &MF = DAG.getMachineFunction();
27549 SDLoc dl;
27550
27551 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27552 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
27553
27554 // It's possible that the parent function no longer has a personality function
27555 // if the exceptional code was optimized away, in which case we just return
27556 // the incoming EBP.
27557 if (!Fn->hasPersonalityFn())
27558 return EntryEBP;
27559
27560 // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
27561 // registration, or the .set_setframe offset.
27562 MCSymbol *OffsetSym =
27563 MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
27564 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
27565 SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
27566 SDValue ParentFrameOffset =
27567 DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
27568
27569 // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
27570 // prologue to RBP in the parent function.
27571 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
27572 if (Subtarget.is64Bit())
27573 return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
27574
27575 int RegNodeSize = getSEHRegistrationNodeSize(Fn);
27576 // RegNodeBase = EntryEBP - RegNodeSize
27577 // ParentFP = RegNodeBase - ParentFrameOffset
27578 SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
27579 DAG.getConstant(RegNodeSize, dl, PtrVT));
27580 return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
27581}
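The two-line formula in the comment before recoverFramePointer can be checked with a quick worked computation (editor's example with made-up numbers; the helper name is hypothetical): with EntryEBP = 0x1000, RegNodeSize = 24 and ParentFrameOffset = -16, RegNodeBase is 0xFE8 and the recovered parent FP is 0xFF8.

#include <cstdint>

// RegNodeBase = EntryEBP - RegNodeSize; ParentFP = RegNodeBase - ParentFrameOffset.
// The offset is negative on x86, so subtracting it moves back up the stack.
std::uintptr_t recoverParentFP(std::uintptr_t EntryEBP, int RegNodeSize,
                               std::intptr_t ParentFrameOffset) {
  std::uintptr_t RegNodeBase = EntryEBP - RegNodeSize;
  return RegNodeBase - ParentFrameOffset;
}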
27582
27583SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
27584 SelectionDAG &DAG) const {
27585 // Helper to detect if the operand is CUR_DIRECTION rounding mode.
27586 auto isRoundModeCurDirection = [](SDValue Rnd) {
27587 if (auto *C = dyn_cast<ConstantSDNode>(Rnd))
27588 return C->getAPIntValue() == X86::STATIC_ROUNDING::CUR_DIRECTION;
27589
27590 return false;
27591 };
27592 auto isRoundModeSAE = [](SDValue Rnd) {
27593 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
27594 unsigned RC = C->getZExtValue();
27595 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
27596 // Clear the NO_EXC bit and check remaining bits.
27597 RC ^= X86::STATIC_ROUNDING::NO_EXC;
27598 // As a convenience we allow no other bits or explicitly
27599 // current direction.
27600 return RC == 0 || RC == X86::STATIC_ROUNDING::CUR_DIRECTION;
27601 }
27602 }
27603
27604 return false;
27605 };
27606 auto isRoundModeSAEToX = [](SDValue Rnd, unsigned &RC) {
27607 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
27608 RC = C->getZExtValue();
27609 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
27610 // Clear the NO_EXC bit and check remaining bits.
27611 RC ^= X86::STATIC_ROUNDING::NO_EXC;
27612 return RC == X86::STATIC_ROUNDING::TO_NEAREST_INT ||
27613 RC == X86::STATIC_ROUNDING::TO_NEG_INF ||
27614 RC == X86::STATIC_ROUNDING::TO_POS_INF ||
27615 RC == X86::STATIC_ROUNDING::TO_ZERO;
27616 }
27617 }
27618
27619 return false;
27620 };
27621
27622 SDLoc dl(Op);
27623 unsigned IntNo = Op.getConstantOperandVal(0);
27624 MVT VT = Op.getSimpleValueType();
27625 const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
27626
27627 // Propagate flags from original node to transformed node(s).
27628 SelectionDAG::FlagInserter FlagsInserter(DAG, Op->getFlags());
27629
27630 if (IntrData) {
    0.1: 'IntrData' is non-null
    1: Taking true branch
27631 switch(IntrData->Type) {
    2: Control jumps to 'case FPCLASSS:' at line 27983
27632 case INTR_TYPE_1OP: {
27633 // We specify 2 possible opcodes for intrinsics with rounding modes.
27634 // First, we check if the intrinsic may have non-default rounding mode,
27635 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
27636 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
27637 if (IntrWithRoundingModeOpcode != 0) {
27638 SDValue Rnd = Op.getOperand(2);
27639 unsigned RC = 0;
27640 if (isRoundModeSAEToX(Rnd, RC))
27641 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
27642 Op.getOperand(1),
27643 DAG.getTargetConstant(RC, dl, MVT::i32));
27644 if (!isRoundModeCurDirection(Rnd))
27645 return SDValue();
27646 }
27647 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
27648 Op.getOperand(1));
27649 }
27650 case INTR_TYPE_1OP_SAE: {
27651 SDValue Sae = Op.getOperand(2);
27652
27653 unsigned Opc;
27654 if (isRoundModeCurDirection(Sae))
27655 Opc = IntrData->Opc0;
27656 else if (isRoundModeSAE(Sae))
27657 Opc = IntrData->Opc1;
27658 else
27659 return SDValue();
27660
27661 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1));
27662 }
27663 case INTR_TYPE_2OP: {
27664 SDValue Src2 = Op.getOperand(2);
27665
27666 // We specify 2 possible opcodes for intrinsics with rounding modes.
27667 // First, we check if the intrinsic may have non-default rounding mode,
27668 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
27669 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
27670 if (IntrWithRoundingModeOpcode != 0) {
27671 SDValue Rnd = Op.getOperand(3);
27672 unsigned RC = 0;
27673 if (isRoundModeSAEToX(Rnd, RC))
27674 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
27675 Op.getOperand(1), Src2,
27676 DAG.getTargetConstant(RC, dl, MVT::i32));
27677 if (!isRoundModeCurDirection(Rnd))
27678 return SDValue();
27679 }
27680
27681 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
27682 Op.getOperand(1), Src2);
27683 }
27684 case INTR_TYPE_2OP_SAE: {
27685 SDValue Sae = Op.getOperand(3);
27686
27687 unsigned Opc;
27688 if (isRoundModeCurDirection(Sae))
27689 Opc = IntrData->Opc0;
27690 else if (isRoundModeSAE(Sae))
27691 Opc = IntrData->Opc1;
27692 else
27693 return SDValue();
27694
27695 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
27696 Op.getOperand(2));
27697 }
27698 case INTR_TYPE_3OP:
27699 case INTR_TYPE_3OP_IMM8: {
27700 SDValue Src1 = Op.getOperand(1);
27701 SDValue Src2 = Op.getOperand(2);
27702 SDValue Src3 = Op.getOperand(3);
27703
27704 if (IntrData->Type == INTR_TYPE_3OP_IMM8 &&
27705 Src3.getValueType() != MVT::i8) {
27706 Src3 = DAG.getTargetConstant(
27707 cast<ConstantSDNode>(Src3)->getZExtValue() & 0xff, dl, MVT::i8);
27708 }
27709
27710 // We specify 2 possible opcodes for intrinsics with rounding modes.
27711 // First, we check if the intrinsic may have non-default rounding mode,
27712 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
27713 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
27714 if (IntrWithRoundingModeOpcode != 0) {
27715 SDValue Rnd = Op.getOperand(4);
27716 unsigned RC = 0;
27717 if (isRoundModeSAEToX(Rnd, RC))
27718 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
27719 Src1, Src2, Src3,
27720 DAG.getTargetConstant(RC, dl, MVT::i32));
27721 if (!isRoundModeCurDirection(Rnd))
27722 return SDValue();
27723 }
27724
27725 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
27726 {Src1, Src2, Src3});
27727 }
27728 case INTR_TYPE_4OP_IMM8: {
27729 assert(Op.getOperand(4)->getOpcode() == ISD::TargetConstant);
27730 SDValue Src4 = Op.getOperand(4);
27731 if (Src4.getValueType() != MVT::i8) {
27732 Src4 = DAG.getTargetConstant(
27733 cast<ConstantSDNode>(Src4)->getZExtValue() & 0xff, dl, MVT::i8);
27734 }
27735
27736 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
27737 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
27738 Src4);
27739 }
27740 case INTR_TYPE_1OP_MASK: {
27741 SDValue Src = Op.getOperand(1);
27742 SDValue PassThru = Op.getOperand(2);
27743 SDValue Mask = Op.getOperand(3);
27744 // We add rounding mode to the Node when
27745 // - RC Opcode is specified and
27746 // - RC is not "current direction".
27747 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
27748 if (IntrWithRoundingModeOpcode != 0) {
27749 SDValue Rnd = Op.getOperand(4);
27750 unsigned RC = 0;
27751 if (isRoundModeSAEToX(Rnd, RC))
27752 return getVectorMaskingNode(
27753 DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
27754 Src, DAG.getTargetConstant(RC, dl, MVT::i32)),
27755 Mask, PassThru, Subtarget, DAG);
27756 if (!isRoundModeCurDirection(Rnd))
27757 return SDValue();
27758 }
27759 return getVectorMaskingNode(
27760 DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru,
27761 Subtarget, DAG);
27762 }
27763 case INTR_TYPE_1OP_MASK_SAE: {
27764 SDValue Src = Op.getOperand(1);
27765 SDValue PassThru = Op.getOperand(2);
27766 SDValue Mask = Op.getOperand(3);
27767 SDValue Rnd = Op.getOperand(4);
27768
27769 unsigned Opc;
27770 if (isRoundModeCurDirection(Rnd))
27771 Opc = IntrData->Opc0;
27772 else if (isRoundModeSAE(Rnd))
27773 Opc = IntrData->Opc1;
27774 else
27775 return SDValue();
27776
27777 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src), Mask, PassThru,
27778 Subtarget, DAG);
27779 }
27780 case INTR_TYPE_SCALAR_MASK: {
27781 SDValue Src1 = Op.getOperand(1);
27782 SDValue Src2 = Op.getOperand(2);
27783 SDValue passThru = Op.getOperand(3);
27784 SDValue Mask = Op.getOperand(4);
27785 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
27786 // There are 2 kinds of intrinsics in this group:
27787 // (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands
27788 // (2) With rounding mode and sae - 7 operands.
27789 bool HasRounding = IntrWithRoundingModeOpcode != 0;
27790 if (Op.getNumOperands() == (5U + HasRounding)) {
27791 if (HasRounding) {
27792 SDValue Rnd = Op.getOperand(5);
27793 unsigned RC = 0;
27794 if (isRoundModeSAEToX(Rnd, RC))
27795 return getScalarMaskingNode(
27796 DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2,
27797 DAG.getTargetConstant(RC, dl, MVT::i32)),
27798 Mask, passThru, Subtarget, DAG);
27799 if (!isRoundModeCurDirection(Rnd))
27800 return SDValue();
27801 }
27802 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
27803 Src2),
27804 Mask, passThru, Subtarget, DAG);
27805 }
27806
27807 assert(Op.getNumOperands() == (6U + HasRounding) &&
27808 "Unexpected intrinsic form");
27809 SDValue RoundingMode = Op.getOperand(5);
27810 unsigned Opc = IntrData->Opc0;
27811 if (HasRounding) {
27812 SDValue Sae = Op.getOperand(6);
27813 if (isRoundModeSAE(Sae))
27814 Opc = IntrWithRoundingModeOpcode;
27815 else if (!isRoundModeCurDirection(Sae))
27816 return SDValue();
27817 }
27818 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1,
27819 Src2, RoundingMode),
27820 Mask, passThru, Subtarget, DAG);
27821 }
27822 case INTR_TYPE_SCALAR_MASK_RND: {
27823 SDValue Src1 = Op.getOperand(1);
27824 SDValue Src2 = Op.getOperand(2);
27825 SDValue passThru = Op.getOperand(3);
27826 SDValue Mask = Op.getOperand(4);
27827 SDValue Rnd = Op.getOperand(5);
27828
27829 SDValue NewOp;
27830 unsigned RC = 0;
27831 if (isRoundModeCurDirection(Rnd))
27832 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
27833 else if (isRoundModeSAEToX(Rnd, RC))
27834 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
27835 DAG.getTargetConstant(RC, dl, MVT::i32));
27836 else
27837 return SDValue();
27838
27839 return getScalarMaskingNode(NewOp, Mask, passThru, Subtarget, DAG);
27840 }
27841 case INTR_TYPE_SCALAR_MASK_SAE: {
27842 SDValue Src1 = Op.getOperand(1);
27843 SDValue Src2 = Op.getOperand(2);
27844 SDValue passThru = Op.getOperand(3);
27845 SDValue Mask = Op.getOperand(4);
27846 SDValue Sae = Op.getOperand(5);
27847 unsigned Opc;
27848 if (isRoundModeCurDirection(Sae))
27849 Opc = IntrData->Opc0;
27850 else if (isRoundModeSAE(Sae))
27851 Opc = IntrData->Opc1;
27852 else
27853 return SDValue();
27854
27855 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
27856 Mask, passThru, Subtarget, DAG);
27857 }
27858 case INTR_TYPE_2OP_MASK: {
27859 SDValue Src1 = Op.getOperand(1);
27860 SDValue Src2 = Op.getOperand(2);
27861 SDValue PassThru = Op.getOperand(3);
27862 SDValue Mask = Op.getOperand(4);
27863 SDValue NewOp;
27864 if (IntrData->Opc1 != 0) {
27865 SDValue Rnd = Op.getOperand(5);
27866 unsigned RC = 0;
27867 if (isRoundModeSAEToX(Rnd, RC))
27868 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
27869 DAG.getTargetConstant(RC, dl, MVT::i32));
27870 else if (!isRoundModeCurDirection(Rnd))
27871 return SDValue();
27872 }
27873 if (!NewOp)
27874 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
27875 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
27876 }
27877 case INTR_TYPE_2OP_MASK_SAE: {
27878 SDValue Src1 = Op.getOperand(1);
27879 SDValue Src2 = Op.getOperand(2);
27880 SDValue PassThru = Op.getOperand(3);
27881 SDValue Mask = Op.getOperand(4);
27882
27883 unsigned Opc = IntrData->Opc0;
27884 if (IntrData->Opc1 != 0) {
27885 SDValue Sae = Op.getOperand(5);
27886 if (isRoundModeSAE(Sae))
27887 Opc = IntrData->Opc1;
27888 else if (!isRoundModeCurDirection(Sae))
27889 return SDValue();
27890 }
27891
27892 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
27893 Mask, PassThru, Subtarget, DAG);
27894 }
27895 case INTR_TYPE_3OP_SCALAR_MASK_SAE: {
27896 SDValue Src1 = Op.getOperand(1);
27897 SDValue Src2 = Op.getOperand(2);
27898 SDValue Src3 = Op.getOperand(3);
27899 SDValue PassThru = Op.getOperand(4);
27900 SDValue Mask = Op.getOperand(5);
27901 SDValue Sae = Op.getOperand(6);
27902 unsigned Opc;
27903 if (isRoundModeCurDirection(Sae))
27904 Opc = IntrData->Opc0;
27905 else if (isRoundModeSAE(Sae))
27906 Opc = IntrData->Opc1;
27907 else
27908 return SDValue();
27909
27910 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
27911 Mask, PassThru, Subtarget, DAG);
27912 }
27913 case INTR_TYPE_3OP_MASK_SAE: {
27914 SDValue Src1 = Op.getOperand(1);
27915 SDValue Src2 = Op.getOperand(2);
27916 SDValue Src3 = Op.getOperand(3);
27917 SDValue PassThru = Op.getOperand(4);
27918 SDValue Mask = Op.getOperand(5);
27919
27920 unsigned Opc = IntrData->Opc0;
27921 if (IntrData->Opc1 != 0) {
27922 SDValue Sae = Op.getOperand(6);
27923 if (isRoundModeSAE(Sae))
27924 Opc = IntrData->Opc1;
27925 else if (!isRoundModeCurDirection(Sae))
27926 return SDValue();
27927 }
27928 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
27929 Mask, PassThru, Subtarget, DAG);
27930 }
27931 case BLENDV: {
27932 SDValue Src1 = Op.getOperand(1);
27933 SDValue Src2 = Op.getOperand(2);
27934 SDValue Src3 = Op.getOperand(3);
27935
27936 EVT MaskVT = Src3.getValueType().changeVectorElementTypeToInteger();
27937 Src3 = DAG.getBitcast(MaskVT, Src3);
27938
27939 // Reverse the operands to match VSELECT order.
27940 return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1);
27941 }
27942 case VPERM_2OP : {
27943 SDValue Src1 = Op.getOperand(1);
27944 SDValue Src2 = Op.getOperand(2);
27945
27946 // Swap Src1 and Src2 in the node creation
27947 return DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1);
27948 }
27949 case CFMA_OP_MASKZ:
27950 case CFMA_OP_MASK: {
27951 SDValue Src1 = Op.getOperand(1);
27952 SDValue Src2 = Op.getOperand(2);
27953 SDValue Src3 = Op.getOperand(3);
27954 SDValue Mask = Op.getOperand(4);
27955 MVT VT = Op.getSimpleValueType();
27956
27957 SDValue PassThru = Src3;
27958 if (IntrData->Type == CFMA_OP_MASKZ)
27959 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
27960
27961 // We add rounding mode to the Node when
27962 // - RC Opcode is specified and
27963 // - RC is not "current direction".
27964 SDValue NewOp;
27965 if (IntrData->Opc1 != 0) {
27966 SDValue Rnd = Op.getOperand(5);
27967 unsigned RC = 0;
27968 if (isRoundModeSAEToX(Rnd, RC))
27969 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2, Src3,
27970 DAG.getTargetConstant(RC, dl, MVT::i32));
27971 else if (!isRoundModeCurDirection(Rnd))
27972 return SDValue();
27973 }
27974 if (!NewOp)
27975 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Src3);
27976 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
27977 }
27978 case IFMA_OP:
27979 // NOTE: We need to swizzle the operands to pass the multiply operands
27980 // first.
27981 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
27982 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
27983 case FPCLASSS: {
27984 SDValue Src1 = Op.getOperand(1);
27985 SDValue Imm = Op.getOperand(2);
27986 SDValue Mask = Op.getOperand(3);
27987 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
27988 SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
    3: Calling defaulted default constructor for 'SDValue'
    6: Returning from default constructor for 'SDValue'
    7: Calling 'getScalarMaskingNode'
27989 Subtarget, DAG);
27990 // Need to fill with zeros to ensure the bitcast will produce zeroes
27991 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
27992 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
27993 DAG.getConstant(0, dl, MVT::v8i1),
27994 FPclassMask, DAG.getIntPtrConstant(0, dl));
27995 return DAG.getBitcast(MVT::i8, Ins);
27996 }
27997
27998 case CMP_MASK_CC: {
27999 MVT MaskVT = Op.getSimpleValueType();
28000 SDValue CC = Op.getOperand(3);
28001 SDValue Mask = Op.getOperand(4);
28002 // We specify 2 possible opcodes for intrinsics with rounding modes.
28003 // First, we check if the intrinsic may have non-default rounding mode,
28004 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
28005 if (IntrData->Opc1 != 0) {
28006 SDValue Sae = Op.getOperand(5);
28007 if (isRoundModeSAE(Sae))
28008 return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
28009 Op.getOperand(2), CC, Mask, Sae);
28010 if (!isRoundModeCurDirection(Sae))
28011 return SDValue();
28012 }
28013 //default rounding mode
28014 return DAG.getNode(IntrData->Opc0, dl, MaskVT,
28015 {Op.getOperand(1), Op.getOperand(2), CC, Mask});
28016 }
28017 case CMP_MASK_SCALAR_CC: {
28018 SDValue Src1 = Op.getOperand(1);
28019 SDValue Src2 = Op.getOperand(2);
28020 SDValue CC = Op.getOperand(3);
28021 SDValue Mask = Op.getOperand(4);
28022
28023 SDValue Cmp;
28024 if (IntrData->Opc1 != 0) {
28025 SDValue Sae = Op.getOperand(5);
28026 if (isRoundModeSAE(Sae))
28027 Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Sae);
28028 else if (!isRoundModeCurDirection(Sae))
28029 return SDValue();
28030 }
28031 //default rounding mode
28032 if (!Cmp.getNode())
28033 Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
28034
28035 SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
28036 Subtarget, DAG);
28037 // Need to fill with zeros to ensure the bitcast will produce zeroes
28038 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
28039 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
28040 DAG.getConstant(0, dl, MVT::v8i1),
28041 CmpMask, DAG.getIntPtrConstant(0, dl));
28042 return DAG.getBitcast(MVT::i8, Ins);
28043 }
28044 case COMI: { // Comparison intrinsics
28045 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
28046 SDValue LHS = Op.getOperand(1);
28047 SDValue RHS = Op.getOperand(2);
28048 // Some conditions require the operands to be swapped.
28049 if (CC == ISD::SETLT || CC == ISD::SETLE)
28050 std::swap(LHS, RHS);
28051
28052 SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
28053 SDValue SetCC;
28054 switch (CC) {
28055 case ISD::SETEQ: { // (ZF = 0 and PF = 0)
28056 SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
28057 SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
28058 SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
28059 break;
28060 }
28061 case ISD::SETNE: { // (ZF = 1 or PF = 1)
28062 SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
28063 SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
28064 SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
28065 break;
28066 }
28067 case ISD::SETGT: // (CF = 0 and ZF = 0)
28068 case ISD::SETLT: { // Condition opposite to GT. Operands swapped above.
28069 SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
28070 break;
28071 }
28072 case ISD::SETGE: // CF = 0
28073 case ISD::SETLE: // Condition opposite to GE. Operands swapped above.
28074 SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
28075 break;
28076 default:
28077 llvm_unreachable("Unexpected illegal condition!")::llvm::llvm_unreachable_internal("Unexpected illegal condition!"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 28077)
;
28078 }
28079 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
28080 }
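Editorial sketch (not part of the analysed file): the COMI lowering above composes EFLAGS conditions rather than emitting one setcc. A standalone scalar model, with the flag settings assumed from the COMISS/COMISD documentation and 'comi' a hypothetical helper:

#include <cmath>

struct ComiFlags { bool zf, pf, cf; };

static ComiFlags comi(double lhs, double rhs) {
  if (std::isnan(lhs) || std::isnan(rhs)) return {true, true, true};  // unordered
  if (lhs > rhs) return {false, false, false};
  if (lhs < rhs) return {false, false, true};
  return {true, false, false};                                        // equal
}

// SETEQ as lowered above: COND_E and COND_NP, so NaN compares unequal.
static bool comiEq(double a, double b) { ComiFlags f = comi(a, b); return f.zf && !f.pf; }
// SETGT (or SETLT with the operands swapped above): COND_A, i.e. CF and ZF both clear.
static bool comiGt(double a, double b) { ComiFlags f = comi(a, b); return !f.cf && !f.zf; }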
28081 case COMI_RM: { // Comparison intrinsics with Sae
28082 SDValue LHS = Op.getOperand(1);
28083 SDValue RHS = Op.getOperand(2);
28084 unsigned CondVal = Op.getConstantOperandVal(3);
28085 SDValue Sae = Op.getOperand(4);
28086
28087 SDValue FCmp;
28088 if (isRoundModeCurDirection(Sae))
28089 FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
28090 DAG.getTargetConstant(CondVal, dl, MVT::i8));
28091 else if (isRoundModeSAE(Sae))
28092 FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS,
28093 DAG.getTargetConstant(CondVal, dl, MVT::i8), Sae);
28094 else
28095 return SDValue();
28096 // Need to fill with zeros to ensure the bitcast will produce zeroes
28097 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
28098 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
28099 DAG.getConstant(0, dl, MVT::v16i1),
28100 FCmp, DAG.getIntPtrConstant(0, dl));
28101 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32,
28102 DAG.getBitcast(MVT::i16, Ins));
28103 }
28104 case VSHIFT: {
28105 SDValue SrcOp = Op.getOperand(1);
28106 SDValue ShAmt = Op.getOperand(2);
28107 assert(ShAmt.getValueType() == MVT::i32 &&
28108 "Unexpected VSHIFT amount type");
28109
28110 // Catch shift-by-constant.
28111 if (auto *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
28112 return getTargetVShiftByConstNode(IntrData->Opc0, dl,
28113 Op.getSimpleValueType(), SrcOp,
28114 CShAmt->getZExtValue(), DAG);
28115
28116 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
28117 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
28118 SrcOp, ShAmt, 0, Subtarget, DAG);
28119 }
28120 case COMPRESS_EXPAND_IN_REG: {
28121 SDValue Mask = Op.getOperand(3);
28122 SDValue DataToCompress = Op.getOperand(1);
28123 SDValue PassThru = Op.getOperand(2);
28124 if (ISD::isBuildVectorAllOnes(Mask.getNode())) // return data as is
28125 return Op.getOperand(1);
28126
28127 // Avoid false dependency.
28128 if (PassThru.isUndef())
28129 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
28130
28131 return DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress, PassThru,
28132 Mask);
28133 }
28134 case FIXUPIMM:
28135 case FIXUPIMM_MASKZ: {
28136 SDValue Src1 = Op.getOperand(1);
28137 SDValue Src2 = Op.getOperand(2);
28138 SDValue Src3 = Op.getOperand(3);
28139 SDValue Imm = Op.getOperand(4);
28140 SDValue Mask = Op.getOperand(5);
28141 SDValue Passthru = (IntrData->Type == FIXUPIMM)
28142 ? Src1
28143 : getZeroVector(VT, Subtarget, DAG, dl);
28144
28145 unsigned Opc = IntrData->Opc0;
28146 if (IntrData->Opc1 != 0) {
28147 SDValue Sae = Op.getOperand(6);
28148 if (isRoundModeSAE(Sae))
28149 Opc = IntrData->Opc1;
28150 else if (!isRoundModeCurDirection(Sae))
28151 return SDValue();
28152 }
28153
28154 SDValue FixupImm = DAG.getNode(Opc, dl, VT, Src1, Src2, Src3, Imm);
28155
28156 if (Opc == X86ISD::VFIXUPIMM || Opc == X86ISD::VFIXUPIMM_SAE)
28157 return getVectorMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
28158
28159 return getScalarMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
28160 }
28161 case ROUNDP: {
28162 assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
28163 // Clear the upper bits of the rounding immediate so that the legacy
28164 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
28165 auto Round = cast<ConstantSDNode>(Op.getOperand(2));
28166 SDValue RoundingMode =
28167 DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);
28168 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
28169 Op.getOperand(1), RoundingMode);
28170 }
28171 case ROUNDS: {
28172 assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
28173 // Clear the upper bits of the rounding immediate so that the legacy
28174 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
28175 auto Round = cast<ConstantSDNode>(Op.getOperand(3));
28176 SDValue RoundingMode =
28177 DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);
28178 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
28179 Op.getOperand(1), Op.getOperand(2), RoundingMode);
28180 }
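Editorial sketch (not LLVM code): the 0xf mask in ROUNDP/ROUNDS relies on the VRNDSCALE immediate layout, where the low four bits carry the rounding control and, by assumption from the instruction description, the upper bits select a scaling factor the legacy intrinsics must never enable.

// Keep only the rounding-control bits of a legacy ROUND* immediate so it
// cannot select VRNDSCALE's scaling behaviour.
inline unsigned legacyRoundImmToVRndScale(unsigned imm) {
  return imm & 0xf;
}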
28181 case BEXTRI: {
28182 assert(IntrData->Opc0 == X86ISD::BEXTRI && "Unexpected opcode");
28183
28184 uint64_t Imm = Op.getConstantOperandVal(2);
28185 SDValue Control = DAG.getTargetConstant(Imm & 0xffff, dl,
28186 Op.getValueType());
28187 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
28188 Op.getOperand(1), Control);
28189 }
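Editorial sketch of the BEXTR semantics the masked 16-bit control feeds, taken from the instruction description rather than from this file: bits 7:0 give the start position and bits 15:8 the extract length, which is why the immediate is masked with 0xffff above.

#include <cstdint>

uint64_t bextrModel(uint64_t src, uint16_t control) {
  unsigned start = control & 0xff;         // bits 7:0
  unsigned len   = (control >> 8) & 0xff;  // bits 15:8
  if (start >= 64 || len == 0)
    return 0;
  uint64_t shifted = src >> start;
  return len >= 64 ? shifted : (shifted & ((uint64_t(1) << len) - 1));
}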
28190 // ADC/ADCX/SBB
28191 case ADX: {
28192 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
28193 SDVTList VTs = DAG.getVTList(Op.getOperand(2).getValueType(), MVT::i32);
28194
28195 SDValue Res;
28196 // If the carry in is zero, then we should just use ADD/SUB instead of
28197 // ADC/SBB.
28198 if (isNullConstant(Op.getOperand(1))) {
28199 Res = DAG.getNode(IntrData->Opc1, dl, VTs, Op.getOperand(2),
28200 Op.getOperand(3));
28201 } else {
28202 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(1),
28203 DAG.getConstant(-1, dl, MVT::i8));
28204 Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(2),
28205 Op.getOperand(3), GenCF.getValue(1));
28206 }
28207 SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
28208 SDValue Results[] = { SetCC, Res };
28209 return DAG.getMergeValues(Results, dl);
28210 }
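Editorial sketch (illustrative names, not LLVM APIs): the ADX lowering materialises the incoming carry by adding -1 to the carry byte, which sets CF whenever the byte is non-zero. The add-with-carry result it ultimately computes is:

#include <cstdint>

// Returns the carry-out; *out receives a + b + carryIn (mod 2^64).
uint8_t addCarry64(uint8_t carryIn, uint64_t a, uint64_t b, uint64_t *out) {
  uint64_t s1 = a + b;
  uint8_t c1 = s1 < a;
  uint64_t s2 = s1 + (carryIn ? 1 : 0);
  uint8_t c2 = s2 < s1;
  *out = s2;
  return c1 | c2;
}

When the carry-in is a known zero this degenerates to a plain add, which is why the code above switches to ADD/SUB in that case.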
28211 case CVTPD2PS_MASK:
28212 case CVTPD2DQ_MASK:
28213 case CVTQQ2PS_MASK:
28214 case TRUNCATE_TO_REG: {
28215 SDValue Src = Op.getOperand(1);
28216 SDValue PassThru = Op.getOperand(2);
28217 SDValue Mask = Op.getOperand(3);
28218
28219 if (isAllOnesConstant(Mask))
28220 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
28221
28222 MVT SrcVT = Src.getSimpleValueType();
28223 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
28224 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28225 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
28226 {Src, PassThru, Mask});
28227 }
28228 case CVTPS2PH_MASK: {
28229 SDValue Src = Op.getOperand(1);
28230 SDValue Rnd = Op.getOperand(2);
28231 SDValue PassThru = Op.getOperand(3);
28232 SDValue Mask = Op.getOperand(4);
28233
28234 unsigned RC = 0;
28235 unsigned Opc = IntrData->Opc0;
28236 bool SAE = Src.getValueType().is512BitVector() &&
28237 (isRoundModeSAEToX(Rnd, RC) || isRoundModeSAE(Rnd));
28238 if (SAE) {
28239 Opc = X86ISD::CVTPS2PH_SAE;
28240 Rnd = DAG.getTargetConstant(RC, dl, MVT::i32);
28241 }
28242
28243 if (isAllOnesConstant(Mask))
28244 return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd);
28245
28246 if (SAE)
28247 Opc = X86ISD::MCVTPS2PH_SAE;
28248 else
28249 Opc = IntrData->Opc1;
28250 MVT SrcVT = Src.getSimpleValueType();
28251 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
28252 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28253 return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd, PassThru, Mask);
28254 }
28255 case CVTNEPS2BF16_MASK: {
28256 SDValue Src = Op.getOperand(1);
28257 SDValue PassThru = Op.getOperand(2);
28258 SDValue Mask = Op.getOperand(3);
28259
28260 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
28261 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
28262
28263 // Break false dependency.
28264 if (PassThru.isUndef())
28265 PassThru = DAG.getConstant(0, dl, PassThru.getValueType());
28266
28267 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
28268 Mask);
28269 }
28270 default:
28271 break;
28272 }
28273 }
28274
28275 switch (IntNo) {
28276 default: return SDValue(); // Don't custom lower most intrinsics.
28277
28278 // ptest and testp intrinsics. The intrinsic these come from are designed to
28279 // return an integer value, not just an instruction so lower it to the ptest
28280 // or testp pattern and a setcc for the result.
28281 case Intrinsic::x86_avx512_ktestc_b:
28282 case Intrinsic::x86_avx512_ktestc_w:
28283 case Intrinsic::x86_avx512_ktestc_d:
28284 case Intrinsic::x86_avx512_ktestc_q:
28285 case Intrinsic::x86_avx512_ktestz_b:
28286 case Intrinsic::x86_avx512_ktestz_w:
28287 case Intrinsic::x86_avx512_ktestz_d:
28288 case Intrinsic::x86_avx512_ktestz_q:
28289 case Intrinsic::x86_sse41_ptestz:
28290 case Intrinsic::x86_sse41_ptestc:
28291 case Intrinsic::x86_sse41_ptestnzc:
28292 case Intrinsic::x86_avx_ptestz_256:
28293 case Intrinsic::x86_avx_ptestc_256:
28294 case Intrinsic::x86_avx_ptestnzc_256:
28295 case Intrinsic::x86_avx_vtestz_ps:
28296 case Intrinsic::x86_avx_vtestc_ps:
28297 case Intrinsic::x86_avx_vtestnzc_ps:
28298 case Intrinsic::x86_avx_vtestz_pd:
28299 case Intrinsic::x86_avx_vtestc_pd:
28300 case Intrinsic::x86_avx_vtestnzc_pd:
28301 case Intrinsic::x86_avx_vtestz_ps_256:
28302 case Intrinsic::x86_avx_vtestc_ps_256:
28303 case Intrinsic::x86_avx_vtestnzc_ps_256:
28304 case Intrinsic::x86_avx_vtestz_pd_256:
28305 case Intrinsic::x86_avx_vtestc_pd_256:
28306 case Intrinsic::x86_avx_vtestnzc_pd_256: {
28307 unsigned TestOpc = X86ISD::PTEST;
28308 X86::CondCode X86CC;
28309 switch (IntNo) {
28310 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.")::llvm::llvm_unreachable_internal("Bad fallthrough in Intrinsic lowering."
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 28310)
;
28311 case Intrinsic::x86_avx512_ktestc_b:
28312 case Intrinsic::x86_avx512_ktestc_w:
28313 case Intrinsic::x86_avx512_ktestc_d:
28314 case Intrinsic::x86_avx512_ktestc_q:
28315 // CF = 1
28316 TestOpc = X86ISD::KTEST;
28317 X86CC = X86::COND_B;
28318 break;
28319 case Intrinsic::x86_avx512_ktestz_b:
28320 case Intrinsic::x86_avx512_ktestz_w:
28321 case Intrinsic::x86_avx512_ktestz_d:
28322 case Intrinsic::x86_avx512_ktestz_q:
28323 TestOpc = X86ISD::KTEST;
28324 X86CC = X86::COND_E;
28325 break;
28326 case Intrinsic::x86_avx_vtestz_ps:
28327 case Intrinsic::x86_avx_vtestz_pd:
28328 case Intrinsic::x86_avx_vtestz_ps_256:
28329 case Intrinsic::x86_avx_vtestz_pd_256:
28330 TestOpc = X86ISD::TESTP;
28331 [[fallthrough]];
28332 case Intrinsic::x86_sse41_ptestz:
28333 case Intrinsic::x86_avx_ptestz_256:
28334 // ZF = 1
28335 X86CC = X86::COND_E;
28336 break;
28337 case Intrinsic::x86_avx_vtestc_ps:
28338 case Intrinsic::x86_avx_vtestc_pd:
28339 case Intrinsic::x86_avx_vtestc_ps_256:
28340 case Intrinsic::x86_avx_vtestc_pd_256:
28341 TestOpc = X86ISD::TESTP;
28342 [[fallthrough]];
28343 case Intrinsic::x86_sse41_ptestc:
28344 case Intrinsic::x86_avx_ptestc_256:
28345 // CF = 1
28346 X86CC = X86::COND_B;
28347 break;
28348 case Intrinsic::x86_avx_vtestnzc_ps:
28349 case Intrinsic::x86_avx_vtestnzc_pd:
28350 case Intrinsic::x86_avx_vtestnzc_ps_256:
28351 case Intrinsic::x86_avx_vtestnzc_pd_256:
28352 TestOpc = X86ISD::TESTP;
28353 [[fallthrough]];
28354 case Intrinsic::x86_sse41_ptestnzc:
28355 case Intrinsic::x86_avx_ptestnzc_256:
28356 // ZF and CF = 0
28357 X86CC = X86::COND_A;
28358 break;
28359 }
28360
28361 SDValue LHS = Op.getOperand(1);
28362 SDValue RHS = Op.getOperand(2);
28363 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
28364 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
28365 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
28366 }
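Editorial scalar model of the flag contract used here, based on the PTEST/VTESTP descriptions rather than the DAG code: ZF reflects LHS & RHS being all zero, CF reflects ~LHS & RHS being all zero, and the "nzc" intrinsics ask for both flags clear (COND_A above).

#include <cstdint>

struct TestFlags { bool zf, cf; };

TestFlags ptestModel(const uint64_t *lhs, const uint64_t *rhs, int words) {
  uint64_t andAcc = 0, andnAcc = 0;
  for (int i = 0; i < words; ++i) {
    andAcc  |= lhs[i] & rhs[i];
    andnAcc |= ~lhs[i] & rhs[i];
  }
  return {andAcc == 0, andnAcc == 0};
}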
28367
28368 case Intrinsic::x86_sse42_pcmpistria128:
28369 case Intrinsic::x86_sse42_pcmpestria128:
28370 case Intrinsic::x86_sse42_pcmpistric128:
28371 case Intrinsic::x86_sse42_pcmpestric128:
28372 case Intrinsic::x86_sse42_pcmpistrio128:
28373 case Intrinsic::x86_sse42_pcmpestrio128:
28374 case Intrinsic::x86_sse42_pcmpistris128:
28375 case Intrinsic::x86_sse42_pcmpestris128:
28376 case Intrinsic::x86_sse42_pcmpistriz128:
28377 case Intrinsic::x86_sse42_pcmpestriz128: {
28378 unsigned Opcode;
28379 X86::CondCode X86CC;
28380 switch (IntNo) {
28381 default: llvm_unreachable("Impossible intrinsic")::llvm::llvm_unreachable_internal("Impossible intrinsic", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 28381)
; // Can't reach here.
28382 case Intrinsic::x86_sse42_pcmpistria128:
28383 Opcode = X86ISD::PCMPISTR;
28384 X86CC = X86::COND_A;
28385 break;
28386 case Intrinsic::x86_sse42_pcmpestria128:
28387 Opcode = X86ISD::PCMPESTR;
28388 X86CC = X86::COND_A;
28389 break;
28390 case Intrinsic::x86_sse42_pcmpistric128:
28391 Opcode = X86ISD::PCMPISTR;
28392 X86CC = X86::COND_B;
28393 break;
28394 case Intrinsic::x86_sse42_pcmpestric128:
28395 Opcode = X86ISD::PCMPESTR;
28396 X86CC = X86::COND_B;
28397 break;
28398 case Intrinsic::x86_sse42_pcmpistrio128:
28399 Opcode = X86ISD::PCMPISTR;
28400 X86CC = X86::COND_O;
28401 break;
28402 case Intrinsic::x86_sse42_pcmpestrio128:
28403 Opcode = X86ISD::PCMPESTR;
28404 X86CC = X86::COND_O;
28405 break;
28406 case Intrinsic::x86_sse42_pcmpistris128:
28407 Opcode = X86ISD::PCMPISTR;
28408 X86CC = X86::COND_S;
28409 break;
28410 case Intrinsic::x86_sse42_pcmpestris128:
28411 Opcode = X86ISD::PCMPESTR;
28412 X86CC = X86::COND_S;
28413 break;
28414 case Intrinsic::x86_sse42_pcmpistriz128:
28415 Opcode = X86ISD::PCMPISTR;
28416 X86CC = X86::COND_E;
28417 break;
28418 case Intrinsic::x86_sse42_pcmpestriz128:
28419 Opcode = X86ISD::PCMPESTR;
28420 X86CC = X86::COND_E;
28421 break;
28422 }
28423 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
28424 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
28425 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2);
28426 SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG);
28427 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
28428 }
28429
28430 case Intrinsic::x86_sse42_pcmpistri128:
28431 case Intrinsic::x86_sse42_pcmpestri128: {
28432 unsigned Opcode;
28433 if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
28434 Opcode = X86ISD::PCMPISTR;
28435 else
28436 Opcode = X86ISD::PCMPESTR;
28437
28438 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
28439 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
28440 return DAG.getNode(Opcode, dl, VTs, NewOps);
28441 }
28442
28443 case Intrinsic::x86_sse42_pcmpistrm128:
28444 case Intrinsic::x86_sse42_pcmpestrm128: {
28445 unsigned Opcode;
28446 if (IntNo == Intrinsic::x86_sse42_pcmpistrm128)
28447 Opcode = X86ISD::PCMPISTR;
28448 else
28449 Opcode = X86ISD::PCMPESTR;
28450
28451 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
28452 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
28453 return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1);
28454 }
28455
28456 case Intrinsic::eh_sjlj_lsda: {
28457 MachineFunction &MF = DAG.getMachineFunction();
28458 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28459 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
28460 auto &Context = MF.getMMI().getContext();
28461 MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
28462 Twine(MF.getFunctionNumber()));
28463 return DAG.getNode(getGlobalWrapperKind(), dl, VT,
28464 DAG.getMCSymbol(S, PtrVT));
28465 }
28466
28467 case Intrinsic::x86_seh_lsda: {
28468 // Compute the symbol for the LSDA. We know it'll get emitted later.
28469 MachineFunction &MF = DAG.getMachineFunction();
28470 SDValue Op1 = Op.getOperand(1);
28471 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
28472 MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
28473 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
28474
28475 // Generate a simple absolute symbol reference. This intrinsic is only
28476 // supported on 32-bit Windows, which isn't PIC.
28477 SDValue Result = DAG.getMCSymbol(LSDASym, VT);
28478 return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
28479 }
28480
28481 case Intrinsic::eh_recoverfp: {
28482 SDValue FnOp = Op.getOperand(1);
28483 SDValue IncomingFPOp = Op.getOperand(2);
28484 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
28485 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
28486 if (!Fn)
28487 report_fatal_error(
28488 "llvm.eh.recoverfp must take a function as the first argument");
28489 return recoverFramePointer(DAG, Fn, IncomingFPOp);
28490 }
28491
28492 case Intrinsic::localaddress: {
28493 // Returns one of the stack, base, or frame pointer registers, depending on
28494 // which is used to reference local variables.
28495 MachineFunction &MF = DAG.getMachineFunction();
28496 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28497 unsigned Reg;
28498 if (RegInfo->hasBasePointer(MF))
28499 Reg = RegInfo->getBaseRegister();
28500 else { // Handles the SP or FP case.
28501 bool CantUseFP = RegInfo->hasStackRealignment(MF);
28502 if (CantUseFP)
28503 Reg = RegInfo->getPtrSizedStackRegister(MF);
28504 else
28505 Reg = RegInfo->getPtrSizedFrameRegister(MF);
28506 }
28507 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
28508 }
28509 case Intrinsic::x86_avx512_vp2intersect_q_512:
28510 case Intrinsic::x86_avx512_vp2intersect_q_256:
28511 case Intrinsic::x86_avx512_vp2intersect_q_128:
28512 case Intrinsic::x86_avx512_vp2intersect_d_512:
28513 case Intrinsic::x86_avx512_vp2intersect_d_256:
28514 case Intrinsic::x86_avx512_vp2intersect_d_128: {
28515 MVT MaskVT = Op.getSimpleValueType();
28516
28517 SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
28518 SDLoc DL(Op);
28519
28520 SDValue Operation =
28521 DAG.getNode(X86ISD::VP2INTERSECT, DL, VTs,
28522 Op->getOperand(1), Op->getOperand(2));
28523
28524 SDValue Result0 = DAG.getTargetExtractSubreg(X86::sub_mask_0, DL,
28525 MaskVT, Operation);
28526 SDValue Result1 = DAG.getTargetExtractSubreg(X86::sub_mask_1, DL,
28527 MaskVT, Operation);
28528 return DAG.getMergeValues({Result0, Result1}, DL);
28529 }
28530 case Intrinsic::x86_mmx_pslli_w:
28531 case Intrinsic::x86_mmx_pslli_d:
28532 case Intrinsic::x86_mmx_pslli_q:
28533 case Intrinsic::x86_mmx_psrli_w:
28534 case Intrinsic::x86_mmx_psrli_d:
28535 case Intrinsic::x86_mmx_psrli_q:
28536 case Intrinsic::x86_mmx_psrai_w:
28537 case Intrinsic::x86_mmx_psrai_d: {
28538 SDLoc DL(Op);
28539 SDValue ShAmt = Op.getOperand(2);
28540 // If the argument is a constant, convert it to a target constant.
28541 if (auto *C = dyn_cast<ConstantSDNode>(ShAmt)) {
28542 // Clamp out of bounds shift amounts since they will otherwise be masked
28543 // to 8 bits, which may make them no longer out of bounds.
28544 unsigned ShiftAmount = C->getAPIntValue().getLimitedValue(255);
28545 if (ShiftAmount == 0)
28546 return Op.getOperand(1);
28547
28548 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
28549 Op.getOperand(0), Op.getOperand(1),
28550 DAG.getTargetConstant(ShiftAmount, DL, MVT::i32));
28551 }
28552
28553 unsigned NewIntrinsic;
28554 switch (IntNo) {
28555 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
28556 case Intrinsic::x86_mmx_pslli_w:
28557 NewIntrinsic = Intrinsic::x86_mmx_psll_w;
28558 break;
28559 case Intrinsic::x86_mmx_pslli_d:
28560 NewIntrinsic = Intrinsic::x86_mmx_psll_d;
28561 break;
28562 case Intrinsic::x86_mmx_pslli_q:
28563 NewIntrinsic = Intrinsic::x86_mmx_psll_q;
28564 break;
28565 case Intrinsic::x86_mmx_psrli_w:
28566 NewIntrinsic = Intrinsic::x86_mmx_psrl_w;
28567 break;
28568 case Intrinsic::x86_mmx_psrli_d:
28569 NewIntrinsic = Intrinsic::x86_mmx_psrl_d;
28570 break;
28571 case Intrinsic::x86_mmx_psrli_q:
28572 NewIntrinsic = Intrinsic::x86_mmx_psrl_q;
28573 break;
28574 case Intrinsic::x86_mmx_psrai_w:
28575 NewIntrinsic = Intrinsic::x86_mmx_psra_w;
28576 break;
28577 case Intrinsic::x86_mmx_psrai_d:
28578 NewIntrinsic = Intrinsic::x86_mmx_psra_d;
28579 break;
28580 }
28581
28582 // The vector shift intrinsics with scalars use 32b shift amounts but
28583 // the sse2/mmx shift instructions read 64 bits. Copy the 32 bits to an
28584 // MMX register.
28585 ShAmt = DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, ShAmt);
28586 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
28587 DAG.getTargetConstant(NewIntrinsic, DL,
28588 getPointerTy(DAG.getDataLayout())),
28589 Op.getOperand(1), ShAmt);
28590 }
28591 case Intrinsic::thread_pointer: {
28592 if (Subtarget.isTargetELF()) {
28593 SDLoc dl(Op);
28594 EVT PtrVT = getPointerTy(DAG.getDataLayout());
28595 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
28596 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(
28597 *DAG.getContext(), Subtarget.is64Bit() ? X86AS::FS : X86AS::GS));
28598 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
28599 DAG.getIntPtrConstant(0, dl), MachinePointerInfo(Ptr));
28600 }
28601 report_fatal_error(
28602 "Target OS doesn't support __builtin_thread_pointer() yet.");
28603 }
28604 }
28605}
28606
28607static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
28608 SDValue Src, SDValue Mask, SDValue Base,
28609 SDValue Index, SDValue ScaleOp, SDValue Chain,
28610 const X86Subtarget &Subtarget) {
28611 SDLoc dl(Op);
28612 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
28613 // Scale must be constant.
28614 if (!C)
28615 return SDValue();
28616 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28617 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
28618 TLI.getPointerTy(DAG.getDataLayout()));
28619 EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger();
28620 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
28621 // If source is undef or we know it won't be used, use a zero vector
28622 // to break register dependency.
28623 // TODO: use undef instead and let BreakFalseDeps deal with it?
28624 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
28625 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
28626
28627 // Cast mask to an integer type.
28628 Mask = DAG.getBitcast(MaskVT, Mask);
28629
28630 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
28631
28632 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
28633 SDValue Res =
28634 DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
28635 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
28636 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
28637}
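Editorial scalar model of the masked-gather contract this node encodes (illustrative only): masked-off lanes keep the pass-through source, which is why an undef source can safely be replaced with zeros above to break the false dependency.

#include <cstdint>

void gatherModel(int32_t *dst, const int32_t *passthru, const uint8_t *mask,
                 const char *base, const int64_t *index, int scale, int lanes) {
  for (int i = 0; i < lanes; ++i)
    dst[i] = mask[i]
                 ? *reinterpret_cast<const int32_t *>(base + index[i] * scale)
                 : passthru[i];
}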
28638
28639static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
28640 SDValue Src, SDValue Mask, SDValue Base,
28641 SDValue Index, SDValue ScaleOp, SDValue Chain,
28642 const X86Subtarget &Subtarget) {
28643 MVT VT = Op.getSimpleValueType();
28644 SDLoc dl(Op);
28645 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
28646 // Scale must be constant.
28647 if (!C)
28648 return SDValue();
28649 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28650 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
28651 TLI.getPointerTy(DAG.getDataLayout()));
28652 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
28653 VT.getVectorNumElements());
28654 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
28655
28656 // We support two versions of the gather intrinsics. One with scalar mask and
28657 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
28658 if (Mask.getValueType() != MaskVT)
28659 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28660
28661 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
28662 // If source is undef or we know it won't be used, use a zero vector
28663 // to break register dependency.
28664 // TODO: use undef instead and let BreakFalseDeps deal with it?
28665 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
28666 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
28667
28668 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
28669
28670 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
28671 SDValue Res =
28672 DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
28673 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
28674 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
28675}
28676
28677static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
28678 SDValue Src, SDValue Mask, SDValue Base,
28679 SDValue Index, SDValue ScaleOp, SDValue Chain,
28680 const X86Subtarget &Subtarget) {
28681 SDLoc dl(Op);
28682 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
28683 // Scale must be constant.
28684 if (!C)
28685 return SDValue();
28686 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28687 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
28688 TLI.getPointerTy(DAG.getDataLayout()));
28689 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
28690 Src.getSimpleValueType().getVectorNumElements());
28691 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
28692
28693 // We support two versions of the scatter intrinsics. One with scalar mask and
28694 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
28695 if (Mask.getValueType() != MaskVT)
28696 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28697
28698 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
28699
28700 SDVTList VTs = DAG.getVTList(MVT::Other);
28701 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
28702 SDValue Res =
28703 DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
28704 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
28705 return Res;
28706}
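The matching scalar model for the scatter path (again an editorial sketch): only active lanes store, and the node produces nothing but a chain, hence the single MVT::Other result above.

#include <cstdint>

void scatterModel(const int32_t *src, const uint8_t *mask, char *base,
                  const int64_t *index, int scale, int lanes) {
  for (int i = 0; i < lanes; ++i)
    if (mask[i])
      *reinterpret_cast<int32_t *>(base + index[i] * scale) = src[i];
}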
28707
28708static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
28709 SDValue Mask, SDValue Base, SDValue Index,
28710 SDValue ScaleOp, SDValue Chain,
28711 const X86Subtarget &Subtarget) {
28712 SDLoc dl(Op);
28713 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
28714 // Scale must be constant.
28715 if (!C)
28716 return SDValue();
28717 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28718 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
28719 TLI.getPointerTy(DAG.getDataLayout()));
28720 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
28721 SDValue Segment = DAG.getRegister(0, MVT::i32);
28722 MVT MaskVT =
28723 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
28724 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28725 SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
28726 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
28727 return SDValue(Res, 0);
28728}
28729
28730/// Handles the lowering of builtin intrinsics with chain that return their
28731/// value into registers EDX:EAX.
28732 /// If operand SrcReg is a valid register identifier, then operand 2 of N is
28733/// copied to SrcReg. The assumption is that SrcReg is an implicit input to
28734/// TargetOpcode.
28735/// Returns a Glue value which can be used to add extra copy-from-reg if the
28736 /// expanded intrinsics implicitly define extra registers (i.e. not just
28737/// EDX:EAX).
28738static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL,
28739 SelectionDAG &DAG,
28740 unsigned TargetOpcode,
28741 unsigned SrcReg,
28742 const X86Subtarget &Subtarget,
28743 SmallVectorImpl<SDValue> &Results) {
28744 SDValue Chain = N->getOperand(0);
28745 SDValue Glue;
28746
28747 if (SrcReg) {
28748 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
28749 Chain = DAG.getCopyToReg(Chain, DL, SrcReg, N->getOperand(2), Glue);
28750 Glue = Chain.getValue(1);
28751 }
28752
28753 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
28754 SDValue N1Ops[] = {Chain, Glue};
28755 SDNode *N1 = DAG.getMachineNode(
28756 TargetOpcode, DL, Tys, ArrayRef<SDValue>(N1Ops, Glue.getNode() ? 2 : 1));
28757 Chain = SDValue(N1, 0);
28758
28759 // Reads the content of XCR and returns it in registers EDX:EAX.
28760 SDValue LO, HI;
28761 if (Subtarget.is64Bit()) {
28762 LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
28763 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
28764 LO.getValue(2));
28765 } else {
28766 LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
28767 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
28768 LO.getValue(2));
28769 }
28770 Chain = HI.getValue(1);
28771 Glue = HI.getValue(2);
28772
28773 if (Subtarget.is64Bit()) {
28774 // Merge the two 32-bit values into a 64-bit one.
28775 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
28776 DAG.getConstant(32, DL, MVT::i8));
28777 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
28778 Results.push_back(Chain);
28779 return Glue;
28780 }
28781
28782 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
28783 SDValue Ops[] = { LO, HI };
28784 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
28785 Results.push_back(Pair);
28786 Results.push_back(Chain);
28787 return Glue;
28788}
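Editorial sketch of the merge performed on the 64-bit path above: the two halves come back in EAX/EDX (or RAX/RDX) and are combined as (HI << 32) | LO, while the 32-bit path keeps them as a value pair.

#include <cstdint>

uint64_t mergeEdxEax(uint32_t lo /* EAX */, uint32_t hi /* EDX */) {
  return (uint64_t(hi) << 32) | lo;
}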
28789
28790/// Handles the lowering of builtin intrinsics that read the time stamp counter
28791/// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
28792/// READCYCLECOUNTER nodes.
28793static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
28794 SelectionDAG &DAG,
28795 const X86Subtarget &Subtarget,
28796 SmallVectorImpl<SDValue> &Results) {
28797 // The processor's time-stamp counter (a 64-bit MSR) is stored into the
28798 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
28799 // and the EAX register is loaded with the low-order 32 bits.
28800 SDValue Glue = expandIntrinsicWChainHelper(N, DL, DAG, Opcode,
28801 /* NoRegister */0, Subtarget,
28802 Results);
28803 if (Opcode != X86::RDTSCP)
28804 return;
28805
28806 SDValue Chain = Results[1];
28807 // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
28808 // the ECX register. Add 'ecx' explicitly to the chain.
28809 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, Glue);
28810 Results[1] = ecx;
28811 Results.push_back(ecx.getValue(1));
28812}
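For orientation, these lowerings are what the usual compiler intrinsics reach; a usage sketch assuming GCC/Clang's <x86intrin.h>, not part of the file under analysis:

#include <x86intrin.h>
#include <cstdint>

uint64_t readTsc() { return __rdtsc(); }

uint64_t readTscp(unsigned int &aux) {
  return __rdtscp(&aux);  // 'aux' receives the TSC_AUX value loaded into ECX
}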
28813
28814static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
28815 SelectionDAG &DAG) {
28816 SmallVector<SDValue, 3> Results;
28817 SDLoc DL(Op);
28818 getReadTimeStampCounter(Op.getNode(), DL, X86::RDTSC, DAG, Subtarget,
28819 Results);
28820 return DAG.getMergeValues(Results, DL);
28821}
28822
28823static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
28824 MachineFunction &MF = DAG.getMachineFunction();
28825 SDValue Chain = Op.getOperand(0);
28826 SDValue RegNode = Op.getOperand(2);
28827 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
28828 if (!EHInfo)
28829 report_fatal_error("EH registrations only live in functions using WinEH");
28830
28831 // Cast the operand to an alloca, and remember the frame index.
28832 auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
28833 if (!FINode)
28834 report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
28835 EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
28836
28837 // Return the chain operand without making any DAG nodes.
28838 return Chain;
28839}
28840
28841static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
28842 MachineFunction &MF = DAG.getMachineFunction();
28843 SDValue Chain = Op.getOperand(0);
28844 SDValue EHGuard = Op.getOperand(2);
28845 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
28846 if (!EHInfo)
28847 report_fatal_error("EHGuard only live in functions using WinEH");
28848
28849 // Cast the operand to an alloca, and remember the frame index.
28850 auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
28851 if (!FINode)
28852 report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
28853 EHInfo->EHGuardFrameIndex = FINode->getIndex();
28854
28855 // Return the chain operand without making any DAG nodes.
28856 return Chain;
28857}
28858
28859/// Emit Truncating Store with signed or unsigned saturation.
28860static SDValue
28861EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
28862 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
28863 SelectionDAG &DAG) {
28864 SDVTList VTs = DAG.getVTList(MVT::Other);
28865 SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
28866 SDValue Ops[] = { Chain, Val, Ptr, Undef };
28867 unsigned Opc = SignedSat ? X86ISD::VTRUNCSTORES : X86ISD::VTRUNCSTOREUS;
28868 return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO);
28869}
28870
28871/// Emit Masked Truncating Store with signed or unsigned saturation.
28872static SDValue
28873EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
28874 SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
28875 MachineMemOperand *MMO, SelectionDAG &DAG) {
28876 SDVTList VTs = DAG.getVTList(MVT::Other);
28877 SDValue Ops[] = { Chain, Val, Ptr, Mask };
28878 unsigned Opc = SignedSat ? X86ISD::VMTRUNCSTORES : X86ISD::VMTRUNCSTOREUS;
28879 return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO);
28880}
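Editorial scalar model of the saturating truncation these stores perform: values are clamped to the destination range before narrowing, rather than having their high bits dropped.

#include <algorithm>
#include <cstdint>

int8_t truncStoreS8(int32_t v) {   // signed saturation (VTRUNCSTORES-style)
  return static_cast<int8_t>(std::clamp<int32_t>(v, INT8_MIN, INT8_MAX));
}
uint8_t truncStoreU8(int32_t v) {  // unsigned saturation (VTRUNCSTOREUS-style)
  return static_cast<uint8_t>(std::clamp<int32_t>(v, 0, UINT8_MAX));
}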
28881
28882static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
28883 SelectionDAG &DAG) {
28884 unsigned IntNo = Op.getConstantOperandVal(1);
28885 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
28886 if (!IntrData) {
28887 switch (IntNo) {
28888
28889 case Intrinsic::swift_async_context_addr: {
28890 SDLoc dl(Op);
28891 auto &MF = DAG.getMachineFunction();
28892 auto X86FI = MF.getInfo<X86MachineFunctionInfo>();
28893 if (Subtarget.is64Bit()) {
28894 MF.getFrameInfo().setFrameAddressIsTaken(true);
28895 X86FI->setHasSwiftAsyncContext(true);
28896 SDValue Chain = Op->getOperand(0);
28897 SDValue CopyRBP = DAG.getCopyFromReg(Chain, dl, X86::RBP, MVT::i64);
28898 SDValue Result =
28899 SDValue(DAG.getMachineNode(X86::SUB64ri8, dl, MVT::i64, CopyRBP,
28900 DAG.getTargetConstant(8, dl, MVT::i32)),
28901 0);
28902 // Return { result, chain }.
28903 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
28904 CopyRBP.getValue(1));
28905 } else {
28906 // 32-bit so no special extended frame, create or reuse an existing
28907 // stack slot.
28908 if (!X86FI->getSwiftAsyncContextFrameIdx())
28909 X86FI->setSwiftAsyncContextFrameIdx(
28910 MF.getFrameInfo().CreateStackObject(4, Align(4), false));
28911 SDValue Result =
28912 DAG.getFrameIndex(*X86FI->getSwiftAsyncContextFrameIdx(), MVT::i32);
28913 // Return { result, chain }.
28914 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
28915 Op->getOperand(0));
28916 }
28917 }
28918
28919 case llvm::Intrinsic::x86_seh_ehregnode:
28920 return MarkEHRegistrationNode(Op, DAG);
28921 case llvm::Intrinsic::x86_seh_ehguard:
28922 return MarkEHGuard(Op, DAG);
28923 case llvm::Intrinsic::x86_rdpkru: {
28924 SDLoc dl(Op);
28925 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
28926 // Create a RDPKRU node and pass 0 to the ECX parameter.
28927 return DAG.getNode(X86ISD::RDPKRU, dl, VTs, Op.getOperand(0),
28928 DAG.getConstant(0, dl, MVT::i32));
28929 }
28930 case llvm::Intrinsic::x86_wrpkru: {
28931 SDLoc dl(Op);
28932 // Create a WRPKRU node, pass the input to the EAX parameter, and pass 0
28933 // to the EDX and ECX parameters.
28934 return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other,
28935 Op.getOperand(0), Op.getOperand(2),
28936 DAG.getConstant(0, dl, MVT::i32),
28937 DAG.getConstant(0, dl, MVT::i32));
28938 }
28939 case llvm::Intrinsic::asan_check_memaccess: {
28940 // Mark this as adjustsStack because it will be lowered to a call.
28941 DAG.getMachineFunction().getFrameInfo().setAdjustsStack(true);
28942 // Don't do anything here, we will expand these intrinsics out later.
28943 return Op;
28944 }
28945 case llvm::Intrinsic::x86_flags_read_u32:
28946 case llvm::Intrinsic::x86_flags_read_u64:
28947 case llvm::Intrinsic::x86_flags_write_u32:
28948 case llvm::Intrinsic::x86_flags_write_u64: {
28949 // We need a frame pointer because this will get lowered to a PUSH/POP
28950 // sequence.
28951 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
28952 MFI.setHasCopyImplyingStackAdjustment(true);
28953 // Don't do anything here, we will expand these intrinsics out later
28954 // during FinalizeISel in EmitInstrWithCustomInserter.
28955 return Op;
28956 }
28957 case Intrinsic::x86_lwpins32:
28958 case Intrinsic::x86_lwpins64:
28959 case Intrinsic::x86_umwait:
28960 case Intrinsic::x86_tpause: {
28961 SDLoc dl(Op);
28962 SDValue Chain = Op->getOperand(0);
28963 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
28964 unsigned Opcode;
28965
28966 switch (IntNo) {
28967 default: llvm_unreachable("Impossible intrinsic");
28968 case Intrinsic::x86_umwait:
28969 Opcode = X86ISD::UMWAIT;
28970 break;
28971 case Intrinsic::x86_tpause:
28972 Opcode = X86ISD::TPAUSE;
28973 break;
28974 case Intrinsic::x86_lwpins32:
28975 case Intrinsic::x86_lwpins64:
28976 Opcode = X86ISD::LWPINS;
28977 break;
28978 }
28979
28980 SDValue Operation =
28981 DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
28982 Op->getOperand(3), Op->getOperand(4));
28983 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
28984 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
28985 Operation.getValue(1));
28986 }
28987 case Intrinsic::x86_enqcmd:
28988 case Intrinsic::x86_enqcmds: {
28989 SDLoc dl(Op);
28990 SDValue Chain = Op.getOperand(0);
28991 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
28992 unsigned Opcode;
28993 switch (IntNo) {
28994 default: llvm_unreachable("Impossible intrinsic!");
28995 case Intrinsic::x86_enqcmd:
28996 Opcode = X86ISD::ENQCMD;
28997 break;
28998 case Intrinsic::x86_enqcmds:
28999 Opcode = X86ISD::ENQCMDS;
29000 break;
29001 }
29002 SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op.getOperand(2),
29003 Op.getOperand(3));
29004 SDValue SetCC = getSETCC(X86::COND_E, Operation.getValue(0), dl, DAG);
29005 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
29006 Operation.getValue(1));
29007 }
29008 case Intrinsic::x86_aesenc128kl:
29009 case Intrinsic::x86_aesdec128kl:
29010 case Intrinsic::x86_aesenc256kl:
29011 case Intrinsic::x86_aesdec256kl: {
29012 SDLoc DL(Op);
29013 SDVTList VTs = DAG.getVTList(MVT::v2i64, MVT::i32, MVT::Other);
29014 SDValue Chain = Op.getOperand(0);
29015 unsigned Opcode;
29016
29017 switch (IntNo) {
29018 default: llvm_unreachable("Impossible intrinsic");
29019 case Intrinsic::x86_aesenc128kl:
29020 Opcode = X86ISD::AESENC128KL;
29021 break;
29022 case Intrinsic::x86_aesdec128kl:
29023 Opcode = X86ISD::AESDEC128KL;
29024 break;
29025 case Intrinsic::x86_aesenc256kl:
29026 Opcode = X86ISD::AESENC256KL;
29027 break;
29028 case Intrinsic::x86_aesdec256kl:
29029 Opcode = X86ISD::AESDEC256KL;
29030 break;
29031 }
29032
29033 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
29034 MachineMemOperand *MMO = MemIntr->getMemOperand();
29035 EVT MemVT = MemIntr->getMemoryVT();
29036 SDValue Operation = DAG.getMemIntrinsicNode(
29037 Opcode, DL, VTs, {Chain, Op.getOperand(2), Op.getOperand(3)}, MemVT,
29038 MMO);
29039 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(1), DL, DAG);
29040
29041 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
29042 {ZF, Operation.getValue(0), Operation.getValue(2)});
29043 }
29044 case Intrinsic::x86_aesencwide128kl:
29045 case Intrinsic::x86_aesdecwide128kl:
29046 case Intrinsic::x86_aesencwide256kl:
29047 case Intrinsic::x86_aesdecwide256kl: {
29048 SDLoc DL(Op);
29049 SDVTList VTs = DAG.getVTList(
29050 {MVT::i32, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64,
29051 MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::Other});
29052 SDValue Chain = Op.getOperand(0);
29053 unsigned Opcode;
29054
29055 switch (IntNo) {
29056 default: llvm_unreachable("Impossible intrinsic");
29057 case Intrinsic::x86_aesencwide128kl:
29058 Opcode = X86ISD::AESENCWIDE128KL;
29059 break;
29060 case Intrinsic::x86_aesdecwide128kl:
29061 Opcode = X86ISD::AESDECWIDE128KL;
29062 break;
29063 case Intrinsic::x86_aesencwide256kl:
29064 Opcode = X86ISD::AESENCWIDE256KL;
29065 break;
29066 case Intrinsic::x86_aesdecwide256kl:
29067 Opcode = X86ISD::AESDECWIDE256KL;
29068 break;
29069 }
29070
29071 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
29072 MachineMemOperand *MMO = MemIntr->getMemOperand();
29073 EVT MemVT = MemIntr->getMemoryVT();
29074 SDValue Operation = DAG.getMemIntrinsicNode(
29075 Opcode, DL, VTs,
29076 {Chain, Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
29077 Op.getOperand(5), Op.getOperand(6), Op.getOperand(7),
29078 Op.getOperand(8), Op.getOperand(9), Op.getOperand(10)},
29079 MemVT, MMO);
29080 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(0), DL, DAG);
29081
29082 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
29083 {ZF, Operation.getValue(1), Operation.getValue(2),
29084 Operation.getValue(3), Operation.getValue(4),
29085 Operation.getValue(5), Operation.getValue(6),
29086 Operation.getValue(7), Operation.getValue(8),
29087 Operation.getValue(9)});
29088 }
29089 case Intrinsic::x86_testui: {
29090 SDLoc dl(Op);
29091 SDValue Chain = Op.getOperand(0);
29092 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
29093 SDValue Operation = DAG.getNode(X86ISD::TESTUI, dl, VTs, Chain);
29094 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
29095 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
29096 Operation.getValue(1));
29097 }
29098 case Intrinsic::x86_atomic_bts_rm:
29099 case Intrinsic::x86_atomic_btc_rm:
29100 case Intrinsic::x86_atomic_btr_rm: {
29101 SDLoc DL(Op);
29102 MVT VT = Op.getSimpleValueType();
29103 SDValue Chain = Op.getOperand(0);
29104 SDValue Op1 = Op.getOperand(2);
29105 SDValue Op2 = Op.getOperand(3);
29106 unsigned Opc = IntNo == Intrinsic::x86_atomic_bts_rm ? X86ISD::LBTS_RM
29107 : IntNo == Intrinsic::x86_atomic_btc_rm ? X86ISD::LBTC_RM
29108 : X86ISD::LBTR_RM;
29109 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
29110 SDValue Res =
29111 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
29112 {Chain, Op1, Op2}, VT, MMO);
29113 Chain = Res.getValue(1);
29114 Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
29115 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
29116 }
29117 case Intrinsic::x86_atomic_bts:
29118 case Intrinsic::x86_atomic_btc:
29119 case Intrinsic::x86_atomic_btr: {
29120 SDLoc DL(Op);
29121 MVT VT = Op.getSimpleValueType();
29122 SDValue Chain = Op.getOperand(0);
29123 SDValue Op1 = Op.getOperand(2);
29124 SDValue Op2 = Op.getOperand(3);
29125 unsigned Opc = IntNo == Intrinsic::x86_atomic_bts ? X86ISD::LBTS
29126 : IntNo == Intrinsic::x86_atomic_btc ? X86ISD::LBTC
29127 : X86ISD::LBTR;
29128 SDValue Size = DAG.getConstant(VT.getScalarSizeInBits(), DL, MVT::i32);
29129 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
29130 SDValue Res =
29131 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
29132 {Chain, Op1, Op2, Size}, VT, MMO);
29133 Chain = Res.getValue(1);
29134 Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
29135 unsigned Imm = cast<ConstantSDNode>(Op2)->getZExtValue();
29136 if (Imm)
29137 Res = DAG.getNode(ISD::SHL, DL, VT, Res,
29138 DAG.getShiftAmountConstant(Imm, VT, DL));
29139 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
29140 }
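Editorial sketch of the lock bts contract being lowered, using std::atomic rather than the X86ISD nodes: the caller gets the original bit back, still at its original position, which is what the SHL by Imm above reproduces.

#include <atomic>
#include <cstdint>

uint32_t atomicBitTestAndSet(std::atomic<uint32_t> &word, unsigned bit) {
  uint32_t old = word.fetch_or(uint32_t(1) << bit);
  return old & (uint32_t(1) << bit);  // old value of the bit, at position 'bit'
}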
29141 case Intrinsic::x86_cmpccxadd32:
29142 case Intrinsic::x86_cmpccxadd64: {
29143 SDLoc DL(Op);
29144 SDValue Chain = Op.getOperand(0);
29145 SDValue Addr = Op.getOperand(2);
29146 SDValue Src1 = Op.getOperand(3);
29147 SDValue Src2 = Op.getOperand(4);
29148 SDValue CC = Op.getOperand(5);
29149 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
29150 SDValue Operation = DAG.getMemIntrinsicNode(
29151 X86ISD::CMPCCXADD, DL, Op->getVTList(), {Chain, Addr, Src1, Src2, CC},
29152 MVT::i32, MMO);
29153 return Operation;
29154 }
29155 case Intrinsic::x86_aadd32:
29156 case Intrinsic::x86_aadd64:
29157 case Intrinsic::x86_aand32:
29158 case Intrinsic::x86_aand64:
29159 case Intrinsic::x86_aor32:
29160 case Intrinsic::x86_aor64:
29161 case Intrinsic::x86_axor32:
29162 case Intrinsic::x86_axor64: {
29163 SDLoc DL(Op);
29164 SDValue Chain = Op.getOperand(0);
29165 SDValue Op1 = Op.getOperand(2);
29166 SDValue Op2 = Op.getOperand(3);
29167 MVT VT = Op2.getSimpleValueType();
29168 unsigned Opc = 0;
29169 switch (IntNo) {
29170 default:
29171 llvm_unreachable("Unknown Intrinsic")::llvm::llvm_unreachable_internal("Unknown Intrinsic", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 29171)
;
29172 case Intrinsic::x86_aadd32:
29173 case Intrinsic::x86_aadd64:
29174 Opc = X86ISD::AADD;
29175 break;
29176 case Intrinsic::x86_aand32:
29177 case Intrinsic::x86_aand64:
29178 Opc = X86ISD::AAND;
29179 break;
29180 case Intrinsic::x86_aor32:
29181 case Intrinsic::x86_aor64:
29182 Opc = X86ISD::AOR;
29183 break;
29184 case Intrinsic::x86_axor32:
29185 case Intrinsic::x86_axor64:
29186 Opc = X86ISD::AXOR;
29187 break;
29188 }
29189 MachineMemOperand *MMO = cast<MemSDNode>(Op)->getMemOperand();
29190 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(),
29191 {Chain, Op1, Op2}, VT, MMO);
29192 }
29193 case Intrinsic::x86_atomic_add_cc:
29194 case Intrinsic::x86_atomic_sub_cc:
29195 case Intrinsic::x86_atomic_or_cc:
29196 case Intrinsic::x86_atomic_and_cc:
29197 case Intrinsic::x86_atomic_xor_cc: {
29198 SDLoc DL(Op);
29199 SDValue Chain = Op.getOperand(0);
29200 SDValue Op1 = Op.getOperand(2);
29201 SDValue Op2 = Op.getOperand(3);
29202 X86::CondCode CC = (X86::CondCode)Op.getConstantOperandVal(4);
29203 MVT VT = Op2.getSimpleValueType();
29204 unsigned Opc = 0;
29205 switch (IntNo) {
29206 default:
29207 llvm_unreachable("Unknown Intrinsic")::llvm::llvm_unreachable_internal("Unknown Intrinsic", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 29207)
;
29208 case Intrinsic::x86_atomic_add_cc:
29209 Opc = X86ISD::LADD;
29210 break;
29211 case Intrinsic::x86_atomic_sub_cc:
29212 Opc = X86ISD::LSUB;
29213 break;
29214 case Intrinsic::x86_atomic_or_cc:
29215 Opc = X86ISD::LOR;
29216 break;
29217 case Intrinsic::x86_atomic_and_cc:
29218 Opc = X86ISD::LAND;
29219 break;
29220 case Intrinsic::x86_atomic_xor_cc:
29221 Opc = X86ISD::LXOR;
29222 break;
29223 }
29224 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
29225 SDValue LockArith =
29226 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
29227 {Chain, Op1, Op2}, VT, MMO);
29228 Chain = LockArith.getValue(1);
29229 return DAG.getMergeValues({getSETCC(CC, LockArith, DL, DAG), Chain}, DL);
29230 }
29231 }
29232 return SDValue();
29233 }
29234
29235 SDLoc dl(Op);
29236 switch(IntrData->Type) {
29237 default: llvm_unreachable("Unknown Intrinsic Type")::llvm::llvm_unreachable_internal("Unknown Intrinsic Type", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 29237)
;
29238 case RDSEED:
29239 case RDRAND: {
29240 // Emit the node with the right value type.
29241 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
29242 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
29243
29244 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
29245 // Otherwise return the value from Rand, which is always 0, cast to i32.
29246 SDValue Ops[] = {DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
29247 DAG.getConstant(1, dl, Op->getValueType(1)),
29248 DAG.getTargetConstant(X86::COND_B, dl, MVT::i8),
29249 SDValue(Result.getNode(), 1)};
29250 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);
29251
29252 // Return { result, isValid, chain }.
29253 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
29254 SDValue(Result.getNode(), 2));
29255 }
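Typical C++ usage that ends up in this RDRAND/RDSEED lowering (an editorial sketch assuming <immintrin.h>'s _rdrand32_step and a target with RDRND support): the step function's return value is the CF-derived validity flag that the CMOV above materialises.

#include <immintrin.h>

bool tryRandom32(unsigned int &out) {
  return _rdrand32_step(&out) == 1;  // 1 means CF was set and 'out' is valid
}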
29256 case GATHER_AVX2: {
29257 SDValue Chain = Op.getOperand(0);
29258 SDValue Src = Op.getOperand(2);
29259 SDValue Base = Op.getOperand(3);
29260 SDValue Index = Op.getOperand(4);
29261 SDValue Mask = Op.getOperand(5);
29262 SDValue Scale = Op.getOperand(6);
29263 return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
29264 Scale, Chain, Subtarget);
29265 }
29266 case GATHER: {
29267 //gather(v1, mask, index, base, scale);
29268 SDValue Chain = Op.getOperand(0);
29269 SDValue Src = Op.getOperand(2);
29270 SDValue Base = Op.getOperand(3);
29271 SDValue Index = Op.getOperand(4);
29272 SDValue Mask = Op.getOperand(5);
29273 SDValue Scale = Op.getOperand(6);
29274 return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale,
29275 Chain, Subtarget);
29276 }
29277 case SCATTER: {
29278 //scatter(base, mask, index, v1, scale);
29279 SDValue Chain = Op.getOperand(0);
29280 SDValue Base = Op.getOperand(2);
29281 SDValue Mask = Op.getOperand(3);
29282 SDValue Index = Op.getOperand(4);
29283 SDValue Src = Op.getOperand(5);
29284 SDValue Scale = Op.getOperand(6);
29285 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
29286 Scale, Chain, Subtarget);
29287 }
29288 case PREFETCH: {
29289 const APInt &HintVal = Op.getConstantOperandAPInt(6);
29290 assert((HintVal == 2 || HintVal == 3) &&
29291 "Wrong prefetch hint in intrinsic: should be 2 or 3");
29292 unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
29293 SDValue Chain = Op.getOperand(0);
29294 SDValue Mask = Op.getOperand(2);
29295 SDValue Index = Op.getOperand(3);
29296 SDValue Base = Op.getOperand(4);
29297 SDValue Scale = Op.getOperand(5);
29298 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
29299 Subtarget);
29300 }
29301 // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
29302 case RDTSC: {
29303 SmallVector<SDValue, 2> Results;
29304 getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
29305 Results);
29306 return DAG.getMergeValues(Results, dl);
29307 }
29308 // Read Performance Monitoring Counters.
29309 case RDPMC:
29310 // Read Processor Register.
29311 case RDPRU:
29312 // GetExtended Control Register.
29313 case XGETBV: {
29314 SmallVector<SDValue, 2> Results;
29315
29316 // RDPMC uses ECX to select the index of the performance counter to read.
29317 // RDPRU uses ECX to select the processor register to read.
29318 // XGETBV uses ECX to select the index of the XCR register to return.
29319 // The result is stored into registers EDX:EAX.
29320 expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX,
29321 Subtarget, Results);
29322 return DAG.getMergeValues(Results, dl);
29323 }
29324 // XTEST intrinsics.
29325 case XTEST: {
29326 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
29327 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
29328
29329 SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
29330 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
29331 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
29332 Ret, SDValue(InTrans.getNode(), 1));
29333 }
29334 case TRUNCATE_TO_MEM_VI8:
29335 case TRUNCATE_TO_MEM_VI16:
29336 case TRUNCATE_TO_MEM_VI32: {
29337 SDValue Mask = Op.getOperand(4);
29338 SDValue DataToTruncate = Op.getOperand(3);
29339 SDValue Addr = Op.getOperand(2);
29340 SDValue Chain = Op.getOperand(0);
29341
29342 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
29343 assert(MemIntr && "Expected MemIntrinsicSDNode!");
29344
29345 EVT MemVT = MemIntr->getMemoryVT();
29346
29347 uint16_t TruncationOp = IntrData->Opc0;
29348 switch (TruncationOp) {
29349 case X86ISD::VTRUNC: {
29350 if (isAllOnesConstant(Mask)) // return just a truncate store
29351 return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
29352 MemIntr->getMemOperand());
29353
29354 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
29355 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
29356 SDValue Offset = DAG.getUNDEF(VMask.getValueType());
29357
29358 return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, Offset, VMask,
29359 MemVT, MemIntr->getMemOperand(), ISD::UNINDEXED,
29360 true /* truncating */);
29361 }
29362 case X86ISD::VTRUNCUS:
29363 case X86ISD::VTRUNCS: {
29364 bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
29365 if (isAllOnesConstant(Mask))
29366 return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
29367 MemIntr->getMemOperand(), DAG);
29368
29369 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
29370 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
29371
29372 return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
29373 VMask, MemVT, MemIntr->getMemOperand(), DAG);
29374 }
29375 default:
29376 llvm_unreachable("Unsupported truncstore intrinsic")::llvm::llvm_unreachable_internal("Unsupported truncstore intrinsic"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 29376)
;
29377 }
29378 }
29379 }
29380}
29381
29382SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
29383 SelectionDAG &DAG) const {
29384 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
29385 MFI.setReturnAddressIsTaken(true);
29386
29387 if (verifyReturnAddressArgumentIsConstant(Op, DAG))
29388 return SDValue();
29389
29390 unsigned Depth = Op.getConstantOperandVal(0);
29391 SDLoc dl(Op);
29392 EVT PtrVT = getPointerTy(DAG.getDataLayout());
29393
29394 if (Depth > 0) {
29395 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
29396 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
29397 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
29398 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
29399 DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
29400 MachinePointerInfo());
29401 }
29402
29403 // Just load the return address.
29404 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
29405 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
29406 MachinePointerInfo());
29407}
29408
29409SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
29410 SelectionDAG &DAG) const {
29411 DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
29412 return getReturnAddressFrameIndex(DAG);
29413}
29414
29415SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
29416 MachineFunction &MF = DAG.getMachineFunction();
29417 MachineFrameInfo &MFI = MF.getFrameInfo();
29418 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
29419 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
29420 EVT VT = Op.getValueType();
29421
29422 MFI.setFrameAddressIsTaken(true);
29423
29424 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
29425 // Depth > 0 makes no sense on targets which use Windows unwind codes. It
29426 // is not possible to crawl up the stack without looking at the unwind codes
29427 // simultaneously.
29428 int FrameAddrIndex = FuncInfo->getFAIndex();
29429 if (!FrameAddrIndex) {
29430 // Set up a frame object for the return address.
29431 unsigned SlotSize = RegInfo->getSlotSize();
29432 FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
29433 SlotSize, /*SPOffset=*/0, /*IsImmutable=*/false);
29434 FuncInfo->setFAIndex(FrameAddrIndex);
29435 }
29436 return DAG.getFrameIndex(FrameAddrIndex, VT);
29437 }
29438
29439 unsigned FrameReg =
29440 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
29441 SDLoc dl(Op); // FIXME probably not meaningful
29442 unsigned Depth = Op.getConstantOperandVal(0);
29443 assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
29444 (FrameReg == X86::EBP && VT == MVT::i32)) &&
29445 "Invalid Frame Register!");
29446 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
29447 while (Depth--)
29448 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
29449 MachinePointerInfo());
29450 return FrameAddr;
29451}
29452
29453// FIXME? Maybe this could be a TableGen attribute on some registers and
29454// this table could be generated automatically from RegInfo.
29455Register X86TargetLowering::getRegisterByName(const char* RegName, LLT VT,
29456 const MachineFunction &MF) const {
29457 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
29458
29459 Register Reg = StringSwitch<unsigned>(RegName)
29460 .Case("esp", X86::ESP)
29461 .Case("rsp", X86::RSP)
29462 .Case("ebp", X86::EBP)
29463 .Case("rbp", X86::RBP)
29464 .Default(0);
29465
29466 if (Reg == X86::EBP || Reg == X86::RBP) {
29467 if (!TFI.hasFP(MF))
29468 report_fatal_error("register " + StringRef(RegName) +
29469 " is allocatable: function has no frame pointer");
29470#ifndef NDEBUG
29471 else {
29472 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
29473 Register FrameReg = RegInfo->getPtrSizedFrameRegister(MF);
29474 assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
29475 "Invalid Frame Register!");
29476 }
29477#endif
29478 }
29479
29480 if (Reg)
29481 return Reg;
29482
29483 report_fatal_error("Invalid register name global variable");
29484}
29485
29486SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
29487 SelectionDAG &DAG) const {
29488 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
29489 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
29490}
29491
29492Register X86TargetLowering::getExceptionPointerRegister(
29493 const Constant *PersonalityFn) const {
29494 if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
29495 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
29496
29497 return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
29498}
29499
29500Register X86TargetLowering::getExceptionSelectorRegister(
29501 const Constant *PersonalityFn) const {
29502 // Funclet personalities don't use selectors (the runtime does the selection).
29503 if (isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)))
29504 return X86::NoRegister;
29505 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
29506}
29507
29508bool X86TargetLowering::needsFixedCatchObjects() const {
29509 return Subtarget.isTargetWin64();
29510}
29511
29512SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
29513 SDValue Chain = Op.getOperand(0);
29514 SDValue Offset = Op.getOperand(1);
29515 SDValue Handler = Op.getOperand(2);
29516 SDLoc dl (Op);
29517
29518 EVT PtrVT = getPointerTy(DAG.getDataLayout());
29519 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
29520 Register FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
29521 assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
29522 (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
29523 "Invalid Frame Register!");
29524 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
29525 Register StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
29526
29527 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
29528 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
29529 dl));
29530 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
29531 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
29532 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
29533
29534 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
29535 DAG.getRegister(StoreAddrReg, PtrVT));
29536}
29537
29538SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
29539 SelectionDAG &DAG) const {
29540 SDLoc DL(Op);
29541 // If the subtarget is not 64bit, we may need the global base reg
29542 // after isel expand pseudo, i.e., after CGBR pass ran.
29543 // Therefore, ask for the GlobalBaseReg now, so that the pass
29544 // inserts the code for us in case we need it.
29545 // Otherwise, we will end up in a situation where we will
29546 // reference a virtual register that is not defined!
29547 if (!Subtarget.is64Bit()) {
29548 const X86InstrInfo *TII = Subtarget.getInstrInfo();
29549 (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
29550 }
29551 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
29552 DAG.getVTList(MVT::i32, MVT::Other),
29553 Op.getOperand(0), Op.getOperand(1));
29554}
29555
29556SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
29557 SelectionDAG &DAG) const {
29558 SDLoc DL(Op);
29559 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
29560 Op.getOperand(0), Op.getOperand(1));
29561}
29562
29563SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
29564 SelectionDAG &DAG) const {
29565 SDLoc DL(Op);
29566 return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
29567 Op.getOperand(0));
29568}
29569
29570static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
29571 return Op.getOperand(0);
29572}
29573
29574SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
29575 SelectionDAG &DAG) const {
29576 SDValue Root = Op.getOperand(0);
29577 SDValue Trmp = Op.getOperand(1); // trampoline
29578 SDValue FPtr = Op.getOperand(2); // nested function
29579 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
29580 SDLoc dl (Op);
29581
29582 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
29583 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
29584
29585 if (Subtarget.is64Bit()) {
29586 SDValue OutChains[6];
29587
29588 // Large code-model.
29589 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
29590 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
29591
29592 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
29593 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
29594
29595 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
29596
29597 // Load the pointer to the nested function into R11.
29598 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
29599 SDValue Addr = Trmp;
29600 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
29601 Addr, MachinePointerInfo(TrmpAddr));
29602
29603 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
29604 DAG.getConstant(2, dl, MVT::i64));
29605 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
29606 MachinePointerInfo(TrmpAddr, 2), Align(2));
29607
29608 // Load the 'nest' parameter value into R10.
29609 // R10 is specified in X86CallingConv.td
29610 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
29611 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
29612 DAG.getConstant(10, dl, MVT::i64));
29613 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
29614 Addr, MachinePointerInfo(TrmpAddr, 10));
29615
29616 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
29617 DAG.getConstant(12, dl, MVT::i64));
29618 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
29619 MachinePointerInfo(TrmpAddr, 12), Align(2));
29620
29621 // Jump to the nested function.
29622 OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
29623 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
29624 DAG.getConstant(20, dl, MVT::i64));
29625 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
29626 Addr, MachinePointerInfo(TrmpAddr, 20));
29627
29628 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
29629 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
29630 DAG.getConstant(22, dl, MVT::i64));
29631 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
29632 Addr, MachinePointerInfo(TrmpAddr, 22));
29633
29634 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
29635 } else {
29636 const Function *Func =
29637 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
29638 CallingConv::ID CC = Func->getCallingConv();
29639 unsigned NestReg;
29640
29641 switch (CC) {
29642 default:
29643 llvm_unreachable("Unsupported calling convention")::llvm::llvm_unreachable_internal("Unsupported calling convention"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 29643)
;
29644 case CallingConv::C:
29645 case CallingConv::X86_StdCall: {
29646 // Pass 'nest' parameter in ECX.
29647 // Must be kept in sync with X86CallingConv.td
29648 NestReg = X86::ECX;
29649
29650 // Check that ECX wasn't needed by an 'inreg' parameter.
29651 FunctionType *FTy = Func->getFunctionType();
29652 const AttributeList &Attrs = Func->getAttributes();
29653
29654 if (!Attrs.isEmpty() && !Func->isVarArg()) {
29655 unsigned InRegCount = 0;
29656 unsigned Idx = 0;
29657
29658 for (FunctionType::param_iterator I = FTy->param_begin(),
29659 E = FTy->param_end(); I != E; ++I, ++Idx)
29660 if (Attrs.hasParamAttr(Idx, Attribute::InReg)) {
29661 const DataLayout &DL = DAG.getDataLayout();
29662 // FIXME: should only count parameters that are lowered to integers.
29663 InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
29664 }
29665
29666 if (InRegCount > 2) {
29667 report_fatal_error("Nest register in use - reduce number of inreg"
29668 " parameters!");
29669 }
29670 }
29671 break;
29672 }
29673 case CallingConv::X86_FastCall:
29674 case CallingConv::X86_ThisCall:
29675 case CallingConv::Fast:
29676 case CallingConv::Tail:
29677 case CallingConv::SwiftTail:
29678 // Pass 'nest' parameter in EAX.
29679 // Must be kept in sync with X86CallingConv.td
29680 NestReg = X86::EAX;
29681 break;
29682 }
29683
29684 SDValue OutChains[4];
29685 SDValue Addr, Disp;
29686
29687 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
29688 DAG.getConstant(10, dl, MVT::i32));
29689 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
29690
29691 // This is storing the opcode for MOV32ri.
29692 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
29693 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
29694 OutChains[0] =
29695 DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
29696 Trmp, MachinePointerInfo(TrmpAddr));
29697
29698 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
29699 DAG.getConstant(1, dl, MVT::i32));
29700 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
29701 MachinePointerInfo(TrmpAddr, 1), Align(1));
29702
29703 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
29704 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
29705 DAG.getConstant(5, dl, MVT::i32));
29706 OutChains[2] =
29707 DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8), Addr,
29708 MachinePointerInfo(TrmpAddr, 5), Align(1));
29709
29710 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
29711 DAG.getConstant(6, dl, MVT::i32));
29712 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
29713 MachinePointerInfo(TrmpAddr, 6), Align(1));
29714
29715 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
29716 }
29717}
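// A hedged sketch of the 23 trampoline bytes the 64-bit path above emits,
// reconstructed only from the offsets and opcode constants used in the stores
// (shown purely as an illustration of the encoding):
//   offset  0: 49 BB            movabsq r11, <FPtr>   (REX.WB, MOV64ri|r11)
//   offset  2: <FPtr, 8 bytes>
//   offset 10: 49 BA            movabsq r10, <Nest>   (REX.WB, MOV64ri|r10)
//   offset 12: <Nest, 8 bytes>
//   offset 20: 49 FF            jmpq *r11             (REX.WB, JMP64r)
//   offset 22: E3               ModRM (mod=11, reg=/4, rm=r11)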
29718
29719SDValue X86TargetLowering::LowerGET_ROUNDING(SDValue Op,
29720 SelectionDAG &DAG) const {
29721 /*
29722 The rounding mode is in bits 11:10 of FPSR, and has the following
29723 settings:
29724 00 Round to nearest
29725 01 Round to -inf
29726 10 Round to +inf
29727 11 Round to 0
29728
29729 GET_ROUNDING, on the other hand, expects the following:
29730 -1 Undefined
29731 0 Round to 0
29732 1 Round to nearest
29733 2 Round to +inf
29734 3 Round to -inf
29735
29736 To perform the conversion, we use a packed lookup table of the four 2-bit
29737 values that we can index by FPSR[11:10]
29738 0x2d --> (0b00,10,11,01) --> (0,2,3,1) >> FPSR[11:10]
29739
29740 (0x2d >> ((FPSR & 0xc00) >> 9)) & 3
29741 */
29742
29743 MachineFunction &MF = DAG.getMachineFunction();
29744 MVT VT = Op.getSimpleValueType();
29745 SDLoc DL(Op);
29746
29747 // Save FP Control Word to stack slot
29748 int SSFI = MF.getFrameInfo().CreateStackObject(2, Align(2), false);
29749 SDValue StackSlot =
29750 DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
29751
29752 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
29753
29754 SDValue Chain = Op.getOperand(0);
29755 SDValue Ops[] = {Chain, StackSlot};
29756 Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
29757 DAG.getVTList(MVT::Other), Ops, MVT::i16, MPI,
29758 Align(2), MachineMemOperand::MOStore);
29759
29760 // Load FP Control Word from stack slot
29761 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI, Align(2));
29762 Chain = CWD.getValue(1);
29763
29764 // Mask and turn the control bits into a shift for the lookup table.
29765 SDValue Shift =
29766 DAG.getNode(ISD::SRL, DL, MVT::i16,
29767 DAG.getNode(ISD::AND, DL, MVT::i16,
29768 CWD, DAG.getConstant(0xc00, DL, MVT::i16)),
29769 DAG.getConstant(9, DL, MVT::i8));
29770 Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Shift);
29771
29772 SDValue LUT = DAG.getConstant(0x2d, DL, MVT::i32);
29773 SDValue RetVal =
29774 DAG.getNode(ISD::AND, DL, MVT::i32,
29775 DAG.getNode(ISD::SRL, DL, MVT::i32, LUT, Shift),
29776 DAG.getConstant(3, DL, MVT::i32));
29777
29778 RetVal = DAG.getZExtOrTrunc(RetVal, DL, VT);
29779
29780 return DAG.getMergeValues({RetVal, Chain}, DL);
29781}
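// Illustrative scalar sketch of the lookup described at the top of
// LowerGET_ROUNDING above; `roundingFromX87CW` is a hypothetical helper (not
// used by the lowering) mapping a raw x87 control word to GET_ROUNDING's
// encoding.
static inline int roundingFromX87CW(unsigned CW) {
  // The RC field sits in bits 11:10. Shifting right by 9 instead of 10 scales
  // the 2-bit field by two, so it indexes the 2-bit entries packed into 0x2d.
  return (0x2d >> ((CW & 0xc00) >> 9)) & 3;
}
// RC=00 -> 1 (nearest), RC=01 -> 3 (-inf), RC=10 -> 2 (+inf), RC=11 -> 0 (zero).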
29782
29783SDValue X86TargetLowering::LowerSET_ROUNDING(SDValue Op,
29784 SelectionDAG &DAG) const {
29785 MachineFunction &MF = DAG.getMachineFunction();
29786 SDLoc DL(Op);
29787 SDValue Chain = Op.getNode()->getOperand(0);
29788
29789 // FP control word may be set only from data in memory. So we need to allocate
29790 // stack space to save/load FP control word.
29791 int OldCWFrameIdx = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
29792 SDValue StackSlot =
29793 DAG.getFrameIndex(OldCWFrameIdx, getPointerTy(DAG.getDataLayout()));
29794 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, OldCWFrameIdx);
29795 MachineMemOperand *MMO =
29796 MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 2, Align(2));
29797
29798 // Store FP control word into memory.
29799 SDValue Ops[] = {Chain, StackSlot};
29800 Chain = DAG.getMemIntrinsicNode(
29801 X86ISD::FNSTCW16m, DL, DAG.getVTList(MVT::Other), Ops, MVT::i16, MMO);
29802
29803 // Load FP Control Word from stack slot and clear RM field (bits 11:10).
29804 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI);
29805 Chain = CWD.getValue(1);
29806 CWD = DAG.getNode(ISD::AND, DL, MVT::i16, CWD.getValue(0),
29807 DAG.getConstant(0xf3ff, DL, MVT::i16));
29808
29809 // Calculate new rounding mode.
29810 SDValue NewRM = Op.getNode()->getOperand(1);
29811 SDValue RMBits;
29812 if (auto *CVal = dyn_cast<ConstantSDNode>(NewRM)) {
29813 uint64_t RM = CVal->getZExtValue();
29814 int FieldVal;
29815 switch (static_cast<RoundingMode>(RM)) {
29816 case RoundingMode::NearestTiesToEven: FieldVal = X86::rmToNearest; break;
29817 case RoundingMode::TowardNegative: FieldVal = X86::rmDownward; break;
29818 case RoundingMode::TowardPositive: FieldVal = X86::rmUpward; break;
29819 case RoundingMode::TowardZero: FieldVal = X86::rmTowardZero; break;
29820 default:
29821 llvm_unreachable("rounding mode is not supported by X86 hardware")::llvm::llvm_unreachable_internal("rounding mode is not supported by X86 hardware"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 29821)
;
29822 }
29823 RMBits = DAG.getConstant(FieldVal, DL, MVT::i16);
29824 } else {
29825 // Need to convert argument into bits of control word:
29826 // 0 Round to 0 -> 11
29827 // 1 Round to nearest -> 00
29828 // 2 Round to +inf -> 10
29829 // 3 Round to -inf -> 01
29830 // The 2-bit value then needs to be shifted so that it occupies bits 11:10.
29831 // To perform the conversion, pack all these values into the constant 0xc9 and shift
29832 // it left depending on the rounding mode:
29833 // (0xc9 << 4) & 0xc00 = X86::rmTowardZero
29834 // (0xc9 << 6) & 0xc00 = X86::rmToNearest
29835 // ...
29836 // (0xc9 << (2 * NewRM + 4)) & 0xc00
29837 SDValue ShiftValue =
29838 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
29839 DAG.getNode(ISD::ADD, DL, MVT::i32,
29840 DAG.getNode(ISD::SHL, DL, MVT::i32, NewRM,
29841 DAG.getConstant(1, DL, MVT::i8)),
29842 DAG.getConstant(4, DL, MVT::i32)));
29843 SDValue Shifted =
29844 DAG.getNode(ISD::SHL, DL, MVT::i16, DAG.getConstant(0xc9, DL, MVT::i16),
29845 ShiftValue);
29846 RMBits = DAG.getNode(ISD::AND, DL, MVT::i16, Shifted,
29847 DAG.getConstant(0xc00, DL, MVT::i16));
29848 }
29849
29850 // Update rounding mode bits and store the new FP Control Word into stack.
29851 CWD = DAG.getNode(ISD::OR, DL, MVT::i16, CWD, RMBits);
29852 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, Align(2));
29853
29854 // Load FP control word from the slot.
29855 SDValue OpsLD[] = {Chain, StackSlot};
29856 MachineMemOperand *MMOL =
29857 MF.getMachineMemOperand(MPI, MachineMemOperand::MOLoad, 2, Align(2));
29858 Chain = DAG.getMemIntrinsicNode(
29859 X86ISD::FLDCW16m, DL, DAG.getVTList(MVT::Other), OpsLD, MVT::i16, MMOL);
29860
29861 // If target supports SSE, set MXCSR as well. Rounding mode is encoded in the
29862 // same way but in bits 14:13.
29863 if (Subtarget.hasSSE1()) {
29864 // Store MXCSR into memory.
29865 Chain = DAG.getNode(
29866 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
29867 DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
29868 StackSlot);
29869
29870 // Load MXCSR from stack slot and clear RM field (bits 14:13).
29871 SDValue CWD = DAG.getLoad(MVT::i32, DL, Chain, StackSlot, MPI);
29872 Chain = CWD.getValue(1);
29873 CWD = DAG.getNode(ISD::AND, DL, MVT::i32, CWD.getValue(0),
29874 DAG.getConstant(0xffff9fff, DL, MVT::i32));
29875
29876 // Shift X87 RM bits from 11:10 to 14:13.
29877 RMBits = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, RMBits);
29878 RMBits = DAG.getNode(ISD::SHL, DL, MVT::i32, RMBits,
29879 DAG.getConstant(3, DL, MVT::i8));
29880
29881 // Update rounding mode bits and store the new FP Control Word into stack.
29882 CWD = DAG.getNode(ISD::OR, DL, MVT::i32, CWD, RMBits);
29883 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, Align(4));
29884
29885 // Load MXCSR from the slot.
29886 Chain = DAG.getNode(
29887 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
29888 DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
29889 StackSlot);
29890 }
29891
29892 return Chain;
29893}
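// Illustrative scalar sketch (hypothetical helper, not used by the lowering)
// of the 0xc9 trick described in the non-constant path above, mapping the
// generic rounding-mode argument (0=zero, 1=nearest, 2=+inf, 3=-inf) to the
// x87 RC bits 11:10.
static inline unsigned x87RMBitsFromGenericRM(unsigned NewRM) {
  return (0xc9u << (2 * NewRM + 4)) & 0xc00u;
}
// x87RMBitsFromGenericRM(0)==0xc00, (1)==0x000, (2)==0x800, (3)==0x400; the
// SSE path then shifts the same two bits left by 3 to reach MXCSR bits 14:13.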
29894
29895 /// Lower a vector CTLZ using the natively supported vector CTLZ instruction.
29896 //
29897 // i8/i16 vectors are implemented using the dword LZCNT vector instruction
29898 // ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
29899 // split the vector, perform the operation on its Lo and Hi parts, and
29900// concatenate the results.
29901static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
29902 const X86Subtarget &Subtarget) {
29903 assert(Op.getOpcode() == ISD::CTLZ);
29904 SDLoc dl(Op);
29905 MVT VT = Op.getSimpleValueType();
29906 MVT EltVT = VT.getVectorElementType();
29907 unsigned NumElems = VT.getVectorNumElements();
29908
29909 assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
29910 "Unsupported element type");
29911
29912 // Split the vector; its Lo and Hi parts will be handled in the next iteration.
29913 if (NumElems > 16 ||
29914 (NumElems == 16 && !Subtarget.canExtendTo512DQ()))
29915 return splitVectorIntUnary(Op, DAG);
29916
29917 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
29918 assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
29919 "Unsupported value type for operation");
29920
29921 // Use native supported vector instruction vplzcntd.
29922 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
29923 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
29924 SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
29925 SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
29926
29927 return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
29928}
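// Worked example of the formula above for an i8 element X == 0x01:
// ctlz_i32(zext32(X)) == 31, and after the TRUNCATE the SUB of
// Delta == 32 - 8 == 24 yields 7 == ctlz_i8(X).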
29929
29930// Lower CTLZ using a PSHUFB lookup table implementation.
29931static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
29932 const X86Subtarget &Subtarget,
29933 SelectionDAG &DAG) {
29934 MVT VT = Op.getSimpleValueType();
29935 int NumElts = VT.getVectorNumElements();
29936 int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
29937 MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
29938
29939 // Per-nibble leading zero PSHUFB lookup table.
29940 const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
29941 /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
29942 /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
29943 /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
29944
29945 SmallVector<SDValue, 64> LUTVec;
29946 for (int i = 0; i < NumBytes; ++i)
29947 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
29948 SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
29949
29950 // Begin by bitcasting the input to a byte vector, then split those bytes
29951 // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
29952 // If the hi input nibble is zero then we add both results together, otherwise
29953 // we just take the hi result (by masking the lo result to zero before the
29954 // add).
29955 SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
29956 SDValue Zero = DAG.getConstant(0, DL, CurrVT);
29957
29958 SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
29959 SDValue Lo = Op0;
29960 SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
29961 SDValue HiZ;
29962 if (CurrVT.is512BitVector()) {
29963 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
29964 HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
29965 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
29966 } else {
29967 HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
29968 }
29969
29970 Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
29971 Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
29972 Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
29973 SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
29974
29975 // Merge the result from vXi8 back to VT, working on the lo/hi halves
29976 // of the current vector width in the same way we did for the nibbles.
29977 // If the upper half of the input element is zero then add the halves'
29978 // leading zero counts together, otherwise just use the upper half's.
29979 // Double the width of the result until we are at target width.
29980 while (CurrVT != VT) {
29981 int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
29982 int CurrNumElts = CurrVT.getVectorNumElements();
29983 MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
29984 MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
29985 SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
29986
29987 // Check if the upper half of the input element is zero.
29988 if (CurrVT.is512BitVector()) {
29989 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
29990 HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
29991 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
29992 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
29993 } else {
29994 HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
29995 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
29996 }
29997 HiZ = DAG.getBitcast(NextVT, HiZ);
29998
29999 // Move the upper/lower halves to the lower bits as we'll be extending to
30000 // NextVT. Mask the lower result to zero if HiZ is true and add the results
30001 // together.
30002 SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
30003 SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
30004 SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
30005 R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
30006 Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
30007 CurrVT = NextVT;
30008 }
30009
30010 return Res;
30011}
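// Illustrative scalar equivalent (hypothetical helper, not used by the
// lowering) of the per-nibble combine above for a single byte: take the hi
// nibble's LUT value, and only add the lo nibble's value when the hi nibble
// is zero.
static inline int ctlz8ViaNibbleLUT(unsigned char V) {
  static const int LUT[16] = {4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0};
  int Hi = LUT[V >> 4];
  int Lo = LUT[V & 0xf];
  return (V >> 4) == 0 ? Hi + Lo : Hi;
}
// e.g. ctlz8ViaNibbleLUT(0x01)==7, (0x10)==3, (0x80)==0, (0x00)==8.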
30012
30013static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
30014 const X86Subtarget &Subtarget,
30015 SelectionDAG &DAG) {
30016 MVT VT = Op.getSimpleValueType();
30017
30018 if (Subtarget.hasCDI() &&
30019 // vXi8 vectors need to be promoted to vXi32, which requires 512-bit vectors.
30020 (Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8))
30021 return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget);
30022
30023 // Decompose 256-bit ops into smaller 128-bit ops.
30024 if (VT.is256BitVector() && !Subtarget.hasInt256())
30025 return splitVectorIntUnary(Op, DAG);
30026
30027 // Decompose 512-bit ops into smaller 256-bit ops.
30028 if (VT.is512BitVector() && !Subtarget.hasBWI())
30029 return splitVectorIntUnary(Op, DAG);
30030
30031 assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
30032 return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
30033}
30034
30035static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
30036 SelectionDAG &DAG) {
30037 MVT VT = Op.getSimpleValueType();
30038 MVT OpVT = VT;
30039 unsigned NumBits = VT.getSizeInBits();
30040 SDLoc dl(Op);
30041 unsigned Opc = Op.getOpcode();
30042
30043 if (VT.isVector())
30044 return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
30045
30046 Op = Op.getOperand(0);
30047 if (VT == MVT::i8) {
30048 // Zero extend to i32 since there is no i8 bsr.
30049 OpVT = MVT::i32;
30050 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
30051 }
30052
30053 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
30054 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
30055 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
30056
30057 if (Opc == ISD::CTLZ) {
30058 // If src is zero (i.e. bsr sets ZF), returns NumBits.
30059 SDValue Ops[] = {Op, DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
30060 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
30061 Op.getValue(1)};
30062 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
30063 }
30064
30065 // Finally xor with NumBits-1.
30066 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
30067 DAG.getConstant(NumBits - 1, dl, OpVT));
30068
30069 if (VT == MVT::i8)
30070 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
30071 return Op;
30072}
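// Worked example for i32: bsr(0x00010000) == 16 and 31 ^ 16 == 15 == ctlz.
// For a zero input the CMOV substitutes 2*32 - 1 == 63, and 63 ^ 31 == 32,
// matching ISD::CTLZ's result for zero. The XOR works because NumBits - 1 is
// all ones, so (NumBits - 1) - bsr(X) equals (NumBits - 1) ^ bsr(X).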
30073
30074static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
30075 SelectionDAG &DAG) {
30076 MVT VT = Op.getSimpleValueType();
30077 unsigned NumBits = VT.getScalarSizeInBits();
30078 SDValue N0 = Op.getOperand(0);
30079 SDLoc dl(Op);
30080
30081 assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ &&
30082 "Only scalar CTTZ requires custom lowering");
30083
30084 // Issue a bsf (scan bits forward) which also sets EFLAGS.
30085 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
30086 Op = DAG.getNode(X86ISD::BSF, dl, VTs, N0);
30087
30088 // If src is known never zero we can skip the CMOV.
30089 if (DAG.isKnownNeverZero(N0))
30090 return Op;
30091
30092 // If src is zero (i.e. bsf sets ZF), returns NumBits.
30093 SDValue Ops[] = {Op, DAG.getConstant(NumBits, dl, VT),
30094 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
30095 Op.getValue(1)};
30096 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
30097}
30098
30099static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG,
30100 const X86Subtarget &Subtarget) {
30101 MVT VT = Op.getSimpleValueType();
30102 if (VT == MVT::i16 || VT == MVT::i32)
30103 return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
30104
30105 if (VT == MVT::v32i16 || VT == MVT::v64i8)
30106 return splitVectorIntBinary(Op, DAG);
30107
30108 assert(Op.getSimpleValueType().is256BitVector() &&
30109 Op.getSimpleValueType().isInteger() &&
30110 "Only handle AVX 256-bit vector integer operation");
30111 return splitVectorIntBinary(Op, DAG);
30112}
30113
30114static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG,
30115 const X86Subtarget &Subtarget) {
30116 MVT VT = Op.getSimpleValueType();
30117 SDValue X = Op.getOperand(0), Y = Op.getOperand(1);
30118 unsigned Opcode = Op.getOpcode();
30119 SDLoc DL(Op);
30120
30121 if (VT == MVT::v32i16 || VT == MVT::v64i8 ||
30122 (VT.is256BitVector() && !Subtarget.hasInt256())) {
30123 assert(Op.getSimpleValueType().isInteger() &&
30124 "Only handle AVX vector integer operation");
30125 return splitVectorIntBinary(Op, DAG);
30126 }
30127
30128 // Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*.
30129 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30130 EVT SetCCResultType =
30131 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
30132
30133 unsigned BitWidth = VT.getScalarSizeInBits();
30134 if (Opcode == ISD::USUBSAT) {
30135 if (!TLI.isOperationLegal(ISD::UMAX, VT) || useVPTERNLOG(Subtarget, VT)) {
30136 // Handle a special-case with a bit-hack instead of cmp+select:
30137 // usubsat X, SMIN --> (X ^ SMIN) & (X s>> BW-1)
30138 // If the target can use VPTERNLOG, DAGToDAG will match this as
30139 // "vpsra + vpternlog" which is better than "vpmax + vpsub" with a
30140 // "broadcast" constant load.
30141 ConstantSDNode *C = isConstOrConstSplat(Y, true);
30142 if (C && C->getAPIntValue().isSignMask()) {
30143 SDValue SignMask = DAG.getConstant(C->getAPIntValue(), DL, VT);
30144 SDValue ShiftAmt = DAG.getConstant(BitWidth - 1, DL, VT);
30145 SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, X, SignMask);
30146 SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, X, ShiftAmt);
30147 return DAG.getNode(ISD::AND, DL, VT, Xor, Sra);
30148 }
30149 }
30150 if (!TLI.isOperationLegal(ISD::UMAX, VT)) {
30151 // usubsat X, Y --> (X >u Y) ? X - Y : 0
30152 SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y);
30153 SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT);
30154 // TODO: Move this to DAGCombiner?
30155 if (SetCCResultType == VT &&
30156 DAG.ComputeNumSignBits(Cmp) == VT.getScalarSizeInBits())
30157 return DAG.getNode(ISD::AND, DL, VT, Cmp, Sub);
30158 return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT));
30159 }
30160 }
30161
30162 if ((Opcode == ISD::SADDSAT || Opcode == ISD::SSUBSAT) &&
30163 (!VT.isVector() || VT == MVT::v2i64)) {
30164 APInt MinVal = APInt::getSignedMinValue(BitWidth);
30165 APInt MaxVal = APInt::getSignedMaxValue(BitWidth);
30166 SDValue Zero = DAG.getConstant(0, DL, VT);
30167 SDValue Result =
30168 DAG.getNode(Opcode == ISD::SADDSAT ? ISD::SADDO : ISD::SSUBO, DL,
30169 DAG.getVTList(VT, SetCCResultType), X, Y);
30170 SDValue SumDiff = Result.getValue(0);
30171 SDValue Overflow = Result.getValue(1);
30172 SDValue SatMin = DAG.getConstant(MinVal, DL, VT);
30173 SDValue SatMax = DAG.getConstant(MaxVal, DL, VT);
30174 SDValue SumNeg =
30175 DAG.getSetCC(DL, SetCCResultType, SumDiff, Zero, ISD::SETLT);
30176 Result = DAG.getSelect(DL, VT, SumNeg, SatMax, SatMin);
30177 return DAG.getSelect(DL, VT, Overflow, Result, SumDiff);
30178 }
30179
30180 // Use default expansion.
30181 return SDValue();
30182}
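// Illustrative scalar version (hypothetical helper, not used by the lowering)
// of the usubsat bit-hack above for i8 with Y == 0x80 (the sign mask):
// usubsat(X, 0x80) is X - 0x80 when the sign bit of X is set and 0 otherwise,
// which is exactly (X ^ 0x80) & (X s>> 7).
static inline unsigned char usubsatSignMask8(unsigned char X) {
  unsigned char Sra = (X & 0x80) ? 0xff : 0x00; // models the arithmetic shift
  return (unsigned char)((X ^ 0x80) & Sra);
}
// e.g. usubsatSignMask8(0x90) == 0x10 and usubsatSignMask8(0x7f) == 0x00.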
30183
30184static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
30185 SelectionDAG &DAG) {
30186 MVT VT = Op.getSimpleValueType();
30187 if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
30188 // Since X86 does not have CMOV for 8-bit integer, we don't convert
30189 // 8-bit integer abs to NEG and CMOV.
30190 SDLoc DL(Op);
30191 SDValue N0 = Op.getOperand(0);
30192 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
30193 DAG.getConstant(0, DL, VT), N0);
30194 SDValue Ops[] = {N0, Neg, DAG.getTargetConstant(X86::COND_NS, DL, MVT::i8),
30195 SDValue(Neg.getNode(), 1)};
30196 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
30197 }
30198
30199 // ABS(vXi64 X) --> VPBLENDVPD(X, 0-X, X).
30200 if ((VT == MVT::v2i64 || VT == MVT::v4i64) && Subtarget.hasSSE41()) {
30201 SDLoc DL(Op);
30202 SDValue Src = Op.getOperand(0);
30203 SDValue Sub =
30204 DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Src);
30205 return DAG.getNode(X86ISD::BLENDV, DL, VT, Src, Sub, Src);
30206 }
30207
30208 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
30209 assert(VT.isInteger() &&
30210 "Only handle AVX 256-bit vector integer operation");
30211 return splitVectorIntUnary(Op, DAG);
30212 }
30213
30214 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
30215 return splitVectorIntUnary(Op, DAG);
30216
30217 // Default to expand.
30218 return SDValue();
30219}
30220
30221static SDValue LowerAVG(SDValue Op, const X86Subtarget &Subtarget,
30222 SelectionDAG &DAG) {
30223 MVT VT = Op.getSimpleValueType();
30224
30225 // For AVX1 cases, split to use legal ops.
30226 if (VT.is256BitVector() && !Subtarget.hasInt256())
30227 return splitVectorIntBinary(Op, DAG);
30228
30229 if (VT == MVT::v32i16 || VT == MVT::v64i8)
30230 return splitVectorIntBinary(Op, DAG);
30231
30232 // Default to expand.
30233 return SDValue();
30234}
30235
30236static SDValue LowerMINMAX(SDValue Op, const X86Subtarget &Subtarget,
30237 SelectionDAG &DAG) {
30238 MVT VT = Op.getSimpleValueType();
30239
30240 // For AVX1 cases, split to use legal ops.
30241 if (VT.is256BitVector() && !Subtarget.hasInt256())
30242 return splitVectorIntBinary(Op, DAG);
30243
30244 if (VT == MVT::v32i16 || VT == MVT::v64i8)
30245 return splitVectorIntBinary(Op, DAG);
30246
30247 // Default to expand.
30248 return SDValue();
30249}
30250
30251static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget,
30252 SelectionDAG &DAG) {
30253 assert((Op.getOpcode() == ISD::FMAXIMUM || Op.getOpcode() == ISD::FMINIMUM) &&
30254 "Expected FMAXIMUM or FMINIMUM opcode");
30255 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30256 EVT VT = Op.getValueType();
30257 SDValue X = Op.getOperand(0);
30258 SDValue Y = Op.getOperand(1);
30259 SDLoc DL(Op);
30260 uint64_t SizeInBits = VT.getFixedSizeInBits();
30261 APInt PreferredZero = APInt::getZero(SizeInBits);
30262 EVT IVT = MVT::getIntegerVT(SizeInBits);
30263 X86ISD::NodeType MinMaxOp;
30264 if (Op.getOpcode() == ISD::FMAXIMUM) {
30265 MinMaxOp = X86ISD::FMAX;
30266 } else {
30267 PreferredZero.setSignBit();
30268 MinMaxOp = X86ISD::FMIN;
30269 }
30270 EVT SetCCType =
30271 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
30272
30273 // The tables below show the expected result of Max in cases of NaN and
30274 // signed zeros.
30275 //
30276 // Y Y
30277 // Num xNaN +0 -0
30278 // --------------- ---------------
30279 // Num | Max | Y | +0 | +0 | +0 |
30280 // X --------------- X ---------------
30281 // xNaN | X | X/Y | -0 | +0 | -0 |
30282 // --------------- ---------------
30283 //
30284 // It is achieved by means of FMAX/FMIN with preliminary checks and operand
30285 // reordering.
30286 //
30287 // We check if any of operands is NaN and return NaN. Then we check if any of
30288 // operands is zero or negative zero (for fmaximum and fminimum respectively)
30289 // to ensure the correct zero is returned.
30290 auto IsPreferredZero = [PreferredZero](SDValue Op) {
30291 Op = peekThroughBitcasts(Op);
30292 if (auto *CstOp = dyn_cast<ConstantFPSDNode>(Op))
30293 return CstOp->getValueAPF().bitcastToAPInt() == PreferredZero;
30294 if (auto *CstOp = dyn_cast<ConstantSDNode>(Op))
30295 return CstOp->getAPIntValue() == PreferredZero;
30296 return false;
30297 };
30298
30299 bool IsXNeverNaN = DAG.isKnownNeverNaN(X);
30300 bool IsYNeverNaN = DAG.isKnownNeverNaN(Y);
30301 bool IgnoreSignedZero = DAG.getTarget().Options.NoSignedZerosFPMath ||
30302 Op->getFlags().hasNoSignedZeros() ||
30303 DAG.isKnownNeverZeroFloat(X) ||
30304 DAG.isKnownNeverZeroFloat(Y);
30305 SDValue NewX, NewY;
30306 if (IgnoreSignedZero || IsPreferredZero(Y)) {
30307 // Operands are already in right order or order does not matter.
30308 NewX = X;
30309 NewY = Y;
30310 } else if (IsPreferredZero(X)) {
30311 NewX = Y;
30312 NewY = X;
30313 } else if ((VT == MVT::f16 || Subtarget.hasDQI()) &&
30314 (Op->getFlags().hasNoNaNs() || IsXNeverNaN || IsYNeverNaN)) {
30315 if (IsXNeverNaN)
30316 std::swap(X, Y);
30317 // VFPCLASSS consumes a vector type, so provide a minimal one corresponding
30318 // to an xmm register.
30319 MVT VectorType = MVT::getVectorVT(VT.getSimpleVT(), 128 / SizeInBits);
30320 SDValue VX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VectorType, X);
30321 // Bits of classes:
30322 // Bits Imm8[0] Imm8[1] Imm8[2] Imm8[3] Imm8[4] Imm8[5] Imm8[6] Imm8[7]
30323 // Class QNAN PosZero NegZero PosINF NegINF Denormal Negative SNAN
30324 SDValue Imm = DAG.getTargetConstant(MinMaxOp == X86ISD::FMAX ? 0b11 : 0b101,
30325 DL, MVT::i32);
30326 SDValue IsNanZero = DAG.getNode(X86ISD::VFPCLASSS, DL, MVT::v1i1, VX, Imm);
30327 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
30328 DAG.getConstant(0, DL, MVT::v8i1), IsNanZero,
30329 DAG.getIntPtrConstant(0, DL));
30330 SDValue NeedSwap = DAG.getBitcast(MVT::i8, Ins);
30331 NewX = DAG.getSelect(DL, VT, NeedSwap, Y, X);
30332 NewY = DAG.getSelect(DL, VT, NeedSwap, X, Y);
30333 return DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags());
30334 } else {
30335 SDValue IsXSigned;
30336 if (Subtarget.is64Bit() || VT != MVT::f64) {
30337 SDValue XInt = DAG.getNode(ISD::BITCAST, DL, IVT, X);
30338 SDValue ZeroCst = DAG.getConstant(0, DL, IVT);
30339 IsXSigned = DAG.getSetCC(DL, SetCCType, XInt, ZeroCst, ISD::SETLT);
30340 } else {
30341 assert(VT == MVT::f64);
30342 SDValue Ins = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v2f64,
30343 DAG.getConstantFP(0, DL, MVT::v2f64), X,
30344 DAG.getIntPtrConstant(0, DL));
30345 SDValue VX = DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, Ins);
30346 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VX,
30347 DAG.getIntPtrConstant(1, DL));
30348 Hi = DAG.getBitcast(MVT::i32, Hi);
30349 SDValue ZeroCst = DAG.getConstant(0, DL, MVT::i32);
30350 EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(),
30351 *DAG.getContext(), MVT::i32);
30352 IsXSigned = DAG.getSetCC(DL, SetCCType, Hi, ZeroCst, ISD::SETLT);
30353 }
30354 if (MinMaxOp == X86ISD::FMAX) {
30355 NewX = DAG.getSelect(DL, VT, IsXSigned, X, Y);
30356 NewY = DAG.getSelect(DL, VT, IsXSigned, Y, X);
30357 } else {
30358 NewX = DAG.getSelect(DL, VT, IsXSigned, Y, X);
30359 NewY = DAG.getSelect(DL, VT, IsXSigned, X, Y);
30360 }
30361 }
30362
30363 bool IgnoreNaN = DAG.getTarget().Options.NoNaNsFPMath ||
30364 Op->getFlags().hasNoNaNs() || (IsXNeverNaN && IsYNeverNaN);
30365
30366 // If we did not reorder the operands for signed-zero handling, we still need
30367 // to handle NaN, and we know that the second operand is not NaN, then put it
30368 // in the first operand so we do not need to handle NaN after the max/min.
30369 if (IgnoreSignedZero && !IgnoreNaN && DAG.isKnownNeverNaN(NewY))
30370 std::swap(NewX, NewY);
30371
30372 SDValue MinMax = DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags());
30373
30374 if (IgnoreNaN || DAG.isKnownNeverNaN(NewX))
30375 return MinMax;
30376
30377 SDValue IsNaN = DAG.getSetCC(DL, SetCCType, NewX, NewX, ISD::SETUO);
30378 return DAG.getSelect(DL, VT, IsNaN, NewX, MinMax);
30379}
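// Illustrative scalar reference (hypothetical helper, not used by the
// lowering) of the fmaximum semantics targeted above, assuming IEEE-754
// arithmetic: NaN propagates and +0.0 compares greater than -0.0.
static inline double refFMaximum(double X, double Y) {
  if (X != X) return X;                   // X is NaN
  if (Y != Y) return Y;                   // Y is NaN
  if (X == 0.0 && Y == 0.0)
    return (1.0 / X) < 0.0 ? Y : X;       // 1.0 / -0.0 is -inf, so prefer Y
  return X > Y ? X : Y;
}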
30380
30381static SDValue LowerABD(SDValue Op, const X86Subtarget &Subtarget,
30382 SelectionDAG &DAG) {
30383 MVT VT = Op.getSimpleValueType();
30384
30385 // For AVX1 cases, split to use legal ops.
30386 if (VT.is256BitVector() && !Subtarget.hasInt256())
30387 return splitVectorIntBinary(Op, DAG);
30388
30389 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.useBWIRegs())
30390 return splitVectorIntBinary(Op, DAG);
30391
30392 SDLoc dl(Op);
30393 bool IsSigned = Op.getOpcode() == ISD::ABDS;
30394 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30395
30396 // TODO: Move to TargetLowering expandABD() once we have ABD promotion.
30397 if (VT.isScalarInteger()) {
30398 unsigned WideBits = std::max<unsigned>(2 * VT.getScalarSizeInBits(), 32u);
30399 MVT WideVT = MVT::getIntegerVT(WideBits);
30400 if (TLI.isTypeLegal(WideVT)) {
30401 // abds(lhs, rhs) -> trunc(abs(sub(sext(lhs), sext(rhs))))
30402 // abdu(lhs, rhs) -> trunc(abs(sub(zext(lhs), zext(rhs))))
30403 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
30404 SDValue LHS = DAG.getFreeze(Op.getOperand(0));
30405 SDValue RHS = DAG.getFreeze(Op.getOperand(1));
30406 LHS = DAG.getNode(ExtOpc, dl, WideVT, LHS);
30407 RHS = DAG.getNode(ExtOpc, dl, WideVT, RHS);
30408 SDValue Diff = DAG.getNode(ISD::SUB, dl, WideVT, LHS, RHS);
30409 SDValue AbsDiff = DAG.getNode(ISD::ABS, dl, WideVT, Diff);
30410 return DAG.getNode(ISD::TRUNCATE, dl, VT, AbsDiff);
30411 }
30412 }
30413
30414 // Default to expand.
30415 return SDValue();
30416}
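// Illustrative scalar version (hypothetical helper, not used by the lowering)
// of the widening pattern above for an unsigned i8 absolute difference:
static inline unsigned char abdu8ViaWiden(unsigned char L, unsigned char R) {
  int Diff = (int)L - (int)R;        // zero-extend into a wider legal type
  return (unsigned char)(Diff < 0 ? -Diff : Diff);
}
// e.g. abdu8ViaWiden(10, 250) == 240; the signed variant sign-extends instead.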
30417
30418static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
30419 SelectionDAG &DAG) {
30420 SDLoc dl(Op);
30421 MVT VT = Op.getSimpleValueType();
30422
30423 // Decompose 256-bit ops into 128-bit ops.
30424 if (VT.is256BitVector() && !Subtarget.hasInt256())
30425 return splitVectorIntBinary(Op, DAG);
30426
30427 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
30428 return splitVectorIntBinary(Op, DAG);
30429
30430 SDValue A = Op.getOperand(0);
30431 SDValue B = Op.getOperand(1);
30432
30433 // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
30434 // vector pairs, multiply and truncate.
30435 if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
30436 unsigned NumElts = VT.getVectorNumElements();
30437
30438 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
30439 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
30440 MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
30441 return DAG.getNode(
30442 ISD::TRUNCATE, dl, VT,
30443 DAG.getNode(ISD::MUL, dl, ExVT,
30444 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, A),
30445 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, B)));
30446 }
30447
30448 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
30449
30450 // Extract the lo/hi parts and any-extend them to i16.
30451 // We're going to mask off the low byte of each result element of the
30452 // pmullw, so it doesn't matter what's in the high byte of each 16-bit
30453 // element.
30454 SDValue Undef = DAG.getUNDEF(VT);
30455 SDValue ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Undef));
30456 SDValue AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Undef));
30457
30458 SDValue BLo, BHi;
30459 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
30460 // If the RHS is a constant, manually unpackl/unpackh.
30461 SmallVector<SDValue, 16> LoOps, HiOps;
30462 for (unsigned i = 0; i != NumElts; i += 16) {
30463 for (unsigned j = 0; j != 8; ++j) {
30464 LoOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl,
30465 MVT::i16));
30466 HiOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl,
30467 MVT::i16));
30468 }
30469 }
30470
30471 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
30472 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
30473 } else {
30474 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Undef));
30475 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Undef));
30476 }
30477
30478 // Multiply, mask the lower 8bits of the lo/hi results and pack.
30479 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
30480 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
30481 return getPack(DAG, Subtarget, dl, VT, RLo, RHi);
30482 }
30483
30484 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
30485 if (VT == MVT::v4i32) {
30486 assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
30487 "Should not custom lower when pmulld is available!");
30488
30489 // Extract the odd parts.
30490 static const int UnpackMask[] = { 1, -1, 3, -1 };
30491 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
30492 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
30493
30494 // Multiply the even parts.
30495 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
30496 DAG.getBitcast(MVT::v2i64, A),
30497 DAG.getBitcast(MVT::v2i64, B));
30498 // Now multiply odd parts.
30499 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
30500 DAG.getBitcast(MVT::v2i64, Aodds),
30501 DAG.getBitcast(MVT::v2i64, Bodds));
30502
30503 Evens = DAG.getBitcast(VT, Evens);
30504 Odds = DAG.getBitcast(VT, Odds);
30505
30506 // Merge the two vectors back together with a shuffle. This expands into 2
30507 // shuffles.
30508 static const int ShufMask[] = { 0, 4, 2, 6 };
30509 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
30510 }
30511
30512 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
30513 "Only know how to lower V2I64/V4I64/V8I64 multiply");
30514 assert(!Subtarget.hasDQI() && "DQI should use MULLQ");
30515
30516 // Ahi = psrlqi(a, 32);
30517 // Bhi = psrlqi(b, 32);
30518 //
30519 // AloBlo = pmuludq(a, b);
30520 // AloBhi = pmuludq(a, Bhi);
30521 // AhiBlo = pmuludq(Ahi, b);
30522 //
30523 // Hi = psllqi(AloBhi + AhiBlo, 32);
30524 // return AloBlo + Hi;
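// [Annotation, not in the original source] Why the pseudo-code above is
// sufficient: writing a = Ahi*2^32 + Alo and b = Bhi*2^32 + Blo gives
//   a*b = Alo*Blo + 2^32*(Alo*Bhi + Ahi*Blo) + 2^64*(Ahi*Bhi),
// and the 2^64 term vanishes modulo 2^64, so the three PMULUDQ products are
// all that is needed for the low 64 bits of each lane.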
30525 KnownBits AKnown = DAG.computeKnownBits(A);
30526 KnownBits BKnown = DAG.computeKnownBits(B);
30527
30528 APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
30529 bool ALoIsZero = LowerBitsMask.isSubsetOf(AKnown.Zero);
30530 bool BLoIsZero = LowerBitsMask.isSubsetOf(BKnown.Zero);
30531
30532 APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
30533 bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero);
30534 bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero);
30535
30536 SDValue Zero = DAG.getConstant(0, dl, VT);
30537
30538 // Only multiply lo/hi halves that aren't known to be zero.
30539 SDValue AloBlo = Zero;
30540 if (!ALoIsZero && !BLoIsZero)
30541 AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
30542
30543 SDValue AloBhi = Zero;
30544 if (!ALoIsZero && !BHiIsZero) {
30545 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
30546 AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
30547 }
30548
30549 SDValue AhiBlo = Zero;
30550 if (!AHiIsZero && !BLoIsZero) {
30551 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
30552 AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
30553 }
30554
30555 SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
30556 Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
30557
30558 return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
30559}
30560
30561static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl,
30562 MVT VT, bool IsSigned,
30563 const X86Subtarget &Subtarget,
30564 SelectionDAG &DAG,
30565 SDValue *Low = nullptr) {
30566 unsigned NumElts = VT.getVectorNumElements();
30567
30568 // For vXi8 we will unpack the low and high half of each 128 bit lane to widen
30569 // to a vXi16 type. Do the multiplies, shift the results and pack the half
30570 // lane results back together.
30571
30572 // We'll take different approaches for signed and unsigned.
30573 // For unsigned we'll use punpcklbw/punpckhbw to zero extend the bytes
30574 // and use pmullw to calculate the full 16-bit product.
30575 // For signed we'll use punpcklbw/punpckhbw to extend the bytes to words and
30576 // shift them left into the upper byte of each word. This allows us to use
30577 // pmulhw to calculate the full 16-bit product. This trick means we don't
30578 // need to sign extend the bytes to use pmullw.
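// [Annotation, not in the original source] Worked i8 example of the signed
// trick: for A = -3, B = 5, the unpack places each byte in the upper half, so
// A16 = -3 << 8 = -768 and B16 = 5 << 8 = 1280; pmulhw then computes
// (-768 * 1280) >> 16 = -983040 >> 16 = -15, the exact 16-bit product (-3)*5.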
30579
30580 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
30581 SDValue Zero = DAG.getConstant(0, dl, VT);
30582
30583 SDValue ALo, AHi;
30584 if (IsSigned) {
30585 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, A));
30586 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, A));
30587 } else {
30588 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Zero));
30589 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Zero));
30590 }
30591
30592 SDValue BLo, BHi;
30593 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
30594 // If the RHS is a constant, manually unpackl/unpackh and extend.
30595 SmallVector<SDValue, 16> LoOps, HiOps;
30596 for (unsigned i = 0; i != NumElts; i += 16) {
30597 for (unsigned j = 0; j != 8; ++j) {
30598 SDValue LoOp = B.getOperand(i + j);
30599 SDValue HiOp = B.getOperand(i + j + 8);
30600
30601 if (IsSigned) {
30602 LoOp = DAG.getAnyExtOrTrunc(LoOp, dl, MVT::i16);
30603 HiOp = DAG.getAnyExtOrTrunc(HiOp, dl, MVT::i16);
30604 LoOp = DAG.getNode(ISD::SHL, dl, MVT::i16, LoOp,
30605 DAG.getConstant(8, dl, MVT::i16));
30606 HiOp = DAG.getNode(ISD::SHL, dl, MVT::i16, HiOp,
30607 DAG.getConstant(8, dl, MVT::i16));
30608 } else {
30609 LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);
30610 HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);
30611 }
30612
30613 LoOps.push_back(LoOp);
30614 HiOps.push_back(HiOp);
30615 }
30616 }
30617
30618 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
30619 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
30620 } else if (IsSigned) {
30621 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, B));
30622 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, B));
30623 } else {
30624 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Zero));
30625 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Zero));
30626 }
30627
30628 // Multiply, lshr the upper 8 bits to the lower 8 bits of the lo/hi results and
30629 // pack back to vXi8.
30630 unsigned MulOpc = IsSigned ? ISD::MULHS : ISD::MUL;
30631 SDValue RLo = DAG.getNode(MulOpc, dl, ExVT, ALo, BLo);
30632 SDValue RHi = DAG.getNode(MulOpc, dl, ExVT, AHi, BHi);
30633
30634 if (Low)
30635 *Low = getPack(DAG, Subtarget, dl, VT, RLo, RHi);
30636
30637 return getPack(DAG, Subtarget, dl, VT, RLo, RHi, /*PackHiHalf*/ true);
30638}
30639
30640static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
30641 SelectionDAG &DAG) {
30642 SDLoc dl(Op);
30643 MVT VT = Op.getSimpleValueType();
30644 bool IsSigned = Op->getOpcode() == ISD::MULHS;
30645 unsigned NumElts = VT.getVectorNumElements();
30646 SDValue A = Op.getOperand(0);
30647 SDValue B = Op.getOperand(1);
30648
30649 // Decompose 256-bit ops into 128-bit ops.
30650 if (VT.is256BitVector() && !Subtarget.hasInt256())
30651 return splitVectorIntBinary(Op, DAG);
30652
30653 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
30654 return splitVectorIntBinary(Op, DAG);
30655
30656 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) {
30657 assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
30658 (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
30659 (VT == MVT::v16i32 && Subtarget.hasAVX512()));
30660
30661 // PMULxD operations multiply each even value (starting at 0) of LHS with
30662 // the related value of RHS and produce a widened result.
30663 // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
30664 // => <2 x i64> <ae|cg>
30665 //
30666 // In other words, to have all the results, we need to perform two PMULxD:
30667 // 1. one with the even values.
30668 // 2. one with the odd values.
30669 // To achieve #2, we need to place the odd values at an even position.
30670 //
30671 // Place the odd value at an even position (basically, shift all values 1
30672 // step to the left):
30673 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1,
30674 9, -1, 11, -1, 13, -1, 15, -1};
30675 // <a|b|c|d> => <b|undef|d|undef>
30676 SDValue Odd0 =
30677 DAG.getVectorShuffle(VT, dl, A, A, ArrayRef(&Mask[0], NumElts));
30678 // <e|f|g|h> => <f|undef|h|undef>
30679 SDValue Odd1 =
30680 DAG.getVectorShuffle(VT, dl, B, B, ArrayRef(&Mask[0], NumElts));
30681
30682 // Emit two multiplies, one for the lower 2 ints and one for the higher 2
30683 // ints.
30684 MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
30685 unsigned Opcode =
30686 (IsSigned && Subtarget.hasSSE41()) ? X86ISD::PMULDQ : X86ISD::PMULUDQ;
30687 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
30688 // => <2 x i64> <ae|cg>
30689 SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
30690 DAG.getBitcast(MulVT, A),
30691 DAG.getBitcast(MulVT, B)));
30692 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
30693 // => <2 x i64> <bf|dh>
30694 SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
30695 DAG.getBitcast(MulVT, Odd0),
30696 DAG.getBitcast(MulVT, Odd1)));
30697
30698 // Shuffle it back into the right order.
30699 SmallVector<int, 16> ShufMask(NumElts);
30700 for (int i = 0; i != (int)NumElts; ++i)
30701 ShufMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
30702
30703 SDValue Res = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, ShufMask);
30704
30705 // If we have a signed multiply but no PMULDQ, fix up the result of an
30706 // unsigned multiply.
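// [Annotation, not in the original source] Illustration of the fixup:
// interpreting a lane as signed, a_s = a_u - 2^32 whenever a_s < 0, so
//   mulhs(a, b) = mulhu(a, b) - (a_s < 0 ? b : 0) - (b_s < 0 ? a : 0) (mod 2^32),
// and T1 + T2 below is exactly that correction.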
30707 if (IsSigned && !Subtarget.hasSSE41()) {
30708 SDValue Zero = DAG.getConstant(0, dl, VT);
30709 SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
30710 DAG.getSetCC(dl, VT, Zero, A, ISD::SETGT), B);
30711 SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
30712 DAG.getSetCC(dl, VT, Zero, B, ISD::SETGT), A);
30713
30714 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
30715 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Fixup);
30716 }
30717
30718 return Res;
30719 }
30720
30721 // Only i8 vectors should need custom lowering after this.
30722 assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
30723 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
30724 "Unsupported vector type");
30725
30726 // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
30727 // logical shift down the upper half and pack back to i8.
30728
30729 // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
30730 // and then ashr/lshr the upper bits down to the lower bits before multiply.
30731
30732 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
30733 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
30734 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
30735 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
30736 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
30737 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
30738 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
30739 Mul = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
30740 return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
30741 }
30742
30743 return LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG);
30744}
30745
30746// Custom lowering for SMULO/UMULO.
30747static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget,
30748 SelectionDAG &DAG) {
30749 MVT VT = Op.getSimpleValueType();
30750
30751 // Scalars defer to LowerXALUO.
30752 if (!VT.isVector())
30753 return LowerXALUO(Op, DAG);
30754
30755 SDLoc dl(Op);
30756 bool IsSigned = Op->getOpcode() == ISD::SMULO;
30757 SDValue A = Op.getOperand(0);
30758 SDValue B = Op.getOperand(1);
30759 EVT OvfVT = Op->getValueType(1);
30760
30761 if ((VT == MVT::v32i8 && !Subtarget.hasInt256()) ||
30762 (VT == MVT::v64i8 && !Subtarget.hasBWI())) {
30763 // Extract the LHS Lo/Hi vectors
30764 SDValue LHSLo, LHSHi;
30765 std::tie(LHSLo, LHSHi) = splitVector(A, DAG, dl);
30766
30767 // Extract the RHS Lo/Hi vectors
30768 SDValue RHSLo, RHSHi;
30769 std::tie(RHSLo, RHSHi) = splitVector(B, DAG, dl);
30770
30771 EVT LoOvfVT, HiOvfVT;
30772 std::tie(LoOvfVT, HiOvfVT) = DAG.GetSplitDestVTs(OvfVT);
30773 SDVTList LoVTs = DAG.getVTList(LHSLo.getValueType(), LoOvfVT);
30774 SDVTList HiVTs = DAG.getVTList(LHSHi.getValueType(), HiOvfVT);
30775
30776 // Issue the split operations.
30777 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, LoVTs, LHSLo, RHSLo);
30778 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, HiVTs, LHSHi, RHSHi);
30779
30780 // Join the separate data results and the overflow results.
30781 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
30782 SDValue Ovf = DAG.getNode(ISD::CONCAT_VECTORS, dl, OvfVT, Lo.getValue(1),
30783 Hi.getValue(1));
30784
30785 return DAG.getMergeValues({Res, Ovf}, dl);
30786 }
30787
30788 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30789 EVT SetccVT =
30790 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
30791
30792 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
30793 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
30794 unsigned NumElts = VT.getVectorNumElements();
30795 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
30796 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
30797 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
30798 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
30799 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
30800
30801 SDValue Low = DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
30802
30803 SDValue Ovf;
30804 if (IsSigned) {
30805 SDValue High, LowSign;
30806 if (OvfVT.getVectorElementType() == MVT::i1 &&
30807 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
30808 // Rather than truncating, try to do the compare on vXi16 or vXi32.
30809 // Shift the high down filling with sign bits.
30810 High = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Mul, 8, DAG);
30811 // Fill all 16 bits with the sign bit from the low.
30812 LowSign =
30813 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExVT, Mul, 8, DAG);
30814 LowSign = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, LowSign,
30815 15, DAG);
30816 SetccVT = OvfVT;
30817 if (!Subtarget.hasBWI()) {
30818 // We can't do a vXi16 compare so sign extend to v16i32.
30819 High = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, High);
30820 LowSign = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, LowSign);
30821 }
30822 } else {
30823 // Otherwise do the compare at vXi8.
30824 High = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
30825 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
30826 LowSign =
30827 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
30828 }
30829
30830 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
30831 } else {
30832 SDValue High =
30833 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
30834 if (OvfVT.getVectorElementType() == MVT::i1 &&
30835 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
30836 // Rather than truncating, try to do the compare on vXi16 or vXi32.
30837 SetccVT = OvfVT;
30838 if (!Subtarget.hasBWI()) {
30839 // We can't do a vXi16 compare so sign extend to v16i32.
30840 High = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, High);
30841 }
30842 } else {
30843 // Otherwise do the compare at vXi8.
30844 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
30845 }
30846
30847 Ovf =
30848 DAG.getSetCC(dl, SetccVT, High,
30849 DAG.getConstant(0, dl, High.getValueType()), ISD::SETNE);
30850 }
30851
30852 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
30853
30854 return DAG.getMergeValues({Low, Ovf}, dl);
30855 }
30856
30857 SDValue Low;
30858 SDValue High =
30859 LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG, &Low);
30860
30861 SDValue Ovf;
30862 if (IsSigned) {
30863 // SMULO overflows if the high bits don't match the sign of the low.
30864 SDValue LowSign =
30865 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
30866 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
30867 } else {
30868 // UMULO overflows if the high bits are non-zero.
30869 Ovf =
30870 DAG.getSetCC(dl, SetccVT, High, DAG.getConstant(0, dl, VT), ISD::SETNE);
30871 }
30872
30873 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
30874
30875 return DAG.getMergeValues({Low, Ovf}, dl);
30876}
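// [Annotation, not part of X86ISelLowering.cpp] A minimal scalar sketch of the
// per-lane overflow tests used above, shown for a single i8 element with
// <cstdint> fixed-width types; the *_sketch helper names are illustrative only.
static bool smulo8_sketch(int8_t A, int8_t B, int8_t &Low) {
  int16_t Full = int16_t(A) * int16_t(B);  // widened multiply, as the lowering does
  Low = int8_t(Full);                      // truncated low half of the product
  int8_t High = int8_t(Full >> 8);         // high half of the 16-bit product
  // SMULO overflows iff the high half is not the sign-fill of the low half.
  return High != (Low < 0 ? int8_t(-1) : int8_t(0));
}
static bool umulo8_sketch(uint8_t A, uint8_t B, uint8_t &Low) {
  uint16_t Full = uint16_t(uint16_t(A) * uint16_t(B));
  Low = uint8_t(Full);
  // UMULO overflows iff any bit of the high half is set.
  return uint8_t(Full >> 8) != 0;
}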
30877
30878SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
30879 assert(Subtarget.isTargetWin64() && "Unexpected target");
30880 EVT VT = Op.getValueType();
30881 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
30882 "Unexpected return type for lowering");
30883
30884 if (isa<ConstantSDNode>(Op->getOperand(1))) {
30885 SmallVector<SDValue> Result;
30886 if (expandDIVREMByConstant(Op.getNode(), Result, MVT::i64, DAG))
30887 return DAG.getNode(ISD::BUILD_PAIR, SDLoc(Op), VT, Result[0], Result[1]);
30888 }
30889
30890 RTLIB::Libcall LC;
30891 bool isSigned;
30892 switch (Op->getOpcode()) {
30893 default: llvm_unreachable("Unexpected request for libcall!");
30894 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
30895 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
30896 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
30897 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
30898 }
30899
30900 SDLoc dl(Op);
30901 SDValue InChain = DAG.getEntryNode();
30902
30903 TargetLowering::ArgListTy Args;
30904 TargetLowering::ArgListEntry Entry;
30905 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
30906 EVT ArgVT = Op->getOperand(i).getValueType();
30907 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
30908 "Unexpected argument type for lowering");
30909 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
30910 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
30911 MachinePointerInfo MPI =
30912 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
30913 Entry.Node = StackPtr;
30914 InChain =
30915 DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MPI, Align(16));
30916 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
30917 Entry.Ty = PointerType::get(ArgTy,0);
30918 Entry.IsSExt = false;
30919 Entry.IsZExt = false;
30920 Args.push_back(Entry);
30921 }
30922
30923 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
30924 getPointerTy(DAG.getDataLayout()));
30925
30926 TargetLowering::CallLoweringInfo CLI(DAG);
30927 CLI.setDebugLoc(dl)
30928 .setChain(InChain)
30929 .setLibCallee(
30930 getLibcallCallingConv(LC),
30931 static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
30932 std::move(Args))
30933 .setInRegister()
30934 .setSExtResult(isSigned)
30935 .setZExtResult(!isSigned);
30936
30937 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
30938 return DAG.getBitcast(VT, CallInfo.first);
30939}
30940
30941SDValue X86TargetLowering::LowerWin64_FP_TO_INT128(SDValue Op,
30942 SelectionDAG &DAG,
30943 SDValue &Chain) const {
30944 assert(Subtarget.isTargetWin64() && "Unexpected target");
30945 EVT VT = Op.getValueType();
30946 bool IsStrict = Op->isStrictFPOpcode();
30947
30948 SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
30949 EVT ArgVT = Arg.getValueType();
30950
30951 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
30952 "Unexpected return type for lowering");
30953
30954 RTLIB::Libcall LC;
30955 if (Op->getOpcode() == ISD::FP_TO_SINT ||
30956 Op->getOpcode() == ISD::STRICT_FP_TO_SINT)
30957 LC = RTLIB::getFPTOSINT(ArgVT, VT);
30958 else
30959 LC = RTLIB::getFPTOUINT(ArgVT, VT);
30960 assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
30961
30962 SDLoc dl(Op);
30963 MakeLibCallOptions CallOptions;
30964 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
30965
30966 SDValue Result;
30967 // Expect the i128 argument returned as a v2i64 in xmm0, cast back to the
30968 // expected VT (i128).
30969 std::tie(Result, Chain) =
30970 makeLibCall(DAG, LC, MVT::v2i64, Arg, CallOptions, dl, Chain);
30971 Result = DAG.getBitcast(VT, Result);
30972 return Result;
30973}
30974
30975SDValue X86TargetLowering::LowerWin64_INT128_TO_FP(SDValue Op,
30976 SelectionDAG &DAG) const {
30977 assert(Subtarget.isTargetWin64() && "Unexpected target");
30978 EVT VT = Op.getValueType();
30979 bool IsStrict = Op->isStrictFPOpcode();
30980
30981 SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
30982 EVT ArgVT = Arg.getValueType();
30983
30984 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
30985 "Unexpected argument type for lowering");
30986
30987 RTLIB::Libcall LC;
30988 if (Op->getOpcode() == ISD::SINT_TO_FP ||
30989 Op->getOpcode() == ISD::STRICT_SINT_TO_FP)
30990 LC = RTLIB::getSINTTOFP(ArgVT, VT);
30991 else
30992 LC = RTLIB::getUINTTOFP(ArgVT, VT);
30993 assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
30994
30995 SDLoc dl(Op);
30996 MakeLibCallOptions CallOptions;
30997 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
30998
30999 // Pass the i128 argument as an indirect argument on the stack.
31000 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
31001 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
31002 MachinePointerInfo MPI =
31003 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
31004 Chain = DAG.getStore(Chain, dl, Arg, StackPtr, MPI, Align(16));
31005
31006 SDValue Result;
31007 std::tie(Result, Chain) =
31008 makeLibCall(DAG, LC, VT, StackPtr, CallOptions, dl, Chain);
31009 return IsStrict ? DAG.getMergeValues({Result, Chain}, dl) : Result;
31010}
31011
31012// Return true if the required (according to Opcode) shift-imm form is natively
31013// supported by the Subtarget
31014static bool supportedVectorShiftWithImm(EVT VT, const X86Subtarget &Subtarget,
31015 unsigned Opcode) {
31016 if (!VT.isSimple())
31017 return false;
31018
31019 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
31020 return false;
31021
31022 if (VT.getScalarSizeInBits() < 16)
31023 return false;
31024
31025 if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
31026 (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
31027 return true;
31028
31029 bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
31030 (VT.is256BitVector() && Subtarget.hasInt256());
31031
31032 bool AShift = LShift && (Subtarget.hasAVX512() ||
31033 (VT != MVT::v2i64 && VT != MVT::v4i64));
31034 return (Opcode == ISD::SRA) ? AShift : LShift;
31035}
31036
31037// The shift amount is a variable, but it is the same for all vector lanes.
31038// These instructions are defined together with shift-immediate.
31039static
31040bool supportedVectorShiftWithBaseAmnt(EVT VT, const X86Subtarget &Subtarget,
31041 unsigned Opcode) {
31042 return supportedVectorShiftWithImm(VT, Subtarget, Opcode);
31043}
31044
31045// Return true if the required (according to Opcode) variable-shift form is
31046// natively supported by the Subtarget
31047static bool supportedVectorVarShift(EVT VT, const X86Subtarget &Subtarget,
31048 unsigned Opcode) {
31049 if (!VT.isSimple())
31050 return false;
31051
31052 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
31053 return false;
31054
31055 if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
31056 return false;
31057
31058 // vXi16 supported only on AVX-512, BWI
31059 if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
31060 return false;
31061
31062 if (Subtarget.hasAVX512() &&
31063 (Subtarget.useAVX512Regs() || !VT.is512BitVector()))
31064 return true;
31065
31066 bool LShift = VT.is128BitVector() || VT.is256BitVector();
31067 bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
31068 return (Opcode == ISD::SRA) ? AShift : LShift;
31069}
31070
31071static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG,
31072 const X86Subtarget &Subtarget) {
31073 MVT VT = Op.getSimpleValueType();
31074 SDLoc dl(Op);
31075 SDValue R = Op.getOperand(0);
31076 SDValue Amt = Op.getOperand(1);
31077 unsigned X86Opc = getTargetVShiftUniformOpcode(Op.getOpcode(), false);
31078
31079 auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
31080 assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
31081 MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
31082 SDValue Ex = DAG.getBitcast(ExVT, R);
31083
31084 // ashr(R, 63) === cmp_slt(R, 0)
31085 if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
31086 assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
31087 "Unsupported PCMPGT op");
31088 return DAG.getNode(X86ISD::PCMPGT, dl, VT, DAG.getConstant(0, dl, VT), R);
31089 }
31090
31091 if (ShiftAmt >= 32) {
31092 // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
31093 SDValue Upper =
31094 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
31095 SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
31096 ShiftAmt - 32, DAG);
31097 if (VT == MVT::v2i64)
31098 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
31099 if (VT == MVT::v4i64)
31100 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
31101 {9, 1, 11, 3, 13, 5, 15, 7});
31102 } else {
31103 // SRA upper i32, SRL whole i64 and select lower i32.
31104 SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
31105 ShiftAmt, DAG);
31106 SDValue Lower =
31107 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
31108 Lower = DAG.getBitcast(ExVT, Lower);
31109 if (VT == MVT::v2i64)
31110 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
31111 if (VT == MVT::v4i64)
31112 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
31113 {8, 1, 10, 3, 12, 5, 14, 7});
31114 }
31115 return DAG.getBitcast(VT, Ex);
31116 };
31117
31118 // Optimize shl/srl/sra with constant shift amount.
31119 APInt APIntShiftAmt;
31120 if (!X86::isConstantSplat(Amt, APIntShiftAmt))
31121 return SDValue();
31122
31123 // If the shift amount is out of range, return undef.
31124 if (APIntShiftAmt.uge(VT.getScalarSizeInBits()))
31125 return DAG.getUNDEF(VT);
31126
31127 uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
31128
31129 if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) {
31130 // Hardware support for vector shifts is sparse, which makes us scalarize the
31131 // vector operations in many cases. Also, on Sandy Bridge ADD is faster than
31132 // shl: (shl V, 1) -> (add (freeze V), (freeze V))
31133 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
31134 // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
31135 // must be 0). (add undef, undef) however can be any value. To make this
31136 // safe, we must freeze R to ensure that register allocation uses the same
31137 // register for an undefined value. This ensures that the result will
31138 // still be even and preserves the original semantics.
31139 R = DAG.getFreeze(R);
31140 return DAG.getNode(ISD::ADD, dl, VT, R, R);
31141 }
31142
31143 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
31144 }
31145
31146 // i64 SRA needs to be performed as partial shifts.
31147 if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
31148 (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
31149 Op.getOpcode() == ISD::SRA)
31150 return ArithmeticShiftRight64(ShiftAmt);
31151
31152 if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) ||
31153 (Subtarget.hasBWI() && VT == MVT::v64i8)) {
31154 unsigned NumElts = VT.getVectorNumElements();
31155 MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
31156
31157 // Simple i8 add case
31158 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
31159 // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
31160 // must be 0). (add undef, undef) however can be any value. To make this
31161 // safe, we must freeze R to ensure that register allocation uses the same
31162 // register for an undefined value. This ensures that the result will
31163 // still be even and preserves the original semantics.
31164 R = DAG.getFreeze(R);
31165 return DAG.getNode(ISD::ADD, dl, VT, R, R);
31166 }
31167
31168 // ashr(R, 7) === cmp_slt(R, 0)
31169 if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
31170 SDValue Zeros = DAG.getConstant(0, dl, VT);
31171 if (VT.is512BitVector()) {
31172 assert(VT == MVT::v64i8 && "Unexpected element type!");
31173 SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R, ISD::SETGT);
31174 return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
31175 }
31176 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
31177 }
31178
31179 // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
31180 if (VT == MVT::v16i8 && Subtarget.hasXOP())
31181 return SDValue();
31182
31183 if (Op.getOpcode() == ISD::SHL) {
31184 // Make a large shift.
31185 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R,
31186 ShiftAmt, DAG);
31187 SHL = DAG.getBitcast(VT, SHL);
31188 // Zero out the rightmost bits.
31189 APInt Mask = APInt::getHighBitsSet(8, 8 - ShiftAmt);
31190 return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getConstant(Mask, dl, VT));
31191 }
31192 if (Op.getOpcode() == ISD::SRL) {
31193 // Make a large shift.
31194 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, R,
31195 ShiftAmt, DAG);
31196 SRL = DAG.getBitcast(VT, SRL);
31197 // Zero out the leftmost bits.
31198 APInt Mask = APInt::getLowBitsSet(8, 8 - ShiftAmt);
31199 return DAG.getNode(ISD::AND, dl, VT, SRL, DAG.getConstant(Mask, dl, VT));
31200 }
31201 if (Op.getOpcode() == ISD::SRA) {
31202 // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
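// [Annotation, not in the original source] 8-bit example: for R = 0xF0 (-16)
// and Amt = 2, lshr gives 0x3C and Mask = 128 >> 2 = 0x20;
// (0x3C ^ 0x20) - 0x20 = 0x1C - 0x20 = 0xFC = -4 = ashr(-16, 2). The xor/sub
// pair re-extends the sign bit that the logical shift cleared.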
31203 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
31204
31205 SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
31206 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
31207 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
31208 return Res;
31209 }
31210 llvm_unreachable("Unknown shift opcode.");
31211 }
31212
31213 return SDValue();
31214}
31215
31216static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG,
31217 const X86Subtarget &Subtarget) {
31218 MVT VT = Op.getSimpleValueType();
31219 SDLoc dl(Op);
31220 SDValue R = Op.getOperand(0);
31221 SDValue Amt = Op.getOperand(1);
31222 unsigned Opcode = Op.getOpcode();
31223 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false);
31224
31225 int BaseShAmtIdx = -1;
31226 if (SDValue BaseShAmt = DAG.getSplatSourceVector(Amt, BaseShAmtIdx)) {
31227 if (supportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode))
31228 return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, BaseShAmtIdx,
31229 Subtarget, DAG);
31230
31231 // vXi8 shifts - shift as v8i16 + mask result.
31232 if (((VT == MVT::v16i8 && !Subtarget.canExtendTo512DQ()) ||
31233 (VT == MVT::v32i8 && !Subtarget.canExtendTo512BW()) ||
31234 VT == MVT::v64i8) &&
31235 !Subtarget.hasXOP()) {
31236 unsigned NumElts = VT.getVectorNumElements();
31237 MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
31238 if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, Opcode)) {
31239 unsigned LogicalOp = (Opcode == ISD::SHL ? ISD::SHL : ISD::SRL);
31240 unsigned LogicalX86Op = getTargetVShiftUniformOpcode(LogicalOp, false);
31241
31242 // Create the mask using vXi16 shifts. For shift-rights we need to move
31243 // the upper byte down before splatting the vXi8 mask.
31244 SDValue BitMask = DAG.getConstant(-1, dl, ExtVT);
31245 BitMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, BitMask,
31246 BaseShAmt, BaseShAmtIdx, Subtarget, DAG);
31247 if (Opcode != ISD::SHL)
31248 BitMask = getTargetVShiftByConstNode(LogicalX86Op, dl, ExtVT, BitMask,
31249 8, DAG);
31250 BitMask = DAG.getBitcast(VT, BitMask);
31251 BitMask = DAG.getVectorShuffle(VT, dl, BitMask, BitMask,
31252 SmallVector<int, 64>(NumElts, 0));
31253
31254 SDValue Res = getTargetVShiftNode(LogicalX86Op, dl, ExtVT,
31255 DAG.getBitcast(ExtVT, R), BaseShAmt,
31256 BaseShAmtIdx, Subtarget, DAG);
31257 Res = DAG.getBitcast(VT, Res);
31258 Res = DAG.getNode(ISD::AND, dl, VT, Res, BitMask);
31259
31260 if (Opcode == ISD::SRA) {
31261 // ashr(R, Amt) === sub(xor(lshr(R, Amt), SignMask), SignMask)
31262 // SignMask = lshr(SignBit, Amt) - safe to do this with PSRLW.
31263 SDValue SignMask = DAG.getConstant(0x8080, dl, ExtVT);
31264 SignMask =
31265 getTargetVShiftNode(LogicalX86Op, dl, ExtVT, SignMask, BaseShAmt,
31266 BaseShAmtIdx, Subtarget, DAG);
31267 SignMask = DAG.getBitcast(VT, SignMask);
31268 Res = DAG.getNode(ISD::XOR, dl, VT, Res, SignMask);
31269 Res = DAG.getNode(ISD::SUB, dl, VT, Res, SignMask);
31270 }
31271 return Res;
31272 }
31273 }
31274 }
31275
31276 return SDValue();
31277}
31278
31279// Convert a shift/rotate left amount to a multiplication scale factor.
31280static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
31281 const X86Subtarget &Subtarget,
31282 SelectionDAG &DAG) {
31283 MVT VT = Amt.getSimpleValueType();
31284 if (!(VT == MVT::v8i16 || VT == MVT::v4i32 ||
31285 (Subtarget.hasInt256() && VT == MVT::v16i16) ||
31286 (Subtarget.hasAVX512() && VT == MVT::v32i16) ||
31287 (!Subtarget.hasAVX512() && VT == MVT::v16i8) ||
31288 (Subtarget.hasInt256() && VT == MVT::v32i8) ||
31289 (Subtarget.hasBWI() && VT == MVT::v64i8)))
31290 return SDValue();
31291
31292 MVT SVT = VT.getVectorElementType();
31293 unsigned SVTBits = SVT.getSizeInBits();
31294 unsigned NumElems = VT.getVectorNumElements();
31295
31296 APInt UndefElts;
31297 SmallVector<APInt> EltBits;
31298 if (getTargetConstantBitsFromNode(Amt, SVTBits, UndefElts, EltBits)) {
31299 APInt One(SVTBits, 1);
31300 SmallVector<SDValue> Elts(NumElems, DAG.getUNDEF(SVT));
31301 for (unsigned I = 0; I != NumElems; ++I) {
31302 if (UndefElts[I] || EltBits[I].uge(SVTBits))
31303 continue;
31304 uint64_t ShAmt = EltBits[I].getZExtValue();
31305 Elts[I] = DAG.getConstant(One.shl(ShAmt), dl, SVT);
31306 }
31307 return DAG.getBuildVector(VT, dl, Elts);
31308 }
31309
31310 // If the target doesn't support variable shifts, use either FP conversion
31311 // or integer multiplication to avoid shifting each element individually.
31312 if (VT == MVT::v4i32) {
31313 Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
31314 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt,
31315 DAG.getConstant(0x3f800000U, dl, VT));
31316 Amt = DAG.getBitcast(MVT::v4f32, Amt);
31317 return DAG.getNode(ISD::FP_TO_SINT, dl, VT, Amt);
31318 }
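// [Annotation, not in the original source] Why the v4i32 path above works:
// (Amt << 23) + 0x3f800000 builds the IEEE-754 single 1.0 * 2^Amt (exponent
// field 127 + Amt, zero mantissa), so the FP_TO_SINT yields 1 << Amt for
// Amt in [0, 30]; Amt == 31 additionally relies on x86 CVTTPS2DQ returning
// 0x80000000 on overflow, which matches the 1u << 31 bit pattern.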
31319
31320 // AVX2 can more effectively perform this as a zext/trunc to/from v8i32.
31321 if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) {
31322 SDValue Z = DAG.getConstant(0, dl, VT);
31323 SDValue Lo = DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, Z));
31324 SDValue Hi = DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, Z));
31325 Lo = convertShiftLeftToScale(Lo, dl, Subtarget, DAG);
31326 Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG);
31327 if (Subtarget.hasSSE41())
31328 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
31329 return getPack(DAG, Subtarget, dl, VT, Lo, Hi);
31330 }
31331
31332 return SDValue();
31333}
31334
31335static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
31336 SelectionDAG &DAG) {
31337 MVT VT = Op.getSimpleValueType();
31338 SDLoc dl(Op);
31339 SDValue R = Op.getOperand(0);
31340 SDValue Amt = Op.getOperand(1);
31341 unsigned EltSizeInBits = VT.getScalarSizeInBits();
31342 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
31343
31344 unsigned Opc = Op.getOpcode();
31345 unsigned X86OpcV = getTargetVShiftUniformOpcode(Opc, true);
31346 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opc, false);
31347
31348 assert(VT.isVector() && "Custom lowering only for vector shifts!");
31349 assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
31350
31351 if (SDValue V = LowerShiftByScalarImmediate(Op, DAG, Subtarget))
31352 return V;
31353
31354 if (SDValue V = LowerShiftByScalarVariable(Op, DAG, Subtarget))
31355 return V;
31356
31357 if (supportedVectorVarShift(VT, Subtarget, Opc))
31358 return Op;
31359
31360 // i64 vector arithmetic shift can be emulated with the transform:
31361 // M = lshr(SIGN_MASK, Amt)
31362 // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
31363 if (((VT == MVT::v2i64 && !Subtarget.hasXOP()) ||
31364 (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
31365 Opc == ISD::SRA) {
31366 SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
31367 SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
31368 R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
31369 R = DAG.getNode(ISD::XOR, dl, VT, R, M);
31370 R = DAG.getNode(ISD::SUB, dl, VT, R, M);
31371 return R;
31372 }
31373
31374 // XOP has 128-bit variable logical/arithmetic shifts.
31375 // +ve/-ve Amt = shift left/right.
31376 if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 ||
31377 VT == MVT::v8i16 || VT == MVT::v16i8)) {
31378 if (Opc == ISD::SRL || Opc == ISD::SRA) {
31379 SDValue Zero = DAG.getConstant(0, dl, VT);
31380 Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
31381 }
31382 if (Opc == ISD::SHL || Opc == ISD::SRL)
31383 return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
31384 if (Opc == ISD::SRA)
31385 return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
31386 }
31387
31388 // 2i64 vector logical shifts can efficiently avoid scalarization - do the
31389 // shifts per-lane and then shuffle the partial results back together.
31390 if (VT == MVT::v2i64 && Opc != ISD::SRA) {
31391 // Splat the shift amounts so the scalar shifts above will catch it.
31392 SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
31393 SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
31394 SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
31395 SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
31396 return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
31397 }
31398
31399 // If possible, lower this shift as a sequence of two shifts by
31400 // constant plus a BLENDing shuffle instead of scalarizing it.
31401 // Example:
31402 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
31403 //
31404 // Could be rewritten as:
31405 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
31406 //
31407 // The advantage is that the two shifts from the example would be
31408 // lowered as X86ISD::VSRLI nodes in parallel before blending.
31409 if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
31410 (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
31411 SDValue Amt1, Amt2;
31412 unsigned NumElts = VT.getVectorNumElements();
31413 SmallVector<int, 8> ShuffleMask;
31414 for (unsigned i = 0; i != NumElts; ++i) {
31415 SDValue A = Amt->getOperand(i);
31416 if (A.isUndef()) {
31417 ShuffleMask.push_back(SM_SentinelUndef);
31418 continue;
31419 }
31420 if (!Amt1 || Amt1 == A) {
31421 ShuffleMask.push_back(i);
31422 Amt1 = A;
31423 continue;
31424 }
31425 if (!Amt2 || Amt2 == A) {
31426 ShuffleMask.push_back(i + NumElts);
31427 Amt2 = A;
31428 continue;
31429 }
31430 break;
31431 }
31432
31433 // Only perform this blend if we can perform it without loading a mask.
31434 if (ShuffleMask.size() == NumElts && Amt1 && Amt2 &&
31435 (VT != MVT::v16i16 ||
31436 is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
31437 (VT == MVT::v4i32 || Subtarget.hasSSE41() || Opc != ISD::SHL ||
31438 canWidenShuffleElements(ShuffleMask))) {
31439 auto *Cst1 = dyn_cast<ConstantSDNode>(Amt1);
31440 auto *Cst2 = dyn_cast<ConstantSDNode>(Amt2);
31441 if (Cst1 && Cst2 && Cst1->getAPIntValue().ult(EltSizeInBits) &&
31442 Cst2->getAPIntValue().ult(EltSizeInBits)) {
31443 SDValue Shift1 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
31444 Cst1->getZExtValue(), DAG);
31445 SDValue Shift2 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
31446 Cst2->getZExtValue(), DAG);
31447 return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
31448 }
31449 }
31450 }
31451
31452 // If possible, lower this packed shift into a vector multiply instead of
31453 // expanding it into a sequence of scalar shifts.
31454 // For v32i8 cases, it might be quicker to split/extend to vXi16 shifts.
31455 if (Opc == ISD::SHL && !(VT == MVT::v32i8 && (Subtarget.hasXOP() ||
31456 Subtarget.canExtendTo512BW())))
31457 if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
31458 return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
31459
31460 // Constant ISD::SRL can be performed efficiently on vXi16 vectors as we
31461 // can replace with ISD::MULHU, creating scale factor from (NumEltBits - Amt).
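// [Annotation, not in the original source] Derivation: for 1 <= Amt <= 15,
//   mulhu(R, 1 << (16 - Amt)) == (R * 2^(16-Amt)) >> 16 == R >> Amt,
// while Amt == 0 would need a 2^16 scale, hence the ZAmt select below.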
31462 if (Opc == ISD::SRL && ConstantAmt &&
31463 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
31464 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
31465 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
31466 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
31467 SDValue Zero = DAG.getConstant(0, dl, VT);
31468 SDValue ZAmt = DAG.getSetCC(dl, VT, Amt, Zero, ISD::SETEQ);
31469 SDValue Res = DAG.getNode(ISD::MULHU, dl, VT, R, Scale);
31470 return DAG.getSelect(dl, VT, ZAmt, R, Res);
31471 }
31472 }
31473
31474 // Constant ISD::SRA can be performed efficiently on vXi16 vectors as we
31475 // can replace with ISD::MULHS, creating scale factor from (NumEltBits - Amt).
31476 // TODO: Special case handling for shift by 0/1, really we can afford either
31477 // of these cases in pre-SSE41/XOP/AVX512 but not both.
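// [Annotation, not in the original source] The Amt == 0 lane would need a
// 2^16 scale and the Amt == 1 lane a scale of 1 << 15, which is not a
// positive i16, so MULHS would flip the sign; both lanes are therefore
// patched with the Amt0/Amt1 selects below.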
31478 if (Opc == ISD::SRA && ConstantAmt &&
31479 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256())) &&
31480 ((Subtarget.hasSSE41() && !Subtarget.hasXOP() &&
31481 !Subtarget.hasAVX512()) ||
31482 DAG.isKnownNeverZero(Amt))) {
31483 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
31484 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
31485 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
31486 SDValue Amt0 =
31487 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(0, dl, VT), ISD::SETEQ);
31488 SDValue Amt1 =
31489 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(1, dl, VT), ISD::SETEQ);
31490 SDValue Sra1 =
31491 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, 1, DAG);
31492 SDValue Res = DAG.getNode(ISD::MULHS, dl, VT, R, Scale);
31493 Res = DAG.getSelect(dl, VT, Amt0, R, Res);
31494 return DAG.getSelect(dl, VT, Amt1, Sra1, Res);
31495 }
31496 }
31497
31498 // v4i32 Non Uniform Shifts.
31499 // If the shift amount is constant we can shift each lane using the SSE2
31500 // immediate shifts, else we need to zero-extend each lane to the lower i64
31501 // and shift using the SSE2 variable shifts.
31502 // The separate results can then be blended together.
31503 if (VT == MVT::v4i32) {
31504 SDValue Amt0, Amt1, Amt2, Amt3;
31505 if (ConstantAmt) {
31506 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
31507 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
31508 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
31509 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
31510 } else {
31511 // The SSE2 shifts use the lower i64 as the same shift amount for
31512 // all lanes and the upper i64 is ignored. On AVX we're better off
31513 // just zero-extending, but for SSE just duplicating the top 16-bits is
31514 // cheaper and has the same effect for out of range values.
31515 if (Subtarget.hasAVX()) {
31516 SDValue Z = DAG.getConstant(0, dl, VT);
31517 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
31518 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
31519 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
31520 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
31521 } else {
31522 SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt);
31523 SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
31524 {4, 5, 6, 7, -1, -1, -1, -1});
31525 SDValue Msk02 = getV4X86ShuffleImm8ForMask({0, 1, 1, 1}, dl, DAG);
31526 SDValue Msk13 = getV4X86ShuffleImm8ForMask({2, 3, 3, 3}, dl, DAG);
31527 Amt0 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk02);
31528 Amt1 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk13);
31529 Amt2 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk02);
31530 Amt3 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk13);
31531 }
31532 }
31533
31534 unsigned ShOpc = ConstantAmt ? Opc : X86OpcV;
31535 SDValue R0 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt0));
31536 SDValue R1 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt1));
31537 SDValue R2 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt2));
31538 SDValue R3 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt3));
31539
31540 // Merge the shifted lane results optimally with/without PBLENDW.
31541 // TODO - ideally shuffle combining would handle this.
31542 if (Subtarget.hasSSE41()) {
31543 SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
31544 SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
31545 return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
31546 }
31547 SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5});
31548 SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7});
31549 return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
31550 }
31551
31552 // It's worth extending once and using the vXi16/vXi32 shifts for smaller
31553 // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
31554 // make the existing SSE solution better.
31555 // NOTE: We honor preferred vector width before promoting to 512-bits.
31556 if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
31557 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) ||
31558 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) ||
31559 (Subtarget.canExtendTo512BW() && VT == MVT::v32i8) ||
31560 (Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) {
31561 assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&
31562 "Unexpected vector type");
31563 MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
31564 MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
31565 unsigned ExtOpc = Opc == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
31566 R = DAG.getNode(ExtOpc, dl, ExtVT, R);
31567 Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
31568 return DAG.getNode(ISD::TRUNCATE, dl, VT,
31569 DAG.getNode(Opc, dl, ExtVT, R, Amt));
31570 }
31571
31572 // Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we
31573 // extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI.
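// [Annotation, not in the original source] The identity used below: with R
// widened to i16 (sign- or zero-extended to match the shift kind),
//   trunc((R16 * (1 << (8 - Amt))) >> 8) == R >> Amt   for Amt in [0, 7],
// where the >> 8 is the logical VSRLI that recovers the shifted byte.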
31574 if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) &&
31575 (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
31576 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
31577 !Subtarget.hasXOP()) {
31578 int NumElts = VT.getVectorNumElements();
31579 SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8);
31580
31581 // Extend constant shift amount to vXi16 (it doesn't matter if the type
31582 // isn't legal).
31583 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
31584 Amt = DAG.getZExtOrTrunc(Amt, dl, ExVT);
31585 Amt = DAG.getNode(ISD::SUB, dl, ExVT, DAG.getConstant(8, dl, ExVT), Amt);
31586 Amt = DAG.getNode(ISD::SHL, dl, ExVT, DAG.getConstant(1, dl, ExVT), Amt);
31587 assert(ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()) &&
31588 "Constant build vector expected");
31589
31590 if (VT == MVT::v16i8 && Subtarget.hasInt256()) {
31591 R = Opc == ISD::SRA ? DAG.getSExtOrTrunc(R, dl, ExVT)
31592 : DAG.getZExtOrTrunc(R, dl, ExVT);
31593 R = DAG.getNode(ISD::MUL, dl, ExVT, R, Amt);
31594 R = DAG.getNode(X86ISD::VSRLI, dl, ExVT, R, Cst8);
31595 return DAG.getZExtOrTrunc(R, dl, VT);
31596 }
31597
31598 SmallVector<SDValue, 16> LoAmt, HiAmt;
31599 for (int i = 0; i != NumElts; i += 16) {
31600 for (int j = 0; j != 8; ++j) {
31601 LoAmt.push_back(Amt.getOperand(i + j));
31602 HiAmt.push_back(Amt.getOperand(i + j + 8));
31603 }
31604 }
31605
31606 MVT VT16 = MVT::getVectorVT(MVT::i16, NumElts / 2);
31607 SDValue LoA = DAG.getBuildVector(VT16, dl, LoAmt);
31608 SDValue HiA = DAG.getBuildVector(VT16, dl, HiAmt);
31609
31610 SDValue LoR = DAG.getBitcast(VT16, getUnpackl(DAG, dl, VT, R, R));
31611 SDValue HiR = DAG.getBitcast(VT16, getUnpackh(DAG, dl, VT, R, R));
31612 LoR = DAG.getNode(X86OpcI, dl, VT16, LoR, Cst8);
31613 HiR = DAG.getNode(X86OpcI, dl, VT16, HiR, Cst8);
31614 LoR = DAG.getNode(ISD::MUL, dl, VT16, LoR, LoA);
31615 HiR = DAG.getNode(ISD::MUL, dl, VT16, HiR, HiA);
31616 LoR = DAG.getNode(X86ISD::VSRLI, dl, VT16, LoR, Cst8);
31617 HiR = DAG.getNode(X86ISD::VSRLI, dl, VT16, HiR, Cst8);
31618 return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR);
31619 }
31620
31621 if (VT == MVT::v16i8 ||
31622 (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
31623 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
31624 MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
31625
31626 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
31627 if (VT.is512BitVector()) {
31628 // On AVX512BW targets we make use of the fact that VSELECT lowers
31629 // to a masked blend which selects bytes based just on the sign bit
31630 // extracted to a mask.
31631 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
31632 V0 = DAG.getBitcast(VT, V0);
31633 V1 = DAG.getBitcast(VT, V1);
31634 Sel = DAG.getBitcast(VT, Sel);
31635 Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel,
31636 ISD::SETGT);
31637 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
31638 } else if (Subtarget.hasSSE41()) {
31639 // On SSE41 targets we can use PBLENDVB which selects bytes based just
31640 // on the sign bit.
31641 V0 = DAG.getBitcast(VT, V0);
31642 V1 = DAG.getBitcast(VT, V1);
31643 Sel = DAG.getBitcast(VT, Sel);
31644 return DAG.getBitcast(SelVT,
31645 DAG.getNode(X86ISD::BLENDV, dl, VT, Sel, V0, V1));
31646 }
31647 // On pre-SSE41 targets we test for the sign bit by comparing to
31648 // zero - a negative value will set all bits of the lanes to true
31649 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
31650 SDValue Z = DAG.getConstant(0, dl, SelVT);
31651 SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
31652 return DAG.getSelect(dl, SelVT, C, V0, V1);
31653 };
31654
31655 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
31656 // We can safely do this using i16 shifts as we're only interested in
31657 // the 3 lower bits of each byte.
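    // For example, an amount of 5 (0b101) shifted left by 5 puts its bit 2 in
    // the byte's sign bit (set, so the shift-by-4 result is selected); after
    // "a += a" bit 1 is tested (clear, keep the value), and after another
    // doubling bit 0 is tested (set, shift by 1), giving the full 4 + 1 == 5.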
31658 Amt = DAG.getBitcast(ExtVT, Amt);
31659 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, Amt, 5, DAG);
31660 Amt = DAG.getBitcast(VT, Amt);
31661
31662 if (Opc == ISD::SHL || Opc == ISD::SRL) {
31663 // r = VSELECT(r, shift(r, 4), a);
31664 SDValue M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(4, dl, VT));
31665 R = SignBitSelect(VT, Amt, M, R);
31666
31667 // a += a
31668 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31669
31670 // r = VSELECT(r, shift(r, 2), a);
31671 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(2, dl, VT));
31672 R = SignBitSelect(VT, Amt, M, R);
31673
31674 // a += a
31675 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31676
31677 // return VSELECT(r, shift(r, 1), a);
31678 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(1, dl, VT));
31679 R = SignBitSelect(VT, Amt, M, R);
31680 return R;
31681 }
31682
31683 if (Opc == ISD::SRA) {
31684 // For SRA we need to unpack each byte to the higher byte of a i16 vector
31685 // so we can correctly sign extend. We don't care what happens to the
31686 // lower byte.
31687 SDValue ALo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
31688 SDValue AHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
31689 SDValue RLo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), R);
31690 SDValue RHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), R);
31691 ALo = DAG.getBitcast(ExtVT, ALo);
31692 AHi = DAG.getBitcast(ExtVT, AHi);
31693 RLo = DAG.getBitcast(ExtVT, RLo);
31694 RHi = DAG.getBitcast(ExtVT, RHi);
31695
31696 // r = VSELECT(r, shift(r, 4), a);
31697 SDValue MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 4, DAG);
31698 SDValue MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 4, DAG);
31699 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
31700 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
31701
31702 // a += a
31703 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
31704 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
31705
31706 // r = VSELECT(r, shift(r, 2), a);
31707 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 2, DAG);
31708 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 2, DAG);
31709 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
31710 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
31711
31712 // a += a
31713 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
31714 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
31715
31716 // r = VSELECT(r, shift(r, 1), a);
31717 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 1, DAG);
31718 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 1, DAG);
31719 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
31720 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
31721
31722 // Logical shift the result back to the lower byte, leaving a zero upper
31723 // byte meaning that we can safely pack with PACKUSWB.
31724 RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RLo, 8, DAG);
31725 RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RHi, 8, DAG);
31726 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
31727 }
31728 }
31729
31730 if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
31731 MVT ExtVT = MVT::v8i32;
31732 SDValue Z = DAG.getConstant(0, dl, VT);
31733 SDValue ALo = getUnpackl(DAG, dl, VT, Amt, Z);
31734 SDValue AHi = getUnpackh(DAG, dl, VT, Amt, Z);
31735 SDValue RLo = getUnpackl(DAG, dl, VT, Z, R);
31736 SDValue RHi = getUnpackh(DAG, dl, VT, Z, R);
31737 ALo = DAG.getBitcast(ExtVT, ALo);
31738 AHi = DAG.getBitcast(ExtVT, AHi);
31739 RLo = DAG.getBitcast(ExtVT, RLo);
31740 RHi = DAG.getBitcast(ExtVT, RHi);
31741 SDValue Lo = DAG.getNode(Opc, dl, ExtVT, RLo, ALo);
31742 SDValue Hi = DAG.getNode(Opc, dl, ExtVT, RHi, AHi);
31743 Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Lo, 16, DAG);
31744 Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Hi, 16, DAG);
31745 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
31746 }
31747
31748 if (VT == MVT::v8i16) {
31749 // If we have a constant shift amount, the non-SSE41 path is best as
31750    // avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW.
31751 bool UseSSE41 = Subtarget.hasSSE41() &&
31752 !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
31753
31754 auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
31755 // On SSE41 targets we can use PBLENDVB which selects bytes based just on
31756 // the sign bit.
31757 if (UseSSE41) {
31758 MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
31759 V0 = DAG.getBitcast(ExtVT, V0);
31760 V1 = DAG.getBitcast(ExtVT, V1);
31761 Sel = DAG.getBitcast(ExtVT, Sel);
31762 return DAG.getBitcast(
31763 VT, DAG.getNode(X86ISD::BLENDV, dl, ExtVT, Sel, V0, V1));
31764 }
31765 // On pre-SSE41 targets we splat the sign bit - a negative value will
31766 // set all bits of the lanes to true and VSELECT uses that in
31767 // its OR(AND(V0,C),AND(V1,~C)) lowering.
31768 SDValue C =
31769 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Sel, 15, DAG);
31770 return DAG.getSelect(dl, VT, C, V0, V1);
31771 };
31772
31773 // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
31774 if (UseSSE41) {
31775 // On SSE41 targets we need to replicate the shift mask in both
31776 // bytes for PBLENDVB.
31777 Amt = DAG.getNode(
31778 ISD::OR, dl, VT,
31779 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 4, DAG),
31780 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG));
31781 } else {
31782 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG);
31783 }
31784
31785 // r = VSELECT(r, shift(r, 8), a);
31786 SDValue M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 8, DAG);
31787 R = SignBitSelect(Amt, M, R);
31788
31789 // a += a
31790 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31791
31792 // r = VSELECT(r, shift(r, 4), a);
31793 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 4, DAG);
31794 R = SignBitSelect(Amt, M, R);
31795
31796 // a += a
31797 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31798
31799 // r = VSELECT(r, shift(r, 2), a);
31800 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 2, DAG);
31801 R = SignBitSelect(Amt, M, R);
31802
31803 // a += a
31804 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31805
31806 // return VSELECT(r, shift(r, 1), a);
31807 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 1, DAG);
31808 R = SignBitSelect(Amt, M, R);
31809 return R;
31810 }
31811
31812 // Decompose 256-bit shifts into 128-bit shifts.
31813 if (VT.is256BitVector())
31814 return splitVectorIntBinary(Op, DAG);
31815
31816 if (VT == MVT::v32i16 || VT == MVT::v64i8)
31817 return splitVectorIntBinary(Op, DAG);
31818
31819 return SDValue();
31820}
31821
31822static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
31823 SelectionDAG &DAG) {
31824 MVT VT = Op.getSimpleValueType();
31825  assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) &&
31826         "Unexpected funnel shift opcode!");
31827
31828 SDLoc DL(Op);
31829 SDValue Op0 = Op.getOperand(0);
31830 SDValue Op1 = Op.getOperand(1);
31831 SDValue Amt = Op.getOperand(2);
31832 unsigned EltSizeInBits = VT.getScalarSizeInBits();
31833 bool IsFSHR = Op.getOpcode() == ISD::FSHR;
31834
31835 if (VT.isVector()) {
31836 APInt APIntShiftAmt;
31837 bool IsCstSplat = X86::isConstantSplat(Amt, APIntShiftAmt);
31838
31839 if (Subtarget.hasVBMI2() && EltSizeInBits > 8) {
31840 if (IsFSHR)
31841 std::swap(Op0, Op1);
31842
31843 if (IsCstSplat) {
31844 uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits);
31845 SDValue Imm = DAG.getTargetConstant(ShiftAmt, DL, MVT::i8);
31846 return getAVX512Node(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT,
31847 {Op0, Op1, Imm}, DAG, Subtarget);
31848 }
31849 return getAVX512Node(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT,
31850 {Op0, Op1, Amt}, DAG, Subtarget);
31851 }
31852    assert((VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 ||
31853            VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16 ||
31854            VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) &&
31855           "Unexpected funnel shift type!");
31856
31857    // fshl(x,y,z) -> (unpack(y,x) << (z & (bw-1))) >> bw.
31858    // fshr(x,y,z) -> unpack(y,x) >> (z & (bw-1)).
31859 if (IsCstSplat)
31860 return SDValue();
31861
31862 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
31863 SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31864 bool IsCst = ISD::isBuildVectorOfConstantSDNodes(AmtMod.getNode());
31865
31866 // Constant vXi16 funnel shifts can be efficiently handled by default.
31867 if (IsCst && EltSizeInBits == 16)
31868 return SDValue();
31869
31870 unsigned ShiftOpc = IsFSHR ? ISD::SRL : ISD::SHL;
31871 unsigned NumElts = VT.getVectorNumElements();
31872 MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
31873 MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
31874
31875 // Split 256-bit integers on XOP/pre-AVX2 targets.
31876 // Split 512-bit integers on non 512-bit BWI targets.
31877 if ((VT.is256BitVector() && ((Subtarget.hasXOP() && EltSizeInBits < 16) ||
31878 !Subtarget.hasAVX2())) ||
31879 (VT.is512BitVector() && !Subtarget.useBWIRegs() &&
31880 EltSizeInBits < 32)) {
31881 // Pre-mask the amount modulo using the wider vector.
31882 Op = DAG.getNode(Op.getOpcode(), DL, VT, Op0, Op1, AmtMod);
31883 return splitVectorOp(Op, DAG);
31884 }
31885
31886 // Attempt to fold scalar shift as unpack(y,x) << zext(splat(z))
31887 if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, ShiftOpc)) {
31888 int ScalarAmtIdx = -1;
31889 if (SDValue ScalarAmt = DAG.getSplatSourceVector(AmtMod, ScalarAmtIdx)) {
31890 // Uniform vXi16 funnel shifts can be efficiently handled by default.
31891 if (EltSizeInBits == 16)
31892 return SDValue();
31893
31894 SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
31895 SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
31896 Lo = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Lo, ScalarAmt,
31897 ScalarAmtIdx, Subtarget, DAG);
31898 Hi = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Hi, ScalarAmt,
31899 ScalarAmtIdx, Subtarget, DAG);
31900 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
31901 }
31902 }
31903
31904 MVT WideSVT = MVT::getIntegerVT(
31905 std::min<unsigned>(EltSizeInBits * 2, Subtarget.hasBWI() ? 16 : 32));
31906 MVT WideVT = MVT::getVectorVT(WideSVT, NumElts);
31907
31908 // If per-element shifts are legal, fallback to generic expansion.
31909 if (supportedVectorVarShift(VT, Subtarget, ShiftOpc) || Subtarget.hasXOP())
31910 return SDValue();
31911
31912 // Attempt to fold as:
31913 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
31914 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
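    // For example, with bw = 8, x = 0xAB, y = 0xCD, z = 3: the concatenated
    // value is 0xABCD, so fshl yields (0xABCD << 3) >> 8 == 0x5E and fshr
    // yields trunc(0xABCD >> 3) == 0x79.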
31915 if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
31916 supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
31917 Op0 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Op0);
31918 Op1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, Op1);
31919 AmtMod = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
31920 Op0 = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, Op0,
31921 EltSizeInBits, DAG);
31922 SDValue Res = DAG.getNode(ISD::OR, DL, WideVT, Op0, Op1);
31923 Res = DAG.getNode(ShiftOpc, DL, WideVT, Res, AmtMod);
31924 if (!IsFSHR)
31925 Res = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, Res,
31926 EltSizeInBits, DAG);
31927 return DAG.getNode(ISD::TRUNCATE, DL, VT, Res);
31928 }
31929
31930 // Attempt to fold per-element (ExtVT) shift as unpack(y,x) << zext(z)
31931 if (((IsCst || !Subtarget.hasAVX512()) && !IsFSHR && EltSizeInBits <= 16) ||
31932 supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc)) {
31933 SDValue Z = DAG.getConstant(0, DL, VT);
31934 SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
31935 SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
31936 SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
31937 SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
31938 SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
31939 SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
31940 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
31941 }
31942
31943 // Fallback to generic expansion.
31944 return SDValue();
31945 }
31946  assert(
31947      (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
31948      "Unexpected funnel shift type!");
31949
31950 // Expand slow SHLD/SHRD cases if we are not optimizing for size.
31951 bool OptForSize = DAG.shouldOptForSize();
31952 bool ExpandFunnel = !OptForSize && Subtarget.isSHLDSlow();
31953
31954 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
31955 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
31956 if ((VT == MVT::i8 || (ExpandFunnel && VT == MVT::i16)) &&
31957 !isa<ConstantSDNode>(Amt)) {
31958 SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, Amt.getValueType());
31959 SDValue HiShift = DAG.getConstant(EltSizeInBits, DL, Amt.getValueType());
31960 Op0 = DAG.getAnyExtOrTrunc(Op0, DL, MVT::i32);
31961 Op1 = DAG.getZExtOrTrunc(Op1, DL, MVT::i32);
31962 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt, Mask);
31963 SDValue Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Op0, HiShift);
31964 Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, Op1);
31965 if (IsFSHR) {
31966 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, Amt);
31967 } else {
31968 Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Res, Amt);
31969 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, HiShift);
31970 }
31971 return DAG.getZExtOrTrunc(Res, DL, VT);
31972 }
31973
31974 if (VT == MVT::i8 || ExpandFunnel)
31975 return SDValue();
31976
31977 // i16 needs to modulo the shift amount, but i32/i64 have implicit modulo.
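  // (x86 SHLD/SHRD mask the count mod 32/64 in hardware, which matches the
  // funnel shift semantics for i32/i64, but a 16-bit SHLD/SHRD with a count
  // above 15 is undefined, hence the explicit AND below.)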
31978 if (VT == MVT::i16) {
31979 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt,
31980 DAG.getConstant(15, DL, Amt.getValueType()));
31981 unsigned FSHOp = (IsFSHR ? X86ISD::FSHR : X86ISD::FSHL);
31982 return DAG.getNode(FSHOp, DL, VT, Op0, Op1, Amt);
31983 }
31984
31985 return Op;
31986}
31987
31988static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
31989 SelectionDAG &DAG) {
31990 MVT VT = Op.getSimpleValueType();
31991  assert(VT.isVector() && "Custom lowering only for vector rotates!");
31992
31993 SDLoc DL(Op);
31994 SDValue R = Op.getOperand(0);
31995 SDValue Amt = Op.getOperand(1);
31996 unsigned Opcode = Op.getOpcode();
31997 unsigned EltSizeInBits = VT.getScalarSizeInBits();
31998 int NumElts = VT.getVectorNumElements();
31999 bool IsROTL = Opcode == ISD::ROTL;
32000
32001 // Check for constant splat rotation amount.
32002 APInt CstSplatValue;
32003 bool IsCstSplat = X86::isConstantSplat(Amt, CstSplatValue);
32004
32005 // Check for splat rotate by zero.
32006 if (IsCstSplat && CstSplatValue.urem(EltSizeInBits) == 0)
32007 return R;
32008
32009 // AVX512 implicitly uses modulo rotation amounts.
32010 if (Subtarget.hasAVX512() && 32 <= EltSizeInBits) {
32011 // Attempt to rotate by immediate.
32012 if (IsCstSplat) {
32013 unsigned RotOpc = IsROTL ? X86ISD::VROTLI : X86ISD::VROTRI;
32014 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
32015 return DAG.getNode(RotOpc, DL, VT, R,
32016 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
32017 }
32018
32019 // Else, fall-back on VPROLV/VPRORV.
32020 return Op;
32021 }
32022
32023 // AVX512 VBMI2 vXi16 - lower to funnel shifts.
32024 if (Subtarget.hasVBMI2() && 16 == EltSizeInBits) {
32025 unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
32026 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
32027 }
32028
32029 SDValue Z = DAG.getConstant(0, DL, VT);
32030
32031 if (!IsROTL) {
32032 // If the ISD::ROTR amount is constant, we're always better converting to
32033 // ISD::ROTL.
32034 if (SDValue NegAmt = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {Z, Amt}))
32035 return DAG.getNode(ISD::ROTL, DL, VT, R, NegAmt);
32036
32037    // XOP targets always prefer ISD::ROTL.
32038 if (Subtarget.hasXOP())
32039 return DAG.getNode(ISD::ROTL, DL, VT, R,
32040 DAG.getNode(ISD::SUB, DL, VT, Z, Amt));
32041 }
32042
32043 // Split 256-bit integers on XOP/pre-AVX2 targets.
32044 if (VT.is256BitVector() && (Subtarget.hasXOP() || !Subtarget.hasAVX2()))
32045 return splitVectorIntBinary(Op, DAG);
32046
32047 // XOP has 128-bit vector variable + immediate rotates.
32048 // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
32049 // XOP implicitly uses modulo rotation amounts.
32050 if (Subtarget.hasXOP()) {
32051    assert(IsROTL && "Only ROTL expected");
32052    assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
32053
32054 // Attempt to rotate by immediate.
32055 if (IsCstSplat) {
32056 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
32057 return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
32058 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
32059 }
32060
32061 // Use general rotate by variable (per-element).
32062 return Op;
32063 }
32064
32065  // Rotate by a uniform constant - expand back to shifts.
32066 if (IsCstSplat)
32067 return SDValue();
32068
32069 // Split 512-bit integers on non 512-bit BWI targets.
32070 if (VT.is512BitVector() && !Subtarget.useBWIRegs())
32071 return splitVectorIntBinary(Op, DAG);
32072
32073  assert(
32074      (VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 ||
32075       ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8) &&
32076        Subtarget.hasAVX2()) ||
32077       ((VT == MVT::v32i16 || VT == MVT::v64i8) && Subtarget.useBWIRegs())) &&
32078      "Only vXi32/vXi16/vXi8 vector rotates supported");
32079
32080 MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
32081 MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
32082
32083 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
32084 SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
32085
32086 // Attempt to fold as unpack(x,x) << zext(splat(y)):
32087 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
32088 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
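  // For example, for bytes, unpack(x,x) forms the i16 lane x:x; with x = 0xB1
  // and y = 3 that lane is 0xB1B1, so 0xB1B1 << 3 = 0x8D88 whose high byte
  // 0x8D is rotl(0xB1, 3), and 0xB1B1 >> 3 = 0x1636 whose low byte 0x36 is
  // rotr(0xB1, 3).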
32089 if (EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) {
32090 int BaseRotAmtIdx = -1;
32091 if (SDValue BaseRotAmt = DAG.getSplatSourceVector(AmtMod, BaseRotAmtIdx)) {
32092 if (EltSizeInBits == 16 && Subtarget.hasSSE41()) {
32093 unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
32094 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
32095 }
32096 unsigned ShiftX86Opc = IsROTL ? X86ISD::VSHLI : X86ISD::VSRLI;
32097 SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
32098 SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
32099 Lo = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Lo, BaseRotAmt,
32100 BaseRotAmtIdx, Subtarget, DAG);
32101 Hi = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Hi, BaseRotAmt,
32102 BaseRotAmtIdx, Subtarget, DAG);
32103 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
32104 }
32105 }
32106
32107 // v16i8/v32i8/v64i8: Split rotation into rot4/rot2/rot1 stages and select by
32108 // the amount bit.
32109 // TODO: We're doing nothing here that we couldn't do for funnel shifts.
32110 if (EltSizeInBits == 8) {
32111 bool IsConstAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
32112 MVT WideVT =
32113 MVT::getVectorVT(Subtarget.hasBWI() ? MVT::i16 : MVT::i32, NumElts);
32114 unsigned ShiftOpc = IsROTL ? ISD::SHL : ISD::SRL;
32115
32116 // Attempt to fold as:
32117 // rotl(x,y) -> (((aext(x) << bw) | zext(x)) << (y & (bw-1))) >> bw.
32118 // rotr(x,y) -> (((aext(x) << bw) | zext(x)) >> (y & (bw-1))).
32119 if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
32120 supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
32121 // If we're rotating by constant, just use default promotion.
32122 if (IsConstAmt)
32123 return SDValue();
32124 // See if we can perform this by widening to vXi16 or vXi32.
32125 R = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, R);
32126 R = DAG.getNode(
32127 ISD::OR, DL, WideVT, R,
32128 getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, R, 8, DAG));
32129 Amt = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
32130 R = DAG.getNode(ShiftOpc, DL, WideVT, R, Amt);
32131 if (IsROTL)
32132 R = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, R, 8, DAG);
32133 return DAG.getNode(ISD::TRUNCATE, DL, VT, R);
32134 }
32135
32136 // Attempt to fold as unpack(x,x) << zext(y):
32137 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
32138 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
32139 if (IsConstAmt || supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc)) {
32140 // See if we can perform this by unpacking to lo/hi vXi16.
32141 SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
32142 SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
32143 SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
32144 SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
32145 SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
32146 SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
32147 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
32148 }
32149    assert((VT == MVT::v16i8 || VT == MVT::v32i8) && "Unsupported vXi8 type");
32150
32151 // We don't need ModuloAmt here as we just peek at individual bits.
32152 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
32153 if (Subtarget.hasSSE41()) {
32154 // On SSE41 targets we can use PBLENDVB which selects bytes based just
32155 // on the sign bit.
32156 V0 = DAG.getBitcast(VT, V0);
32157 V1 = DAG.getBitcast(VT, V1);
32158 Sel = DAG.getBitcast(VT, Sel);
32159 return DAG.getBitcast(SelVT,
32160 DAG.getNode(X86ISD::BLENDV, DL, VT, Sel, V0, V1));
32161 }
32162 // On pre-SSE41 targets we test for the sign bit by comparing to
32163 // zero - a negative value will set all bits of the lanes to true
32164 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
32165 SDValue Z = DAG.getConstant(0, DL, SelVT);
32166 SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel);
32167 return DAG.getSelect(DL, SelVT, C, V0, V1);
32168 };
32169
32170 // ISD::ROTR is currently only profitable on AVX512 targets with VPTERNLOG.
32171 if (!IsROTL && !useVPTERNLOG(Subtarget, VT)) {
32172 Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
32173 IsROTL = true;
32174 }
32175
32176 unsigned ShiftLHS = IsROTL ? ISD::SHL : ISD::SRL;
32177 unsigned ShiftRHS = IsROTL ? ISD::SRL : ISD::SHL;
32178
32179 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
32180 // We can safely do this using i16 shifts as we're only interested in
32181 // the 3 lower bits of each byte.
32182 Amt = DAG.getBitcast(ExtVT, Amt);
32183 Amt = DAG.getNode(ISD::SHL, DL, ExtVT, Amt, DAG.getConstant(5, DL, ExtVT));
32184 Amt = DAG.getBitcast(VT, Amt);
32185
32186 // r = VSELECT(r, rot(r, 4), a);
32187 SDValue M;
32188 M = DAG.getNode(
32189 ISD::OR, DL, VT,
32190 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(4, DL, VT)),
32191 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(4, DL, VT)));
32192 R = SignBitSelect(VT, Amt, M, R);
32193
32194 // a += a
32195 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
32196
32197 // r = VSELECT(r, rot(r, 2), a);
32198 M = DAG.getNode(
32199 ISD::OR, DL, VT,
32200 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(2, DL, VT)),
32201 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(6, DL, VT)));
32202 R = SignBitSelect(VT, Amt, M, R);
32203
32204 // a += a
32205 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
32206
32207 // return VSELECT(r, rot(r, 1), a);
32208 M = DAG.getNode(
32209 ISD::OR, DL, VT,
32210 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(1, DL, VT)),
32211 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(7, DL, VT)));
32212 return SignBitSelect(VT, Amt, M, R);
32213 }
32214
32215 bool IsSplatAmt = DAG.isSplatValue(Amt);
32216 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
32217 bool LegalVarShifts = supportedVectorVarShift(VT, Subtarget, ISD::SHL) &&
32218 supportedVectorVarShift(VT, Subtarget, ISD::SRL);
32219
32220 // Fallback for splats + all supported variable shifts.
32221 // Fallback for non-constants AVX2 vXi16 as well.
32222 if (IsSplatAmt || LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) {
32223 Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
32224 SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
32225 AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
32226 SDValue SHL = DAG.getNode(IsROTL ? ISD::SHL : ISD::SRL, DL, VT, R, Amt);
32227 SDValue SRL = DAG.getNode(IsROTL ? ISD::SRL : ISD::SHL, DL, VT, R, AmtR);
32228 return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
32229 }
32230
32231 // Everything below assumes ISD::ROTL.
32232 if (!IsROTL) {
32233 Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
32234 IsROTL = true;
32235 }
32236
32237 // ISD::ROT* uses modulo rotate amounts.
32238 Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
32239
32240  assert(IsROTL && "Only ROTL supported");
32241
32242 // As with shifts, attempt to convert the rotation amount to a multiplication
32243 // factor, fallback to general expansion.
32244 SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG);
32245 if (!Scale)
32246 return SDValue();
32247
32248 // v8i16/v16i16: perform unsigned multiply hi/lo and OR the results.
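  // For example, rotl(0x8001, 1) uses a scale of 2: MUL gives the low half
  // 0x0002 and MULHU gives the wrapped high bit 0x0001, and OR'ing them
  // yields 0x0003.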
32249 if (EltSizeInBits == 16) {
32250 SDValue Lo = DAG.getNode(ISD::MUL, DL, VT, R, Scale);
32251 SDValue Hi = DAG.getNode(ISD::MULHU, DL, VT, R, Scale);
32252 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
32253 }
32254
32255 // v4i32: make use of the PMULUDQ instruction to multiply 2 lanes of v4i32
32256 // to v2i64 results at a time. The upper 32-bits contain the wrapped bits
32257 // that can then be OR'd with the lower 32-bits.
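  // For example, rotating 0x80000001 left by 1: the 64-bit product with the
  // scale 2 is 0x0000000100000002, whose low half 0x00000002 is the shifted
  // value and whose high half 0x00000001 holds the wrapped bit, so OR'ing
  // them gives rotl == 0x00000003.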
32258  assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected");
32259 static const int OddMask[] = {1, -1, 3, -1};
32260 SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask);
32261 SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask);
32262
32263 SDValue Res02 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
32264 DAG.getBitcast(MVT::v2i64, R),
32265 DAG.getBitcast(MVT::v2i64, Scale));
32266 SDValue Res13 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
32267 DAG.getBitcast(MVT::v2i64, R13),
32268 DAG.getBitcast(MVT::v2i64, Scale13));
32269 Res02 = DAG.getBitcast(VT, Res02);
32270 Res13 = DAG.getBitcast(VT, Res13);
32271
32272 return DAG.getNode(ISD::OR, DL, VT,
32273 DAG.getVectorShuffle(VT, DL, Res02, Res13, {0, 4, 2, 6}),
32274 DAG.getVectorShuffle(VT, DL, Res02, Res13, {1, 5, 3, 7}));
32275}
32276
32277/// Returns true if the operand type is exactly twice the native width, and
32278/// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
32279/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
32280/// (otherwise we leave them alone to become __sync_fetch_and_... calls).
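/// For example, an i64 atomic operation on a 32-bit target with CMPXCHG8B,
/// or an i128 atomic operation on a 64-bit target with CMPXCHG16B, reports
/// true here.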
32281bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
32282 unsigned OpWidth = MemType->getPrimitiveSizeInBits();
32283
32284 if (OpWidth == 64)
32285 return Subtarget.canUseCMPXCHG8B() && !Subtarget.is64Bit();
32286 if (OpWidth == 128)
32287 return Subtarget.canUseCMPXCHG16B();
32288
32289 return false;
32290}
32291
32292TargetLoweringBase::AtomicExpansionKind
32293X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
32294 Type *MemType = SI->getValueOperand()->getType();
32295
32296 bool NoImplicitFloatOps =
32297 SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
32298 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
32299 !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
32300 (Subtarget.hasSSE1() || Subtarget.hasX87()))
32301 return AtomicExpansionKind::None;
32302
32303 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::Expand
32304 : AtomicExpansionKind::None;
32305}
32306
32307// Note: this turns large loads into lock cmpxchg8b/16b.
32308// TODO: In 32-bit mode, use MOVLPS when SSE1 is available?
32309TargetLowering::AtomicExpansionKind
32310X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
32311 Type *MemType = LI->getType();
32312
32313  // If this is a 64-bit atomic load on a 32-bit target and SSE2 is enabled, we
32314 // can use movq to do the load. If we have X87 we can load into an 80-bit
32315 // X87 register and store it to a stack temporary.
32316 bool NoImplicitFloatOps =
32317 LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
32318 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
32319 !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
32320 (Subtarget.hasSSE1() || Subtarget.hasX87()))
32321 return AtomicExpansionKind::None;
32322
32323 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
32324 : AtomicExpansionKind::None;
32325}
32326
32327enum BitTestKind : unsigned {
32328 UndefBit,
32329 ConstantBit,
32330 NotConstantBit,
32331 ShiftBit,
32332 NotShiftBit
32333};
32334
32335static std::pair<Value *, BitTestKind> FindSingleBitChange(Value *V) {
32336 using namespace llvm::PatternMatch;
32337 BitTestKind BTK = UndefBit;
32338 auto *C = dyn_cast<ConstantInt>(V);
32339 if (C) {
32340 // Check if V is a power of 2 or NOT power of 2.
32341 if (isPowerOf2_64(C->getZExtValue()))
32342 BTK = ConstantBit;
32343 else if (isPowerOf2_64((~C->getValue()).getZExtValue()))
32344 BTK = NotConstantBit;
32345 return {V, BTK};
32346 }
32347
32348 // Check if V is some power of 2 pattern known to be non-zero
32349 auto *I = dyn_cast<Instruction>(V);
32350 if (I) {
32351 bool Not = false;
32352 // Check if we have a NOT
32353 Value *PeekI;
32354 if (match(I, m_c_Xor(m_Value(PeekI), m_AllOnes())) ||
32355 match(I, m_Sub(m_AllOnes(), m_Value(PeekI)))) {
32356 Not = true;
32357 I = dyn_cast<Instruction>(PeekI);
32358
32359      // If I is constant, it will fold and we can evaluate later. If it's an
32360 // argument or something of that nature, we can't analyze.
32361 if (I == nullptr)
32362 return {nullptr, UndefBit};
32363 }
32364 // We can only use 1 << X without more sophisticated analysis. C << X where
32365 // C is a power of 2 but not 1 can result in zero which cannot be translated
32366 // to bittest. Likewise any C >> X (either arith or logical) can be zero.
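    // For example, for i32, 4 << 30 == 0 and 8 >> 4 == 0, so neither is
    // guaranteed to leave exactly one bit set, whereas 1 << X (for a defined
    // X) always does.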
32367 if (I->getOpcode() == Instruction::Shl) {
32368 // Todo(1): The cmpxchg case is pretty costly so matching `BLSI(X)`, `X &
32369 // -X` and some other provable power of 2 patterns that we can use CTZ on
32370 // may be profitable.
32371 // Todo(2): It may be possible in some cases to prove that Shl(C, X) is
32372 // non-zero even where C != 1. Likewise LShr(C, X) and AShr(C, X) may also
32373 // be provably a non-zero power of 2.
32374 // Todo(3): ROTL and ROTR patterns on a power of 2 C should also be
32375 // transformable to bittest.
32376 auto *ShiftVal = dyn_cast<ConstantInt>(I->getOperand(0));
32377 if (!ShiftVal)
32378 return {nullptr, UndefBit};
32379 if (ShiftVal->equalsInt(1))
32380 BTK = Not ? NotShiftBit : ShiftBit;
32381
32382 if (BTK == UndefBit)
32383 return {nullptr, UndefBit};
32384
32385 Value *BitV = I->getOperand(1);
32386
32387 Value *AndOp;
32388 const APInt *AndC;
32389 if (match(BitV, m_c_And(m_Value(AndOp), m_APInt(AndC)))) {
32390 // Read past a shiftmask instruction to find count
32391 if (*AndC == (I->getType()->getPrimitiveSizeInBits() - 1))
32392 BitV = AndOp;
32393 }
32394 return {BitV, BTK};
32395 }
32396 }
32397 return {nullptr, UndefBit};
32398}
32399
32400TargetLowering::AtomicExpansionKind
32401X86TargetLowering::shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const {
32402 using namespace llvm::PatternMatch;
32403 // If the atomicrmw's result isn't actually used, we can just add a "lock"
32404 // prefix to a normal instruction for these operations.
32405 if (AI->use_empty())
32406 return AtomicExpansionKind::None;
32407
32408 if (AI->getOperation() == AtomicRMWInst::Xor) {
32409 // A ^ SignBit -> A + SignBit. This allows us to use `xadd` which is
32410 // preferable to both `cmpxchg` and `btc`.
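    // For example, for i8, 0xC3 ^ 0x80 == 0x43 and (0xC3 + 0x80) & 0xFF ==
    // 0x43: adding the sign mask just flips the top bit because the carry
    // out of the top bit is discarded.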
32411 if (match(AI->getOperand(1), m_SignMask()))
32412 return AtomicExpansionKind::None;
32413 }
32414
32415 // If the atomicrmw's result is used by a single bit AND, we may use
32416 // bts/btr/btc instruction for these operations.
32417 // Note: InstCombinePass can cause a de-optimization here. It replaces the
32418 // SETCC(And(AtomicRMW(P, power_of_2), power_of_2)) with LShr and Xor
32419 // (depending on CC). This pattern can only use bts/btr/btc but we don't
32420 // detect it.
32421 Instruction *I = AI->user_back();
32422 auto BitChange = FindSingleBitChange(AI->getValOperand());
32423 if (BitChange.second == UndefBit || !AI->hasOneUse() ||
32424 I->getOpcode() != Instruction::And ||
32425 AI->getType()->getPrimitiveSizeInBits() == 8 ||
32426 AI->getParent() != I->getParent())
32427 return AtomicExpansionKind::CmpXChg;
32428
32429 unsigned OtherIdx = I->getOperand(0) == AI ? 1 : 0;
32430
32431 // This is a redundant AND, it should get cleaned up elsewhere.
32432 if (AI == I->getOperand(OtherIdx))
32433 return AtomicExpansionKind::CmpXChg;
32434
32435  // The following instruction must be an AND with a single bit.
32436 if (BitChange.second == ConstantBit || BitChange.second == NotConstantBit) {
32437 auto *C1 = cast<ConstantInt>(AI->getValOperand());
32438 auto *C2 = dyn_cast<ConstantInt>(I->getOperand(OtherIdx));
32439 if (!C2 || !isPowerOf2_64(C2->getZExtValue())) {
32440 return AtomicExpansionKind::CmpXChg;
32441 }
32442 if (AI->getOperation() == AtomicRMWInst::And) {
32443 return ~C1->getValue() == C2->getValue()
32444 ? AtomicExpansionKind::BitTestIntrinsic
32445 : AtomicExpansionKind::CmpXChg;
32446 }
32447 return C1 == C2 ? AtomicExpansionKind::BitTestIntrinsic
32448 : AtomicExpansionKind::CmpXChg;
32449 }
32450
32451  assert(BitChange.second == ShiftBit || BitChange.second == NotShiftBit);
32452
32453 auto BitTested = FindSingleBitChange(I->getOperand(OtherIdx));
32454 if (BitTested.second != ShiftBit && BitTested.second != NotShiftBit)
32455 return AtomicExpansionKind::CmpXChg;
32456
32457  assert(BitChange.first != nullptr && BitTested.first != nullptr);
32458
32459 // If shift amounts are not the same we can't use BitTestIntrinsic.
32460 if (BitChange.first != BitTested.first)
32461 return AtomicExpansionKind::CmpXChg;
32462
32463  // For atomic AND we need to be masking all but one bit and testing the
32464  // one bit that is unset in the mask.
32465 if (AI->getOperation() == AtomicRMWInst::And)
32466 return (BitChange.second == NotShiftBit && BitTested.second == ShiftBit)
32467 ? AtomicExpansionKind::BitTestIntrinsic
32468 : AtomicExpansionKind::CmpXChg;
32469
32470  // For atomic XOR/OR we need to be setting and testing the same bit.
32471 return (BitChange.second == ShiftBit && BitTested.second == ShiftBit)
32472 ? AtomicExpansionKind::BitTestIntrinsic
32473 : AtomicExpansionKind::CmpXChg;
32474}
32475
32476void X86TargetLowering::emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const {
32477 IRBuilder<> Builder(AI);
32478 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
32479 Intrinsic::ID IID_C = Intrinsic::not_intrinsic;
32480 Intrinsic::ID IID_I = Intrinsic::not_intrinsic;
32481 switch (AI->getOperation()) {
32482 default:
32483    llvm_unreachable("Unknown atomic operation");
32484 case AtomicRMWInst::Or:
32485 IID_C = Intrinsic::x86_atomic_bts;
32486 IID_I = Intrinsic::x86_atomic_bts_rm;
32487 break;
32488 case AtomicRMWInst::Xor:
32489 IID_C = Intrinsic::x86_atomic_btc;
32490 IID_I = Intrinsic::x86_atomic_btc_rm;
32491 break;
32492 case AtomicRMWInst::And:
32493 IID_C = Intrinsic::x86_atomic_btr;
32494 IID_I = Intrinsic::x86_atomic_btr_rm;
32495 break;
32496 }
32497 Instruction *I = AI->user_back();
32498 LLVMContext &Ctx = AI->getContext();
32499 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
32500 Type::getInt8PtrTy(Ctx));
32501 Function *BitTest = nullptr;
32502 Value *Result = nullptr;
32503 auto BitTested = FindSingleBitChange(AI->getValOperand());
32504  assert(BitTested.first != nullptr);
32505
32506 if (BitTested.second == ConstantBit || BitTested.second == NotConstantBit) {
32507 auto *C = cast<ConstantInt>(I->getOperand(I->getOperand(0) == AI ? 1 : 0));
32508
32509 BitTest = Intrinsic::getDeclaration(AI->getModule(), IID_C, AI->getType());
32510
32511 unsigned Imm = llvm::countr_zero(C->getZExtValue());
32512 Result = Builder.CreateCall(BitTest, {Addr, Builder.getInt8(Imm)});
32513 } else {
32514 BitTest = Intrinsic::getDeclaration(AI->getModule(), IID_I, AI->getType());
32515
32516    assert(BitTested.second == ShiftBit || BitTested.second == NotShiftBit);
32517
32518 Value *SI = BitTested.first;
32519    assert(SI != nullptr);
32520
32521    // BT{S|R|C} on a memory operand doesn't modulo the bit position, so we
32522    // need to mask it.
32523 unsigned ShiftBits = SI->getType()->getPrimitiveSizeInBits();
32524 Value *BitPos =
32525 Builder.CreateAnd(SI, Builder.getIntN(ShiftBits, ShiftBits - 1));
32526 // Todo(1): In many cases it may be provable that SI is less than
32527 // ShiftBits in which case this mask is unnecessary
32528 // Todo(2): In the fairly idiomatic case of P[X / sizeof_bits(X)] OP 1
32529 // << (X % sizeof_bits(X)) we can drop the shift mask and AGEN in
32530 // favor of just a raw BT{S|R|C}.
32531
32532 Result = Builder.CreateCall(BitTest, {Addr, BitPos});
32533 Result = Builder.CreateZExtOrTrunc(Result, AI->getType());
32534
32535 // If the result is only used for zero/non-zero status then we don't need to
32536    // shift the value back. Otherwise do so.
32537 for (auto It = I->user_begin(); It != I->user_end(); ++It) {
32538 if (auto *ICmp = dyn_cast<ICmpInst>(*It)) {
32539 if (ICmp->isEquality()) {
32540 auto *C0 = dyn_cast<ConstantInt>(ICmp->getOperand(0));
32541 auto *C1 = dyn_cast<ConstantInt>(ICmp->getOperand(1));
32542 if (C0 || C1) {
32543          assert(C0 == nullptr || C1 == nullptr);
32544 if ((C0 ? C0 : C1)->isZero())
32545 continue;
32546 }
32547 }
32548 }
32549 Result = Builder.CreateShl(Result, BitPos);
32550 break;
32551 }
32552 }
32553
32554 I->replaceAllUsesWith(Result);
32555 I->eraseFromParent();
32556 AI->eraseFromParent();
32557}
32558
32559static bool shouldExpandCmpArithRMWInIR(AtomicRMWInst *AI) {
32560 using namespace llvm::PatternMatch;
32561 if (!AI->hasOneUse())
32562 return false;
32563
32564 Value *Op = AI->getOperand(1);
32565 ICmpInst::Predicate Pred;
32566 Instruction *I = AI->user_back();
32567 AtomicRMWInst::BinOp Opc = AI->getOperation();
32568 if (Opc == AtomicRMWInst::Add) {
32569 if (match(I, m_c_ICmp(Pred, m_Sub(m_ZeroInt(), m_Specific(Op)), m_Value())))
32570 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
32571 if (match(I, m_OneUse(m_c_Add(m_Specific(Op), m_Value())))) {
32572 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
32573 return Pred == CmpInst::ICMP_SLT;
32574 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
32575 return Pred == CmpInst::ICMP_SGT;
32576 }
32577 return false;
32578 }
32579 if (Opc == AtomicRMWInst::Sub) {
32580 if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
32581 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
32582 if (match(I, m_OneUse(m_Sub(m_Value(), m_Specific(Op))))) {
32583 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
32584 return Pred == CmpInst::ICMP_SLT;
32585 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
32586 return Pred == CmpInst::ICMP_SGT;
32587 }
32588 return false;
32589 }
32590 if ((Opc == AtomicRMWInst::Or &&
32591 match(I, m_OneUse(m_c_Or(m_Specific(Op), m_Value())))) ||
32592 (Opc == AtomicRMWInst::And &&
32593 match(I, m_OneUse(m_c_And(m_Specific(Op), m_Value()))))) {
32594 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
32595 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE ||
32596 Pred == CmpInst::ICMP_SLT;
32597 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
32598 return Pred == CmpInst::ICMP_SGT;
32599 return false;
32600 }
32601 if (Opc == AtomicRMWInst::Xor) {
32602 if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
32603 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
32604 if (match(I, m_OneUse(m_c_Xor(m_Specific(Op), m_Value())))) {
32605 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
32606 return Pred == CmpInst::ICMP_SLT;
32607 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
32608 return Pred == CmpInst::ICMP_SGT;
32609 }
32610 return false;
32611 }
32612
32613 return false;
32614}
32615
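// One shape of IR accepted by shouldExpandCmpArithRMWInIR above (here for
// AtomicRMWInst::Sub):
//   %old = atomicrmw sub ptr %p, i32 %v seq_cst
//   %cmp = icmp eq i32 %old, %v
// The RMW and the compare are then folded into a single locked operation
// that sets EFLAGS via the x86_atomic_*_cc intrinsics emitted below.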
32616void X86TargetLowering::emitCmpArithAtomicRMWIntrinsic(
32617 AtomicRMWInst *AI) const {
32618 IRBuilder<> Builder(AI);
32619 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
32620 Instruction *TempI = nullptr;
32621 LLVMContext &Ctx = AI->getContext();
32622 ICmpInst *ICI = dyn_cast<ICmpInst>(AI->user_back());
32623 if (!ICI) {
32624 TempI = AI->user_back();
32625    assert(TempI->hasOneUse() && "Must have one use");
32626 ICI = cast<ICmpInst>(TempI->user_back());
32627 }
32628 X86::CondCode CC = X86::COND_INVALID;
32629 ICmpInst::Predicate Pred = ICI->getPredicate();
32630 switch (Pred) {
32631 default:
32632    llvm_unreachable("Not supported Pred");
32633 case CmpInst::ICMP_EQ:
32634 CC = X86::COND_E;
32635 break;
32636 case CmpInst::ICMP_NE:
32637 CC = X86::COND_NE;
32638 break;
32639 case CmpInst::ICMP_SLT:
32640 CC = X86::COND_S;
32641 break;
32642 case CmpInst::ICMP_SGT:
32643 CC = X86::COND_NS;
32644 break;
32645 }
32646 Intrinsic::ID IID = Intrinsic::not_intrinsic;
32647 switch (AI->getOperation()) {
32648 default:
32649    llvm_unreachable("Unknown atomic operation");
32650 case AtomicRMWInst::Add:
32651 IID = Intrinsic::x86_atomic_add_cc;
32652 break;
32653 case AtomicRMWInst::Sub:
32654 IID = Intrinsic::x86_atomic_sub_cc;
32655 break;
32656 case AtomicRMWInst::Or:
32657 IID = Intrinsic::x86_atomic_or_cc;
32658 break;
32659 case AtomicRMWInst::And:
32660 IID = Intrinsic::x86_atomic_and_cc;
32661 break;
32662 case AtomicRMWInst::Xor:
32663 IID = Intrinsic::x86_atomic_xor_cc;
32664 break;
32665 }
32666 Function *CmpArith =
32667 Intrinsic::getDeclaration(AI->getModule(), IID, AI->getType());
32668 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
32669 Type::getInt8PtrTy(Ctx));
32670 Value *Call = Builder.CreateCall(
32671 CmpArith, {Addr, AI->getValOperand(), Builder.getInt32((unsigned)CC)});
32672 Value *Result = Builder.CreateTrunc(Call, Type::getInt1Ty(Ctx));
32673 ICI->replaceAllUsesWith(Result);
32674 ICI->eraseFromParent();
32675 if (TempI)
32676 TempI->eraseFromParent();
32677 AI->eraseFromParent();
32678}
32679
32680TargetLowering::AtomicExpansionKind
32681X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
32682 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
32683 Type *MemType = AI->getType();
32684
32685 // If the operand is too big, we must see if cmpxchg8/16b is available
32686 // and default to library calls otherwise.
32687 if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
32688 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
32689 : AtomicExpansionKind::None;
32690 }
32691
32692 AtomicRMWInst::BinOp Op = AI->getOperation();
32693 switch (Op) {
32694 case AtomicRMWInst::Xchg:
32695 return AtomicExpansionKind::None;
32696 case AtomicRMWInst::Add:
32697 case AtomicRMWInst::Sub:
32698 if (shouldExpandCmpArithRMWInIR(AI))
32699 return AtomicExpansionKind::CmpArithIntrinsic;
32700 // It's better to use xadd, xsub or xchg for these in other cases.
32701 return AtomicExpansionKind::None;
32702 case AtomicRMWInst::Or:
32703 case AtomicRMWInst::And:
32704 case AtomicRMWInst::Xor:
32705 if (shouldExpandCmpArithRMWInIR(AI))
32706 return AtomicExpansionKind::CmpArithIntrinsic;
32707 return shouldExpandLogicAtomicRMWInIR(AI);
32708 case AtomicRMWInst::Nand:
32709 case AtomicRMWInst::Max:
32710 case AtomicRMWInst::Min:
32711 case AtomicRMWInst::UMax:
32712 case AtomicRMWInst::UMin:
32713 case AtomicRMWInst::FAdd:
32714 case AtomicRMWInst::FSub:
32715 case AtomicRMWInst::FMax:
32716 case AtomicRMWInst::FMin:
32717 case AtomicRMWInst::UIncWrap:
32718 case AtomicRMWInst::UDecWrap:
32719 default:
32720 // These always require a non-trivial set of data operations on x86. We must
32721 // use a cmpxchg loop.
32722 return AtomicExpansionKind::CmpXChg;
32723 }
32724}
32725
32726LoadInst *
32727X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
32728 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
32729 Type *MemType = AI->getType();
32730 // Accesses larger than the native width are turned into cmpxchg/libcalls, so
32731 // there is no benefit in turning such RMWs into loads, and it is actually
32732 // harmful as it introduces an mfence.
32733 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
32734 return nullptr;
32735
32736 // If this is a canonical idempotent atomicrmw w/no uses, we have a better
32737 // lowering available in lowerAtomicArith.
32738 // TODO: push more cases through this path.
32739 if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
32740 if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
32741 AI->use_empty())
32742 return nullptr;
32743
32744 IRBuilder<> Builder(AI);
32745 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
32746 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
32747 auto SSID = AI->getSyncScopeID();
32748 // We must restrict the ordering to avoid generating loads with Release or
32749 // ReleaseAcquire orderings.
32750 auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
32751
32752 // Before the load we need a fence. Here is an example lifted from
32753 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
32754 // is required:
32755 // Thread 0:
32756 // x.store(1, relaxed);
32757 // r1 = y.fetch_add(0, release);
32758 // Thread 1:
32759 // y.fetch_add(42, acquire);
32760 // r2 = x.load(relaxed);
32761 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
32762 // lowered to just a load without a fence. An mfence flushes the store buffer,
32763 // making the optimization clearly correct.
32764 // FIXME: it is required if isReleaseOrStronger(Order), but it is not clear
32765 // otherwise; we might be able to be more aggressive on relaxed idempotent
32766 // rmw. In practice, they do not look useful, so we don't try to be
32767 // especially clever.
32768 if (SSID == SyncScope::SingleThread)
32769 // FIXME: we could just insert an ISD::MEMBARRIER here, except we are at
32770 // the IR level, so we must wrap it in an intrinsic.
32771 return nullptr;
32772
32773 if (!Subtarget.hasMFence())
32774 // FIXME: it might make sense to use a locked operation here but on a
32775 // different cache-line to prevent cache-line bouncing. In practice it
32776 // is probably a small win, and x86 processors without mfence are rare
32777 // enough that we do not bother.
32778 return nullptr;
32779
32780 Function *MFence =
32781 llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
32782 Builder.CreateCall(MFence, {});
32783
32784 // Finally we can emit the atomic load.
32785 LoadInst *Loaded = Builder.CreateAlignedLoad(
32786 AI->getType(), AI->getPointerOperand(), AI->getAlign());
32787 Loaded->setAtomic(Order, SSID);
32788 AI->replaceAllUsesWith(Loaded);
32789 AI->eraseFromParent();
32790 return Loaded;
32791}
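// For instance, an `atomicrmw or ptr %p, i32 0 acquire` whose result is used
// is lowered here (system scope, MFENCE available) to roughly:
//   mfence
//   movl (%rdi), %eax    # an acquire atomic load of the same location
// which preserves the ordering discussed above without a locked RMW.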
32792
32793bool X86TargetLowering::lowerAtomicStoreAsStoreSDNode(const StoreInst &SI) const {
32794 if (!SI.isUnordered())
32795 return false;
32796 return ExperimentalUnorderedISEL;
32797}
32798bool X86TargetLowering::lowerAtomicLoadAsLoadSDNode(const LoadInst &LI) const {
32799 if (!LI.isUnordered())
32800 return false;
32801 return ExperimentalUnorderedISEL;
32802}
32803
32804
32805/// Emit a locked operation on a stack location which does not change any
32806/// memory location, but does involve a lock prefix. Location is chosen to be
32807/// a) very likely accessed only by a single thread to minimize cache traffic,
32808/// and b) definitely dereferenceable. Returns the new Chain result.
32809static SDValue emitLockedStackOp(SelectionDAG &DAG,
32810 const X86Subtarget &Subtarget, SDValue Chain,
32811 const SDLoc &DL) {
32812 // Implementation notes:
32813 // 1) LOCK prefix creates a full read/write reordering barrier for memory
32814 // operations issued by the current processor. As such, the location
32815 // referenced is not relevant for the ordering properties of the instruction.
32816 // See: Intel® 64 and IA-32 Architectures Software Developer's Manual,
32817 // 8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions
32818 // 2) Using an immediate operand appears to be the best encoding choice
32819 // here since it doesn't require an extra register.
32820 // 3) OR appears to be very slightly faster than ADD. (Though, the difference
32821 // is small enough it might just be measurement noise.)
32822 // 4) When choosing offsets, there are several contributing factors:
32823 // a) If there's no redzone, we default to TOS. (We could allocate a cache
32824 // line aligned stack object to improve this case.)
32825 // b) To minimize our chances of introducing a false dependence, we prefer
32826 // to offset the stack usage from TOS slightly.
32827 // c) To minimize concerns about cross thread stack usage - in particular,
32828 // the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which
32829 // captures state in the TOS frame and accesses it from many threads -
32830 // we want to use an offset such that the offset is in a distinct cache
32831 // line from the TOS frame.
32832 //
32833 // For a general discussion of the tradeoffs and benchmark results, see:
32834 // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
32835
32836 auto &MF = DAG.getMachineFunction();
32837 auto &TFL = *Subtarget.getFrameLowering();
32838 const unsigned SPOffset = TFL.has128ByteRedZone(MF) ? -64 : 0;
32839
32840 if (Subtarget.is64Bit()) {
32841 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
32842 SDValue Ops[] = {
32843 DAG.getRegister(X86::RSP, MVT::i64), // Base
32844 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
32845 DAG.getRegister(0, MVT::i64), // Index
32846 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
32847 DAG.getRegister(0, MVT::i16), // Segment.
32848 Zero,
32849 Chain};
32850 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
32851 MVT::Other, Ops);
32852 return SDValue(Res, 1);
32853 }
32854
32855 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
32856 SDValue Ops[] = {
32857 DAG.getRegister(X86::ESP, MVT::i32), // Base
32858 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
32859 DAG.getRegister(0, MVT::i32), // Index
32860 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
32861 DAG.getRegister(0, MVT::i16), // Segment.
32862 Zero,
32863 Chain
32864 };
32865 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
32866 MVT::Other, Ops);
32867 return SDValue(Res, 1);
32868}
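// The machine node built above selects to roughly `lock orl $0, -64(%rsp)`
// when x86-64 has a 128-byte red zone, and to `lock orl $0, (%rsp)` (or
// `(%esp)` in 32-bit mode) otherwise.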
32869
32870static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
32871 SelectionDAG &DAG) {
32872 SDLoc dl(Op);
32873 AtomicOrdering FenceOrdering =
32874 static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
32875 SyncScope::ID FenceSSID =
32876 static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
32877
32878 // The only fence that needs an instruction is a sequentially-consistent
32879 // cross-thread fence.
32880 if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
32881 FenceSSID == SyncScope::System) {
32882 if (Subtarget.hasMFence())
32883 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
32884
32885 SDValue Chain = Op.getOperand(0);
32886 return emitLockedStackOp(DAG, Subtarget, Chain, dl);
32887 }
32888
32889 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
32890 return DAG.getNode(ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
32891}
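// In effect, only a sequentially-consistent system-scope fence costs an
// instruction here: MFENCE when available, otherwise the locked stack OR
// above. Every other fence becomes a MEMBARRIER node, i.e. a compiler-only
// reordering barrier that emits no code.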
32892
32893static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
32894 SelectionDAG &DAG) {
32895 MVT T = Op.getSimpleValueType();
32896 SDLoc DL(Op);
32897 unsigned Reg = 0;
32898 unsigned size = 0;
32899 switch(T.SimpleTy) {
32900 default: llvm_unreachable("Invalid value type!");
32901 case MVT::i8: Reg = X86::AL; size = 1; break;
32902 case MVT::i16: Reg = X86::AX; size = 2; break;
32903 case MVT::i32: Reg = X86::EAX; size = 4; break;
32904 case MVT::i64:
32905 assert(Subtarget.is64Bit() && "Node not type legal!");
32906 Reg = X86::RAX; size = 8;
32907 break;
32908 }
32909 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
32910 Op.getOperand(2), SDValue());
32911 SDValue Ops[] = { cpIn.getValue(0),
32912 Op.getOperand(1),
32913 Op.getOperand(3),
32914 DAG.getTargetConstant(size, DL, MVT::i8),
32915 cpIn.getValue(1) };
32916 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
32917 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
32918 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
32919 Ops, T, MMO);
32920
32921 SDValue cpOut =
32922 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
32923 SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
32924 MVT::i32, cpOut.getValue(2));
32925 SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
32926
32927 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
32928 cpOut, Success, EFLAGS.getValue(1));
32929}
32930
32931// Create MOVMSKB, taking into account whether we need to split for AVX1.
32932static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG,
32933 const X86Subtarget &Subtarget) {
32934 MVT InVT = V.getSimpleValueType();
32935
32936 if (InVT == MVT::v64i8) {
32937 SDValue Lo, Hi;
32938 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
32939 Lo = getPMOVMSKB(DL, Lo, DAG, Subtarget);
32940 Hi = getPMOVMSKB(DL, Hi, DAG, Subtarget);
32941 Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo);
32942 Hi = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Hi);
32943 Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi,
32944 DAG.getConstant(32, DL, MVT::i8));
32945 return DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi);
32946 }
32947 if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) {
32948 SDValue Lo, Hi;
32949 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
32950 Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
32951 Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
32952 Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
32953 DAG.getConstant(16, DL, MVT::i8));
32954 return DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
32955 }
32956
32957 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
32958}
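// For example, a v32i8 input without AVX2 becomes two 128-bit MOVMSKs whose
// i32 results are recombined as Lo | (Hi << 16); the v64i8 case likewise
// combines two 32-bit masks into a single i64.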
32959
32960static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
32961 SelectionDAG &DAG) {
32962 SDValue Src = Op.getOperand(0);
32963 MVT SrcVT = Src.getSimpleValueType();
32964 MVT DstVT = Op.getSimpleValueType();
32965
32966 // Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each
32967 // half to v32i1 and concatenating the result.
32968 if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) {
32969 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
32970 assert(Subtarget.hasBWI() && "Expected BWI target");
32971 SDLoc dl(Op);
32972 SDValue Lo, Hi;
32973 std::tie(Lo, Hi) = DAG.SplitScalar(Src, dl, MVT::i32, MVT::i32);
32974 Lo = DAG.getBitcast(MVT::v32i1, Lo);
32975 Hi = DAG.getBitcast(MVT::v32i1, Hi);
32976 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
32977 }
32978
32979 // Use MOVMSK for vector to scalar conversion to prevent scalarization.
32980 if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
32981 assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
32982 MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8;
32983 SDLoc DL(Op);
32984 SDValue V = DAG.getSExtOrTrunc(Src, DL, SExtVT);
32985 V = getPMOVMSKB(DL, V, DAG, Subtarget);
32986 return DAG.getZExtOrTrunc(V, DL, DstVT);
32987 }
32988
32989 assert((SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
32990 SrcVT == MVT::i64) && "Unexpected VT!");
32991
32992 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
32993 if (!(DstVT == MVT::f64 && SrcVT == MVT::i64) &&
32994 !(DstVT == MVT::x86mmx && SrcVT.isVector()))
32995 // This conversion needs to be expanded.
32996 return SDValue();
32997
32998 SDLoc dl(Op);
32999 if (SrcVT.isVector()) {
33000 // Widen the input vector in the case of MVT::v2i32.
33001 // Example: from MVT::v2i32 to MVT::v4i32.
33002 MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(),
33003 SrcVT.getVectorNumElements() * 2);
33004 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src,
33005 DAG.getUNDEF(SrcVT));
33006 } else {
33007 assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
33008 "Unexpected source type in LowerBITCAST");
33009 Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
33010 }
33011
33012 MVT V2X64VT = DstVT == MVT::f64 ? MVT::v2f64 : MVT::v2i64;
33013 Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src);
33014
33015 if (DstVT == MVT::x86mmx)
33016 return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src);
33017
33018 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src,
33019 DAG.getIntPtrConstant(0, dl));
33020}
33021
33022/// Compute the horizontal sum of bytes in V for the elements of VT.
33023///
33024/// Requires V to be a byte vector and VT to be an integer vector type with
33025/// wider elements than V's type. The width of the elements of VT determines
33026/// how many bytes of V are summed horizontally to produce each element of the
33027/// result.
33028static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
33029 const X86Subtarget &Subtarget,
33030 SelectionDAG &DAG) {
33031 SDLoc DL(V);
33032 MVT ByteVecVT = V.getSimpleValueType();
33033 MVT EltVT = VT.getVectorElementType();
33034 assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
33035 "Expected value to have byte element type.");
33036 assert(EltVT != MVT::i8 &&
33037 "Horizontal byte sum only makes sense for wider elements!");
33038 unsigned VecSize = VT.getSizeInBits();
33039 assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
33040
33041 // The PSADBW instruction horizontally adds all bytes and leaves the result in
33042 // i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
33043 if (EltVT == MVT::i64) {
33044 SDValue Zeros = DAG.getConstant(0, DL, ByteVecVT);
33045 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
33046 V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
33047 return DAG.getBitcast(VT, V);
33048 }
33049
33050 if (EltVT == MVT::i32) {
33051 // We unpack the low half and high half into i32s interleaved with zeros so
33052 // that we can use PSADBW to horizontally sum them. The most useful part of
33053 // this is that it lines up the results of two PSADBW instructions to be
33054 // two v2i64 vectors which concatenated are the 4 population counts. We can
33055 // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
33056 SDValue Zeros = DAG.getConstant(0, DL, VT);
33057 SDValue V32 = DAG.getBitcast(VT, V);
33058 SDValue Low = getUnpackl(DAG, DL, VT, V32, Zeros);
33059 SDValue High = getUnpackh(DAG, DL, VT, V32, Zeros);
33060
33061 // Do the horizontal sums into two v2i64s.
33062 Zeros = DAG.getConstant(0, DL, ByteVecVT);
33063 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
33064 Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
33065 DAG.getBitcast(ByteVecVT, Low), Zeros);
33066 High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
33067 DAG.getBitcast(ByteVecVT, High), Zeros);
33068
33069 // Merge them together.
33070 MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
33071 V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
33072 DAG.getBitcast(ShortVecVT, Low),
33073 DAG.getBitcast(ShortVecVT, High));
33074
33075 return DAG.getBitcast(VT, V);
33076 }
33077
33078 // The only element type left is i16.
33079 assert(EltVT == MVT::i16 && "Unknown how to handle type");
33080
33081 // To obtain the pop count for each i16 element starting from the pop count of
33082 // its i8 elements, shift the i16s left by 8, sum as i8s, and then shift as
33083 // i16s right by 8. It is important to shift as i16s because an i8 vector
33084 // shift isn't directly supported.
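// Worked example for one i16 lane holding per-byte counts lo = 3, hi = 2
// (lane value 0x0203): the shl by 8 gives 0x0300, the i8 add with 0x0203
// gives 0x0503, and the i16 srl by 8 gives 0x0005 = 3 + 2.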
33085 SDValue ShifterV = DAG.getConstant(8, DL, VT);
33086 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
33087 V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
33088 DAG.getBitcast(ByteVecVT, V));
33089 return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
33090}
33091
33092static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
33093 const X86Subtarget &Subtarget,
33094 SelectionDAG &DAG) {
33095 MVT VT = Op.getSimpleValueType();
33096 MVT EltVT = VT.getVectorElementType();
33097 int NumElts = VT.getVectorNumElements();
33098 (void)EltVT;
33099 assert(EltVT == MVT::i8 && "Only vXi8 vector CTPOP lowering supported.");
33100
33101 // Implement a lookup table in register by using an algorithm based on:
33102 // http://wm.ite.pl/articles/sse-popcount.html
33103 //
33104 // The general idea is that every lower byte nibble in the input vector is an
33105 // index into an in-register pre-computed pop count table. We then split the
33106 // input vector into two new ones: (1) a vector with only the shifted-right
33107 // higher nibbles for each byte and (2) a vector with the lower nibbles (and
33108 // the higher ones masked out) for each byte. PSHUFB is used separately with
33109 // both to index the in-register table. Next, both are added, and the result
33110 // is an i8 vector where each element contains the pop count for its input byte.
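// Worked example: for an input byte 0xB7 (0b10110111), the high nibble 0xB
// looks up 3 and the low nibble 0x7 looks up 3, so the sum 6 equals
// ctpop(0xB7).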
33111 const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
33112 /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
33113 /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
33114 /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
33115
33116 SmallVector<SDValue, 64> LUTVec;
33117 for (int i = 0; i < NumElts; ++i)
33118 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
33119 SDValue InRegLUT = DAG.getBuildVector(VT, DL, LUTVec);
33120 SDValue M0F = DAG.getConstant(0x0F, DL, VT);
33121
33122 // High nibbles
33123 SDValue FourV = DAG.getConstant(4, DL, VT);
33124 SDValue HiNibbles = DAG.getNode(ISD::SRL, DL, VT, Op, FourV);
33125
33126 // Low nibbles
33127 SDValue LoNibbles = DAG.getNode(ISD::AND, DL, VT, Op, M0F);
33128
33129 // The input vector is used as the shuffle mask that index elements into the
33130 // LUT. After counting low and high nibbles, add the vector to obtain the
33131 // final pop count per i8 element.
33132 SDValue HiPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, HiNibbles);
33133 SDValue LoPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, LoNibbles);
33134 return DAG.getNode(ISD::ADD, DL, VT, HiPopCnt, LoPopCnt);
33135}
33136
33137// Please ensure that any codegen change from LowerVectorCTPOP is reflected in
33138// updated cost models in X86TTIImpl::getIntrinsicInstrCost.
33139static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
33140 SelectionDAG &DAG) {
33141 MVT VT = Op.getSimpleValueType();
33142 assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
33143 "Unknown CTPOP type to handle");
33144 SDLoc DL(Op.getNode());
33145 SDValue Op0 = Op.getOperand(0);
33146
33147 // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
33148 if (Subtarget.hasVPOPCNTDQ()) {
33149 unsigned NumElems = VT.getVectorNumElements();
33150 assert((VT.getVectorElementType() == MVT::i8 ||
33151 VT.getVectorElementType() == MVT::i16) && "Unexpected type");
33152 if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
33153 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
33154 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
33155 Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
33156 return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
33157 }
33158 }
33159
33160 // Decompose 256-bit ops into smaller 128-bit ops.
33161 if (VT.is256BitVector() && !Subtarget.hasInt256())
33162 return splitVectorIntUnary(Op, DAG);
33163
33164 // Decompose 512-bit ops into smaller 256-bit ops.
33165 if (VT.is512BitVector() && !Subtarget.hasBWI())
33166 return splitVectorIntUnary(Op, DAG);
33167
33168 // For element types greater than i8, do vXi8 pop counts and a bytesum.
33169 if (VT.getScalarType() != MVT::i8) {
33170 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
33171 SDValue ByteOp = DAG.getBitcast(ByteVT, Op0);
33172 SDValue PopCnt8 = DAG.getNode(ISD::CTPOP, DL, ByteVT, ByteOp);
33173 return LowerHorizontalByteSum(PopCnt8, VT, Subtarget, DAG);
33174 }
33175
33176 // We can't use the fast LUT approach, so fall back on LegalizeDAG.
33177 if (!Subtarget.hasSSSE3())
33178 return SDValue();
33179
33180 return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
33181}
33182
33183static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
33184 SelectionDAG &DAG) {
33185 assert(Op.getSimpleValueType().isVector() &&
33186 "We only do custom lowering for vector population count.");
33187 return LowerVectorCTPOP(Op, Subtarget, DAG);
33188}
33189
33190static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
33191 MVT VT = Op.getSimpleValueType();
33192 SDValue In = Op.getOperand(0);
33193 SDLoc DL(Op);
33194
33195 // For scalars, it's still beneficial to transfer to/from the SIMD unit to
33196 // perform the BITREVERSE.
33197 if (!VT.isVector()) {
33198 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
33199 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
33200 Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
33201 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
33202 DAG.getIntPtrConstant(0, DL));
33203 }
33204
33205 int NumElts = VT.getVectorNumElements();
33206 int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
33207
33208 // Decompose 256-bit ops into smaller 128-bit ops.
33209 if (VT.is256BitVector())
33210 return splitVectorIntUnary(Op, DAG);
33211
33212 assert(VT.is128BitVector() &&
33213 "Only 128-bit vector bitreverse lowering supported.");
33214
33215 // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
33216 // perform the BSWAP in the shuffle.
33217 // It's best to shuffle using the second operand as this will implicitly allow
33218 // memory folding for multiple vectors.
33219 SmallVector<SDValue, 16> MaskElts;
33220 for (int i = 0; i != NumElts; ++i) {
33221 for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
33222 int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
33223 int PermuteByte = SourceByte | (2 << 5);
33224 MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
33225 }
33226 }
33227
33228 SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
33229 SDValue Res = DAG.getBitcast(MVT::v16i8, In);
33230 Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
33231 Res, Mask);
33232 return DAG.getBitcast(VT, Res);
33233}
33234
33235static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
33236 SelectionDAG &DAG) {
33237 MVT VT = Op.getSimpleValueType();
33238
33239 if (Subtarget.hasXOP() && !VT.is512BitVector())
33240 return LowerBITREVERSE_XOP(Op, DAG);
33241
33242 assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
33243
33244 SDValue In = Op.getOperand(0);
33245 SDLoc DL(Op);
33246
33247 assert(VT.getScalarType() == MVT::i8 &&
33248 "Only byte vector BITREVERSE supported");
33249
33250 // Split v64i8 without BWI so that we can still use the PSHUFB lowering.
33251 if (VT == MVT::v64i8 && !Subtarget.hasBWI())
33252 return splitVectorIntUnary(Op, DAG);
33253
33254 // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
33255 if (VT == MVT::v32i8 && !Subtarget.hasInt256())
33256 return splitVectorIntUnary(Op, DAG);
33257
33258 unsigned NumElts = VT.getVectorNumElements();
33259
33260 // If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits.
33261 if (Subtarget.hasGFNI()) {
33262 MVT MatrixVT = MVT::getVectorVT(MVT::i64, NumElts / 8);
33263 SDValue Matrix = DAG.getConstant(0x8040201008040201ULL, DL, MatrixVT);
33264 Matrix = DAG.getBitcast(VT, Matrix);
33265 return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, In, Matrix,
33266 DAG.getTargetConstant(0, DL, MVT::i8));
33267 }
33268
33269 // Perform BITREVERSE using PSHUFB lookups. Each byte is split into its two
33270 // nibbles, and a PSHUFB lookup finds the bitreverse of each 0-15 value
33271 // (moved to the other nibble).
33272 SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
33273 SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
33274 SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
33275
33276 const int LoLUT[16] = {
33277 /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
33278 /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
33279 /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
33280 /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
33281 const int HiLUT[16] = {
33282 /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
33283 /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
33284 /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
33285 /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
33286
33287 SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
33288 for (unsigned i = 0; i < NumElts; ++i) {
33289 LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
33290 HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
33291 }
33292
33293 SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
33294 SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
33295 Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
33296 Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
33297 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
33298}
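// Worked example: to reverse the byte 0x1E (0b00011110), the low nibble 0xE
// indexes LoLUT to 0x70 and the high nibble 0x1 indexes HiLUT to 0x08;
// 0x70 | 0x08 = 0x78 = 0b01111000, the bit-reversed byte.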
33299
33300static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget,
33301 SelectionDAG &DAG) {
33302 SDLoc DL(Op);
33303 SDValue X = Op.getOperand(0);
33304 MVT VT = Op.getSimpleValueType();
33305
33306 // Special case. If the input fits in 8-bits we can use a single 8-bit TEST.
33307 if (VT == MVT::i8 ||
33308 DAG.MaskedValueIsZero(X, APInt::getBitsSetFrom(VT.getSizeInBits(), 8))) {
33309 X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
33310 SDValue Flags = DAG.getNode(X86ISD::CMP, DL, MVT::i32, X,
33311 DAG.getConstant(0, DL, MVT::i8));
33312 // Copy the inverse of the parity flag into a register with setcc.
33313 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
33314 // Extend to the original type.
33315 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
33316 }
33317
33318 // If we have POPCNT, use the default expansion.
33319 if (Subtarget.hasPOPCNT())
33320 return SDValue();
33321
33322 if (VT == MVT::i64) {
33323 // Xor the high and low 32-bit halves together using a 32-bit operation.
33324 SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
33325 DAG.getNode(ISD::SRL, DL, MVT::i64, X,
33326 DAG.getConstant(32, DL, MVT::i8)));
33327 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
33328 X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi);
33329 }
33330
33331 if (VT != MVT::i16) {
33332 // Xor the high and low 16-bits together using a 32-bit operation.
33333 SDValue Hi16 = DAG.getNode(ISD::SRL, DL, MVT::i32, X,
33334 DAG.getConstant(16, DL, MVT::i8));
33335 X = DAG.getNode(ISD::XOR, DL, MVT::i32, X, Hi16);
33336 } else {
33337 // If the input is 16-bits, we need to extend to use an i32 shift below.
33338 X = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, X);
33339 }
33340
33341 // Finally xor the low 2 bytes together and use an 8-bit flag-setting xor.
33342 // This should allow an h-reg to be used to save a shift.
33343 SDValue Hi = DAG.getNode(
33344 ISD::TRUNCATE, DL, MVT::i8,
33345 DAG.getNode(ISD::SRL, DL, MVT::i32, X, DAG.getConstant(8, DL, MVT::i8)));
33346 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
33347 SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32);
33348 SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1);
33349
33350 // Copy the inverse of the parity flag into a register with setcc.
33351 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
33352 // Extend to the original type.
33353 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
33354}
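// Note on the fallback path above: x86 sets PF when the low byte of a result
// contains an even number of set bits, so after XOR-folding the value down to
// a single byte, SETNP returns 1 exactly when the original value had an odd
// number of set bits, which is its parity.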
33355
33356static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
33357 const X86Subtarget &Subtarget) {
33358 unsigned NewOpc = 0;
33359 switch (N->getOpcode()) {
33360 case ISD::ATOMIC_LOAD_ADD:
33361 NewOpc = X86ISD::LADD;
33362 break;
33363 case ISD::ATOMIC_LOAD_SUB:
33364 NewOpc = X86ISD::LSUB;
33365 break;
33366 case ISD::ATOMIC_LOAD_OR:
33367 NewOpc = X86ISD::LOR;
33368 break;
33369 case ISD::ATOMIC_LOAD_XOR:
33370 NewOpc = X86ISD::LXOR;
33371 break;
33372 case ISD::ATOMIC_LOAD_AND:
33373 NewOpc = X86ISD::LAND;
33374 break;
33375 default:
33376 llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
33377 }
33378
33379 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
33380
33381 return DAG.getMemIntrinsicNode(
33382 NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
33383 {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
33384 /*MemVT=*/N->getSimpleValueType(0), MMO);
33385}
33386
33387/// Lower atomic_load_ops into LOCK-prefixed operations.
33388static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
33389 const X86Subtarget &Subtarget) {
33390 AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
33391 SDValue Chain = N->getOperand(0);
33392 SDValue LHS = N->getOperand(1);
33393 SDValue RHS = N->getOperand(2);
33394 unsigned Opc = N->getOpcode();
33395 MVT VT = N->getSimpleValueType(0);
33396 SDLoc DL(N);
33397
33398 // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
33399 // can only be lowered when the result is unused. They should have already
33400 // been transformed into a cmpxchg loop in AtomicExpand.
33401 if (N->hasAnyUseOfValue(0)) {
33402 // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
33403 // select LXADD if LOCK_SUB can't be selected.
33404 // Handle (atomic_load_xor p, SignBit) as (atomic_load_add p, SignBit) so we
33405 // can use LXADD as opposed to cmpxchg.
33406 if (Opc == ISD::ATOMIC_LOAD_SUB ||
33407 (Opc == ISD::ATOMIC_LOAD_XOR && isMinSignedConstant(RHS))) {
33408 RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
33409 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS, RHS,
33410 AN->getMemOperand());
33411 }
33412 assert(Opc == ISD::ATOMIC_LOAD_ADD &&
33413 "Used AtomicRMW ops other than Add should have been expanded!");
33414 return N;
33415 }
33416
33417 // Specialized lowering for the canonical form of an idempotent atomicrmw.
33418 // The core idea here is that since the memory location isn't actually
33419 // changing, all we need is a lowering for the *ordering* impacts of the
33420 // atomicrmw. As such, we can choose a different operation and memory
33421 // location to minimize impact on other code.
33422 if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS)) {
33423 // On X86, the only ordering which actually requires an instruction is
33424 // seq_cst which isn't SingleThread, everything just needs to be preserved
33425 // during codegen and then dropped. Note that we expect (but don't assume)
33426 // that orderings other than seq_cst and acq_rel have been canonicalized to
33427 // a store or load.
33428 if (AN->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent &&
33429 AN->getSyncScopeID() == SyncScope::System) {
33430 // Prefer a locked operation against a stack location to minimize cache
33431 // traffic. This assumes that stack locations are very likely to be
33432 // accessed only by the owning thread.
33433 SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL);
33434 assert(!N->hasAnyUseOfValue(0));
33435 // NOTE: The getUNDEF is needed to give something for the unused result 0.
33436 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
33437 DAG.getUNDEF(VT), NewChain);
33438 }
33439 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
33440 SDValue NewChain = DAG.getNode(ISD::MEMBARRIER, DL, MVT::Other, Chain);
33441 assert(!N->hasAnyUseOfValue(0));
33442 // NOTE: The getUNDEF is needed to give something for the unused result 0.
33443 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
33444 DAG.getUNDEF(VT), NewChain);
33445 }
33446
33447 SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
33448 // RAUW the chain, but don't worry about the result, as it's unused.
33449 assert(!N->hasAnyUseOfValue(0));
33450 // NOTE: The getUNDEF is needed to give something for the unused result 0.
33451 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
33452 DAG.getUNDEF(VT), LockOp.getValue(1));
33453}
33454
33455static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
33456 const X86Subtarget &Subtarget) {
33457 auto *Node = cast<AtomicSDNode>(Op.getNode());
33458 SDLoc dl(Node);
33459 EVT VT = Node->getMemoryVT();
33460
33461 bool IsSeqCst =
33462 Node->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent;
33463 bool IsTypeLegal = DAG.getTargetLoweringInfo().isTypeLegal(VT);
33464
33465 // If this store is not sequentially consistent and the type is legal
33466 // we can just keep it.
33467 if (!IsSeqCst && IsTypeLegal)
33468 return Op;
33469
33470 if (VT == MVT::i64 && !IsTypeLegal) {
33471 // For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE
33472 // is enabled.
33473 bool NoImplicitFloatOps =
33474 DAG.getMachineFunction().getFunction().hasFnAttribute(
33475 Attribute::NoImplicitFloat);
33476 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
33477 SDValue Chain;
33478 if (Subtarget.hasSSE1()) {
33479 SDValue SclToVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
33480 Node->getOperand(2));
33481 MVT StVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
33482 SclToVec = DAG.getBitcast(StVT, SclToVec);
33483 SDVTList Tys = DAG.getVTList(MVT::Other);
33484 SDValue Ops[] = {Node->getChain(), SclToVec, Node->getBasePtr()};
33485 Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops,
33486 MVT::i64, Node->getMemOperand());
33487 } else if (Subtarget.hasX87()) {
33488 // First load this into an 80-bit X87 register using a stack temporary.
33489 // This will put the whole integer into the significand.
33490 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
33491 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
33492 MachinePointerInfo MPI =
33493 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
33494 Chain =
33495 DAG.getStore(Node->getChain(), dl, Node->getOperand(2), StackPtr,
33496 MPI, MaybeAlign(), MachineMemOperand::MOStore);
33497 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
33498 SDValue LdOps[] = {Chain, StackPtr};
33499 SDValue Value = DAG.getMemIntrinsicNode(
33500 X86ISD::FILD, dl, Tys, LdOps, MVT::i64, MPI,
33501 /*Align*/ std::nullopt, MachineMemOperand::MOLoad);
33502 Chain = Value.getValue(1);
33503
33504 // Now use an FIST to do the atomic store.
33505 SDValue StoreOps[] = {Chain, Value, Node->getBasePtr()};
33506 Chain =
33507 DAG.getMemIntrinsicNode(X86ISD::FIST, dl, DAG.getVTList(MVT::Other),
33508 StoreOps, MVT::i64, Node->getMemOperand());
33509 }
33510
33511 if (Chain) {
33512 // If this is a sequentially consistent store, also emit an appropriate
33513 // barrier.
33514 if (IsSeqCst)
33515 Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
33516
33517 return Chain;
33518 }
33519 }
33520 }
33521
33522 // Convert seq_cst store -> xchg
33523 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
33524 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
33525 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
33526 Node->getMemoryVT(),
33527 Node->getOperand(0),
33528 Node->getOperand(1), Node->getOperand(2),
33529 Node->getMemOperand());
33530 return Swap.getValue(1);
33531}
33532
33533static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) {
33534 SDNode *N = Op.getNode();
33535 MVT VT = N->getSimpleValueType(0);
33536 unsigned Opc = Op.getOpcode();
33537
33538 // Let legalize expand this if it isn't a legal type yet.
33539 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
33540 return SDValue();
33541
33542 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
33543 SDLoc DL(N);
33544
33545 // Set the carry flag.
33546 SDValue Carry = Op.getOperand(2);
33547 EVT CarryVT = Carry.getValueType();
33548 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
33549 Carry, DAG.getAllOnesConstant(DL, CarryVT));
33550
33551 bool IsAdd = Opc == ISD::UADDO_CARRY || Opc == ISD::SADDO_CARRY;
33552 SDValue Sum = DAG.getNode(IsAdd ? X86ISD::ADC : X86ISD::SBB, DL, VTs,
33553 Op.getOperand(0), Op.getOperand(1),
33554 Carry.getValue(1));
33555
33556 bool IsSigned = Opc == ISD::SADDO_CARRY || Opc == ISD::SSUBO_CARRY;
33557 SDValue SetCC = getSETCC(IsSigned ? X86::COND_O : X86::COND_B,
33558 Sum.getValue(1), DL, DAG);
33559 if (N->getValueType(1) == MVT::i1)
33560 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
33561
33562 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
33563}
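// The X86ISD::ADD of an all-ones constant above re-materializes the incoming
// carry into EFLAGS: CF ends up set exactly when the carry operand is nonzero,
// which is what the subsequent ADC/SBB consumes.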
33564
33565static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
33566 SelectionDAG &DAG) {
33567 assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
33568
33569 // For MacOSX, we want to call an alternative entry point: __sincos_stret,
33570 // which returns the values as { float, float } (in XMM0) or
33571 // { double, double } (which is returned in XMM0, XMM1).
33572 SDLoc dl(Op);
33573 SDValue Arg = Op.getOperand(0);
33574 EVT ArgVT = Arg.getValueType();
33575 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
33576
33577 TargetLowering::ArgListTy Args;
33578 TargetLowering::ArgListEntry Entry;
33579
33580 Entry.Node = Arg;
33581 Entry.Ty = ArgTy;
33582 Entry.IsSExt = false;
33583 Entry.IsZExt = false;
33584 Args.push_back(Entry);
33585
33586 bool isF64 = ArgVT == MVT::f64;
33587 // Only optimize x86_64 for now. i386 is a bit messy. For f32,
33588 // the small struct {f32, f32} is returned in (eax, edx). For f64,
33589 // the results are returned via SRet in memory.
33590 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33591 RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
33592 const char *LibcallName = TLI.getLibcallName(LC);
33593 SDValue Callee =
33594 DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
33595
33596 Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
33597 : (Type *)FixedVectorType::get(ArgTy, 4);
33598
33599 TargetLowering::CallLoweringInfo CLI(DAG);
33600 CLI.setDebugLoc(dl)
33601 .setChain(DAG.getEntryNode())
33602 .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
33603
33604 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
33605
33606 if (isF64)
33607 // Returned in xmm0 and xmm1.
33608 return CallResult.first;
33609
33610 // Returned in bits 0:31 and 32:63 of xmm0.
33611 SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
33612 CallResult.first, DAG.getIntPtrConstant(0, dl));
33613 SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
33614 CallResult.first, DAG.getIntPtrConstant(1, dl));
33615 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
33616 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
33617}
33618
33619/// Widen a vector input to a vector of NVT. The
33620/// input vector must have the same element type as NVT.
33621static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
33622 bool FillWithZeroes = false) {
33623 // Check if InOp already has the right width.
33624 MVT InVT = InOp.getSimpleValueType();
33625 if (InVT == NVT)
33626 return InOp;
33627
33628 if (InOp.isUndef())
33629 return DAG.getUNDEF(NVT);
33630
33631 assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
33632 "input and widen element type must match");
33633
33634 unsigned InNumElts = InVT.getVectorNumElements();
33635 unsigned WidenNumElts = NVT.getVectorNumElements();
33636 assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
33637 "Unexpected request for vector widening");
33638
33639 SDLoc dl(InOp);
33640 if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
33641 InOp.getNumOperands() == 2) {
33642 SDValue N1 = InOp.getOperand(1);
33643 if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
33644 N1.isUndef()) {
33645 InOp = InOp.getOperand(0);
33646 InVT = InOp.getSimpleValueType();
33647 InNumElts = InVT.getVectorNumElements();
33648 }
33649 }
33650 if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
33651 ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
33652 SmallVector<SDValue, 16> Ops;
33653 for (unsigned i = 0; i < InNumElts; ++i)
33654 Ops.push_back(InOp.getOperand(i));
33655
33656 EVT EltVT = InOp.getOperand(0).getValueType();
33657
33658 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
33659 DAG.getUNDEF(EltVT);
33660 for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
33661 Ops.push_back(FillVal);
33662 return DAG.getBuildVector(NVT, dl, Ops);
33663 }
33664 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
33665 DAG.getUNDEF(NVT);
33666 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
33667 InOp, DAG.getIntPtrConstant(0, dl));
33668}
33669
33670static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
33671 SelectionDAG &DAG) {
33672 assert(Subtarget.hasAVX512() &&
33673 "MGATHER/MSCATTER are supported on AVX-512 arch only");
33674
33675 MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
33676 SDValue Src = N->getValue();
33677 MVT VT = Src.getSimpleValueType();
33678 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
33679 SDLoc dl(Op);
33680
33681 SDValue Scale = N->getScale();
33682 SDValue Index = N->getIndex();
33683 SDValue Mask = N->getMask();
33684 SDValue Chain = N->getChain();
33685 SDValue BasePtr = N->getBasePtr();
33686
33687 if (VT == MVT::v2f32 || VT == MVT::v2i32) {
33688 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
33689 // If the index is v2i64 and we have VLX we can use xmm for data and index.
33690 if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
33691 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33692 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
33693 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src, DAG.getUNDEF(VT));
33694 SDVTList VTs = DAG.getVTList(MVT::Other);
33695 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
33696 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
33697 N->getMemoryVT(), N->getMemOperand());
33698 }
33699 return SDValue();
33700 }
33701
33702 MVT IndexVT = Index.getSimpleValueType();
33703
33704 // If the index is v2i32, we're being called by type legalization and we
33705 // should just let the default handling take care of it.
33706 if (IndexVT == MVT::v2i32)
33707 return SDValue();
33708
33709 // If we don't have VLX and neither the passthru nor the index is 512 bits,
33710 // we need to widen until one is.
33711 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
33712 !Index.getSimpleValueType().is512BitVector()) {
33713 // Determine how much we need to widen by to get a 512-bit type.
33714 unsigned Factor = std::min(512/VT.getSizeInBits(),
33715 512/IndexVT.getSizeInBits());
33716 unsigned NumElts = VT.getVectorNumElements() * Factor;
33717
33718 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
33719 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
33720 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
33721
33722 Src = ExtendToType(Src, VT, DAG);
33723 Index = ExtendToType(Index, IndexVT, DAG);
33724 Mask = ExtendToType(Mask, MaskVT, DAG, true);
33725 }
33726
33727 SDVTList VTs = DAG.getVTList(MVT::Other);
33728 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
33729 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
33730 N->getMemoryVT(), N->getMemOperand());
33731}
33732
33733static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
33734 SelectionDAG &DAG) {
33735
33736 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
33737 MVT VT = Op.getSimpleValueType();
33738 MVT ScalarVT = VT.getScalarType();
33739 SDValue Mask = N->getMask();
33740 MVT MaskVT = Mask.getSimpleValueType();
33741 SDValue PassThru = N->getPassThru();
33742 SDLoc dl(Op);
33743
33744 // Handle AVX masked loads which don't support passthru other than 0.
33745 if (MaskVT.getVectorElementType() != MVT::i1) {
33746 // We also allow undef in the isel pattern.
33747 if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode()))
33748 return Op;
33749
33750 SDValue NewLoad = DAG.getMaskedLoad(
33751 VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
33752 getZeroVector(VT, Subtarget, DAG, dl), N->getMemoryVT(),
33753 N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(),
33754 N->isExpandingLoad());
33755 // Emit a blend.
33756 SDValue Select = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
33757 return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl);
33758 }
33759
33760 assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
33761 "Expanding masked load is supported on AVX-512 target only!");
33762
33763 assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
33764 "Expanding masked load is supported for 32 and 64-bit types only!");
33765
33766 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
33767 "Cannot lower masked load op.");
33768
33769 assert((ScalarVT.getSizeInBits() >= 32 ||
33770 (Subtarget.hasBWI() &&
33771 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
33772 "Unsupported masked load op.");
33773
33774 // This operation is legal for targets with VLX, but without
33775 // VLX the vector should be widened to 512 bits.
33776 unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
33777 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
33778 PassThru = ExtendToType(PassThru, WideDataVT, DAG);
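// For example, a v8f32 masked load becomes a v16f32 load here: the passthru is
// widened with undef, while the mask below is widened with zeros so the extra
// lanes stay inactive; the original v8f32 value is extracted again at the end.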
33779
33780 // Mask element has to be i1.
33781 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
33782 "Unexpected mask type");
33783
33784 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
33785
33786 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
33787 SDValue NewLoad = DAG.getMaskedLoad(
33788 WideDataVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
33789 PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
33790 N->getExtensionType(), N->isExpandingLoad());
33791
33792 SDValue Extract =
33793 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, NewLoad.getValue(0),
33794 DAG.getIntPtrConstant(0, dl));
33795 SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
33796 return DAG.getMergeValues(RetOps, dl);
33797}
33798
33799static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
33800 SelectionDAG &DAG) {
33801 MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
33802 SDValue DataToStore = N->getValue();
33803 MVT VT = DataToStore.getSimpleValueType();
33804 MVT ScalarVT = VT.getScalarType();
33805 SDValue Mask = N->getMask();
33806 SDLoc dl(Op);
33807
33808 assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
33809 "Expanding masked load is supported on AVX-512 target only!");
33810
33811 assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
33812 "Expanding masked load is supported for 32 and 64-bit types only!");
33813
33814 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
33815 "Cannot lower masked store op.");
33816
33817 assert((ScalarVT.getSizeInBits() >= 32 ||
33818 (Subtarget.hasBWI() &&
33819 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
33820 "Unsupported masked store op.");
33821
33822 // This operation is legal for targets with VLX, but without
33823 // VLX the vector should be widened to 512 bits.
33824 unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
33825 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
33826
33827 // Mask element has to be i1.
33828 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
33829 "Unexpected mask type");
33830
33831 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
33832
33833 DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
33834 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
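// The widened mask bits are zero, so the extra lanes added to DataToStore
// (which are undef) are never written to memory.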
33835 return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
33836 N->getOffset(), Mask, N->getMemoryVT(),
33837 N->getMemOperand(), N->getAddressingMode(),
33838 N->isTruncatingStore(), N->isCompressingStore());
33839}
33840
33841static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
33842 SelectionDAG &DAG) {
33843 assert(Subtarget.hasAVX2() &&
33844 "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
33845
33846 MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
33847 SDLoc dl(Op);
33848 MVT VT = Op.getSimpleValueType();
33849 SDValue Index = N->getIndex();
33850 SDValue Mask = N->getMask();
33851 SDValue PassThru = N->getPassThru();
33852 MVT IndexVT = Index.getSimpleValueType();
33853
33854 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
33855
33856 // If the index is v2i32, we're being called by type legalization.
33857 if (IndexVT == MVT::v2i32)
33858 return SDValue();
33859
33860 // If we don't have VLX and neither the passthru nor the index is 512 bits,
33861 // we need to widen until one is.
33862 MVT OrigVT = VT;
33863 if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
33864 !IndexVT.is512BitVector()) {
33865 // Determine how much we need to widen by to get a 512-bit type.
33866 unsigned Factor = std::min(512/VT.getSizeInBits(),
33867 512/IndexVT.getSizeInBits());
33868
33869 unsigned NumElts = VT.getVectorNumElements() * Factor;
33870
33871 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
33872 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
33873 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
33874
33875 PassThru = ExtendToType(PassThru, VT, DAG);
33876 Index = ExtendToType(Index, IndexVT, DAG);
33877 Mask = ExtendToType(Mask, MaskVT, DAG, true);
33878 }
33879
33880 // Break dependency on the data register.
33881 if (PassThru.isUndef())
33882 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
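// x86 gathers merge into the destination register under the mask, so an undef
// passthru would leave a false dependency on the register's previous contents;
// zeroing it breaks that dependency.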
33883
33884 SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index,
33885 N->getScale() };
33886 SDValue NewGather = DAG.getMemIntrinsicNode(
33887 X86ISD::MGATHER, dl, DAG.getVTList(VT, MVT::Other), Ops, N->getMemoryVT(),
33888 N->getMemOperand());
33889 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT,
33890 NewGather, DAG.getIntPtrConstant(0, dl));
33891 return DAG.getMergeValues({Extract, NewGather.getValue(1)}, dl);
33892}
33893
33894static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) {
33895 SDLoc dl(Op);
33896 SDValue Src = Op.getOperand(0);
33897 MVT DstVT = Op.getSimpleValueType();
33898
33899 AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op.getNode());
33900 unsigned SrcAS = N->getSrcAddressSpace();
33901
33902 assert(SrcAS != N->getDestAddressSpace() &&
33903 "addrspacecast must be between different address spaces");
33904
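// Mixed 32-/64-bit pointer casts: a 32-bit unsigned pointer (ptr32_uptr) is
// zero-extended to i64, any other 32-bit pointer (e.g. ptr32_sptr) is
// sign-extended, and a 64-bit pointer is truncated when the result is i32.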
33905 if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64) {
33906 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src);
33907 } else if (DstVT == MVT::i64) {
33908 Op = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src);
33909 } else if (DstVT == MVT::i32) {
33910 Op = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src);
33911 } else {
33912 report_fatal_error("Bad address space in addrspacecast");
33913 }
33914 return Op;
33915}
33916
33917SDValue X86TargetLowering::LowerGC_TRANSITION(SDValue Op,
33918 SelectionDAG &DAG) const {
33919 // TODO: Eventually, the lowering of these nodes should be informed by or
33920 // deferred to the GC strategy for the function in which they appear. For
33921 // now, however, they must be lowered to something. Since they are logically
33922 // no-ops in the case of a null GC strategy (or a GC strategy which does not
33923 // require special handling for these nodes), lower them as literal NOOPs for
33924 // the time being.
33925 SmallVector<SDValue, 2> Ops;
33926 Ops.push_back(Op.getOperand(0));
33927 if (Op->getGluedNode())
33928 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
33929
33930 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
33931 return SDValue(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
33932}
33933
33934// Custom split CVTPS2PH with wide types.
33935static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG) {
33936 SDLoc dl(Op);
33937 EVT VT = Op.getValueType();
33938 SDValue Lo, Hi;
33939 std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
33940 EVT LoVT, HiVT;
33941 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
33942 SDValue RC = Op.getOperand(1);
33943 Lo = DAG.getNode(X86ISD::CVTPS2PH, dl, LoVT, Lo, RC);
33944 Hi = DAG.getNode(X86ISD::CVTPS2PH, dl, HiVT, Hi, RC);
33945 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33946}
33947
33948static StringRef getInstrStrFromOpNo(const SmallVectorImpl<StringRef> &AsmStrs,
33949 unsigned OpNo) {
33950 const APInt Operand(32, OpNo);
33951 std::string OpNoStr = llvm::toString(Operand, 10, false);
33952 std::string Str(" $");
33953
33954 std::string OpNoStr1(Str + OpNoStr); // e.g. " $1" (OpNo=1)
33955 std::string OpNoStr2(Str + "{" + OpNoStr + ":"); // With modifier, e.g. ${1:P}
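// For OpNo = 1 this searches for " $1" (as a suffix or followed by ',') or
// for " ${1:", so that an operand like "$12" is not mistaken for "$1".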
33956
33957 auto I = StringRef::npos;
33958 for (auto &AsmStr : AsmStrs) {
33959 // Match the OpNo string exactly so we don't match a sub-string,
33960 // e.g. "$12" contains "$1".
33961 if (AsmStr.endswith(OpNoStr1))
33962 I = AsmStr.size() - OpNoStr1.size();
33963
33964 // Get the index of operand in AsmStr.
33965 if (I == StringRef::npos)
33966 I = AsmStr.find(OpNoStr1 + ",");
33967 if (I == StringRef::npos)
33968 I = AsmStr.find(OpNoStr2);
33969
33970 if (I == StringRef::npos)
33971 continue;
33972
33973 assert(I > 0 && "Unexpected inline asm string!");
33974 // Remove the operand string and label (if they exist).
33975 // For example:
33976 // ".L__MSASMLABEL_.${:uid}__l:call dword ptr ${0:P}"
33977 // ==>
33978 // ".L__MSASMLABEL_.${:uid}__l:call dword ptr "
33979 // ==>
33980 // "call dword ptr "
33981 auto TmpStr = AsmStr.substr(0, I);
33982 I = TmpStr.rfind(':');
33983 if (I != StringRef::npos)
33984 TmpStr = TmpStr.substr(I + 1);
33985 return TmpStr.take_while(llvm::isAlpha);
33986 }
33987
33988 return StringRef();
33989}
33990
33991bool X86TargetLowering::isInlineAsmTargetBranch(
33992 const SmallVectorImpl<StringRef> &AsmStrs, unsigned OpNo) const {
33993 // In a __asm block, __asm inst foo where inst is CALL or JMP should be
33994 // changed from indirect TargetLowering::C_Memory to direct
33995 // TargetLowering::C_Address.
33996 // We don't need to special case LOOP* and Jcc, which cannot target a memory
33997 // location.
33998 StringRef Inst = getInstrStrFromOpNo(AsmStrs, OpNo);
33999 return Inst.equals_insensitive("call") || Inst.equals_insensitive("jmp");
34000}
34001
34002/// Provide custom lowering hooks for some operations.
34003SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
34004 switch (Op.getOpcode()) {
34005 default: llvm_unreachable("Should not custom lower this!");
34006 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
34007 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
34008 return LowerCMP_SWAP(Op, Subtarget, DAG);
34009 case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
34010 case ISD::ATOMIC_LOAD_ADD:
34011 case ISD::ATOMIC_LOAD_SUB:
34012 case ISD::ATOMIC_LOAD_OR:
34013 case ISD::ATOMIC_LOAD_XOR:
34014 case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
34015 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG, Subtarget);
34016 case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
34017 case ISD::PARITY: return LowerPARITY(Op, Subtarget, DAG);
34018 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
34019 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
34020 case ISD::VECTOR_SHUFFLE: return lowerVECTOR_SHUFFLE(Op, Subtarget, DAG);
34021 case ISD::VSELECT: return LowerVSELECT(Op, DAG);
34022 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
34023 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
34024 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
34025 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
34026 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
34027 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
34028 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
34029 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
34030 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
34031 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
34032 case ISD::SHL_PARTS:
34033 case ISD::SRA_PARTS:
34034 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
34035 case ISD::FSHL:
34036 case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG);
34037 case ISD::STRICT_SINT_TO_FP:
34038 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
34039 case ISD::STRICT_UINT_TO_FP:
34040 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
34041 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
34042 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
34043 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
34044 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
34045 case ISD::ZERO_EXTEND_VECTOR_INREG:
34046 case ISD::SIGN_EXTEND_VECTOR_INREG:
34047 return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
34048 case ISD::FP_TO_SINT:
34049 case ISD::STRICT_FP_TO_SINT:
34050 case ISD::FP_TO_UINT:
34051 case ISD::STRICT_FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
34052 case ISD::FP_TO_SINT_SAT:
34053 case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG);
34054 case ISD::FP_EXTEND:
34055 case ISD::STRICT_FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
34056 case ISD::FP_ROUND:
34057 case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG);
34058 case ISD::FP16_TO_FP:
34059 case ISD::STRICT_FP16_TO_FP: return LowerFP16_TO_FP(Op, DAG);
34060 case ISD::FP_TO_FP16:
34061 case ISD::STRICT_FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
34062 case ISD::FP_TO_BF16: return LowerFP_TO_BF16(Op, DAG);
34063 case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG);
34064 case ISD::STORE: return LowerStore(Op, Subtarget, DAG);
34065 case ISD::FADD:
34066 case ISD::FSUB: return lowerFaddFsub(Op, DAG);
34067 case ISD::FROUND: return LowerFROUND(Op, DAG);
34068 case ISD::FABS:
34069 case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
34070 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
34071 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
34072 case ISD::LRINT:
34073 case ISD::LLRINT: return LowerLRINT_LLRINT(Op, DAG);
34074 case ISD::SETCC:
34075 case ISD::STRICT_FSETCC:
34076 case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG);
34077 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
34078 case ISD::SELECT: return LowerSELECT(Op, DAG);
34079 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
34080 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
34081 case ISD::VASTART: return LowerVASTART(Op, DAG);
34082 case ISD::VAARG: return LowerVAARG(Op, DAG);
34083 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
34084 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
34085 case ISD::INTRINSIC_VOID:
34086 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
34087 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
34088 case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
34089 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
34090 case ISD::FRAME_TO_ARGS_OFFSET:
34091 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
34092 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
34093 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
34094 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
34095 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
34096 case ISD::EH_SJLJ_SETUP_DISPATCH:
34097 return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
34098 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
34099 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
34100 case ISD::GET_ROUNDING: return LowerGET_ROUNDING(Op, DAG);
34101 case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG);
34102 case ISD::CTLZ:
34103 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
34104 case ISD::CTTZ:
34105 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, Subtarget, DAG);
34106 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
34107 case ISD::MULHS:
34108 case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
34109 case ISD::ROTL:
34110 case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
34111 case ISD::SRA:
34112 case ISD::SRL:
34113 case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
34114 case ISD::SADDO:
34115 case ISD::UADDO:
34116 case ISD::SSUBO:
34117 case ISD::USUBO: return LowerXALUO(Op, DAG);
34118 case ISD::SMULO:
34119 case ISD::UMULO: return LowerMULO(Op, Subtarget, DAG);
34120 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
34121 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
34122 case ISD::SADDO_CARRY:
34123 case ISD::SSUBO_CARRY:
34124 case ISD::UADDO_CARRY:
34125 case ISD::USUBO_CARRY: return LowerADDSUBO_CARRY(Op, DAG);
34126 case ISD::ADD:
34127 case ISD::SUB: return lowerAddSub(Op, DAG, Subtarget);
34128 case ISD::UADDSAT:
34129 case ISD::SADDSAT:
34130 case ISD::USUBSAT:
34131 case ISD::SSUBSAT: return LowerADDSAT_SUBSAT(Op, DAG, Subtarget);
34132 case ISD::SMAX:
34133 case ISD::SMIN:
34134 case ISD::UMAX:
34135 case ISD::UMIN: return LowerMINMAX(Op, Subtarget, DAG);
34136 case ISD::FMINIMUM:
34137 case ISD::FMAXIMUM:
34138 return LowerFMINIMUM_FMAXIMUM(Op, Subtarget, DAG);
34139 case ISD::ABS: return LowerABS(Op, Subtarget, DAG);
34140 case ISD::ABDS:
34141 case ISD::ABDU: return LowerABD(Op, Subtarget, DAG);
34142 case ISD::AVGCEILU: return LowerAVG(Op, Subtarget, DAG);
34143 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
34144 case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
34145 case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
34146 case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
34147 case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
34148 case ISD::GC_TRANSITION_START:
34149 case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION(Op, DAG);
34150 case ISD::ADDRSPACECAST: return LowerADDRSPACECAST(Op, DAG);
34151 case X86ISD::CVTPS2PH: return LowerCVTPS2PH(Op, DAG);
34152 }
34153}
34154
34155/// Replace a node with an illegal result type with a new node built out of
34156/// custom code.
34157void X86TargetLowering::ReplaceNodeResults(SDNode *N,
34158 SmallVectorImpl<SDValue>&Results,
34159 SelectionDAG &DAG) const {
34160 SDLoc dl(N);
34161 switch (N->getOpcode()) {
34162 default:
34163#ifndef NDEBUG
34164 dbgs() << "ReplaceNodeResults: ";
34165 N->dump(&DAG);
34166#endif
34167 llvm_unreachable("Do not know how to custom type legalize this operation!")::llvm::llvm_unreachable_internal("Do not know how to custom type legalize this operation!"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 34167)
;
34168 case X86ISD::CVTPH2PS: {
34169 EVT VT = N->getValueType(0);
34170 SDValue Lo, Hi;
34171 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
34172 EVT LoVT, HiVT;
34173 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
34174 Lo = DAG.getNode(X86ISD::CVTPH2PS, dl, LoVT, Lo);
34175 Hi = DAG.getNode(X86ISD::CVTPH2PS, dl, HiVT, Hi);
34176 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
34177 Results.push_back(Res);
34178 return;
34179 }
34180 case X86ISD::STRICT_CVTPH2PS: {
34181 EVT VT = N->getValueType(0);
34182 SDValue Lo, Hi;
34183 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 1);
34184 EVT LoVT, HiVT;
34185 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
34186 Lo = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {LoVT, MVT::Other},
34187 {N->getOperand(0), Lo});
34188 Hi = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {HiVT, MVT::Other},
34189 {N->getOperand(0), Hi});
34190 SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
34191 Lo.getValue(1), Hi.getValue(1));
34192 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
34193 Results.push_back(Res);
34194 Results.push_back(Chain);
34195 return;
34196 }
34197 case X86ISD::CVTPS2PH:
34198 Results.push_back(LowerCVTPS2PH(SDValue(N, 0), DAG));
34199 return;
34200 case ISD::CTPOP: {
34201 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
34202 // Use a v2i64 if possible.
34203 bool NoImplicitFloatOps =
34204 DAG.getMachineFunction().getFunction().hasFnAttribute(
34205 Attribute::NoImplicitFloat);
34206 if (isTypeLegal(MVT::v2i64) && !NoImplicitFloatOps) {
34207 SDValue Wide =
34208 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0));
34209 Wide = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Wide);
34210 // Bit count should fit in 32-bits, extract it as that and then zero
34211 // extend to i64. Otherwise we end up extracting bits 63:32 separately.
34212 Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide);
34213 Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide,
34214 DAG.getIntPtrConstant(0, dl));
34215 Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide);
34216 Results.push_back(Wide);
34217 }
34218 return;
34219 }
34220 case ISD::MUL: {
34221 EVT VT = N->getValueType(0);
34222 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34223 VT.getVectorElementType() == MVT::i8 && "Unexpected VT!");
34224 // Pre-promote these to vXi16 to avoid op legalization thinking all 16
34225 // elements are needed.
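// For example, a v2i8 multiply is done as v2i16, truncated back to v2i8, and
// then widened to the legal v16i8 result with undef in the upper lanes.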
34226 MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
34227 SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0));
34228 SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1));
34229 SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1);
34230 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
34231 unsigned NumConcats = 16 / VT.getVectorNumElements();
34232 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
34233 ConcatOps[0] = Res;
34234 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps);
34235 Results.push_back(Res);
34236 return;
34237 }
34238 case ISD::SMULO:
34239 case ISD::UMULO: {
34240 EVT VT = N->getValueType(0);
34241 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34242 VT == MVT::v2i32 && "Unexpected VT!");
34243 bool IsSigned = N->getOpcode() == ISD::SMULO;
34244 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
34245 SDValue Op0 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(0));
34246 SDValue Op1 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(1));
34247 SDValue Res = DAG.getNode(ISD::MUL, dl, MVT::v2i64, Op0, Op1);
34248 // Extract the high 32 bits from each result using PSHUFD.
34249 // TODO: Could use SRL+TRUNCATE but that doesn't become a PSHUFD.
34250 SDValue Hi = DAG.getBitcast(MVT::v4i32, Res);
34251 Hi = DAG.getVectorShuffle(MVT::v4i32, dl, Hi, Hi, {1, 3, -1, -1});
34252 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Hi,
34253 DAG.getIntPtrConstant(0, dl));
34254
34255 // Truncate the low bits of the result. This will become PSHUFD.
34256 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
34257
34258 SDValue HiCmp;
34259 if (IsSigned) {
34260 // SMULO overflows if the high bits don't match the sign of the low.
34261 HiCmp = DAG.getNode(ISD::SRA, dl, VT, Res, DAG.getConstant(31, dl, VT));
34262 } else {
34263 // UMULO overflows if the high bits are non-zero.
34264 HiCmp = DAG.getConstant(0, dl, VT);
34265 }
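// Overflow check: for UMULO any non-zero high half means overflow; for SMULO
// the high half must equal the sign of the low half (Res >> 31), so the SETNE
// against HiCmp below detects both cases.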
34266 SDValue Ovf = DAG.getSetCC(dl, N->getValueType(1), Hi, HiCmp, ISD::SETNE);
34267
34268 // Widen the result by padding with undef.
34269 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
34270 DAG.getUNDEF(VT));
34271 Results.push_back(Res);
34272 Results.push_back(Ovf);
34273 return;
34274 }
34275 case X86ISD::VPMADDWD: {
34276 // Legalize types for X86ISD::VPMADDWD by widening.
34277 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
34278
34279 EVT VT = N->getValueType(0);
34280 EVT InVT = N->getOperand(0).getValueType();
34281 assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 &&
34282 "Expected a VT that divides into 128 bits.");
34283 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34284 "Unexpected type action!");
34285 unsigned NumConcat = 128 / InVT.getSizeInBits();
34286
34287 EVT InWideVT = EVT::getVectorVT(*DAG.getContext(),
34288 InVT.getVectorElementType(),
34289 NumConcat * InVT.getVectorNumElements());
34290 EVT WideVT = EVT::getVectorVT(*DAG.getContext(),
34291 VT.getVectorElementType(),
34292 NumConcat * VT.getVectorNumElements());
34293
34294 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
34295 Ops[0] = N->getOperand(0);
34296 SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
34297 Ops[0] = N->getOperand(1);
34298 SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
34299
34300 SDValue Res = DAG.getNode(N->getOpcode(), dl, WideVT, InVec0, InVec1);
34301 Results.push_back(Res);
34302 return;
34303 }
34304 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
34305 case X86ISD::FMINC:
34306 case X86ISD::FMIN:
34307 case X86ISD::FMAXC:
34308 case X86ISD::FMAX: {
34309 EVT VT = N->getValueType(0);
34310 assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
34311 SDValue UNDEF = DAG.getUNDEF(VT);
34312 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
34313 N->getOperand(0), UNDEF);
34314 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
34315 N->getOperand(1), UNDEF);
34316 Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
34317 return;
34318 }
34319 case ISD::SDIV:
34320 case ISD::UDIV:
34321 case ISD::SREM:
34322 case ISD::UREM: {
34323 EVT VT = N->getValueType(0);
34324 if (VT.isVector()) {
34325 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34326 "Unexpected type action!");
34327 // If this RHS is a constant splat vector we can widen this and let
34328 // division/remainder by constant optimize it.
34329 // TODO: Can we do something for non-splat?
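// For example, a v2i32 udiv by a constant splat is widened to v4i32 here so
// the generic divide-by-constant combine can still turn it into a multiply.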
34330 APInt SplatVal;
34331 if (ISD::isConstantSplatVector(N->getOperand(1).getNode(), SplatVal)) {
34332 unsigned NumConcats = 128 / VT.getSizeInBits();
34333 SmallVector<SDValue, 8> Ops0(NumConcats, DAG.getUNDEF(VT));
34334 Ops0[0] = N->getOperand(0);
34335 EVT ResVT = getTypeToTransformTo(*DAG.getContext(), VT);
34336 SDValue N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Ops0);
34337 SDValue N1 = DAG.getConstant(SplatVal, dl, ResVT);
34338 SDValue Res = DAG.getNode(N->getOpcode(), dl, ResVT, N0, N1);
34339 Results.push_back(Res);
34340 }
34341 return;
34342 }
34343
34344 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
34345 Results.push_back(V);
34346 return;
34347 }
34348 case ISD::TRUNCATE: {
34349 MVT VT = N->getSimpleValueType(0);
34350 if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
34351 return;
34352
34353 // The generic legalizer will try to widen the input type to the same
34354 // number of elements as the widened result type. But this isn't always
34355 // the best thing so do some custom legalization to avoid some cases.
34356 MVT WidenVT = getTypeToTransformTo(*DAG.getContext(), VT).getSimpleVT();
34357 SDValue In = N->getOperand(0);
34358 EVT InVT = In.getValueType();
34359
34360 unsigned InBits = InVT.getSizeInBits();
34361 if (128 % InBits == 0) {
34362 // Inputs of 128 bits or smaller should avoid the truncate altogether and
34363 // just use a build_vector that will become a shuffle.
34364 // TODO: Widen and use a shuffle directly?
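// For example, truncating v2i64 to v2i32 extracts each i64 element, truncates
// it to i32, and builds a v4i32 build_vector with the remaining lanes undef.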
34365 MVT InEltVT = InVT.getSimpleVT().getVectorElementType();
34366 EVT EltVT = VT.getVectorElementType();
34367 unsigned WidenNumElts = WidenVT.getVectorNumElements();
34368 SmallVector<SDValue, 16> Ops(WidenNumElts, DAG.getUNDEF(EltVT));
34369 // Use the original element count so we don't do more scalar opts than
34370 // necessary.
34371 unsigned MinElts = VT.getVectorNumElements();
34372 for (unsigned i=0; i < MinElts; ++i) {
34373 SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, In,
34374 DAG.getIntPtrConstant(i, dl));
34375 Ops[i] = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Val);
34376 }
34377 Results.push_back(DAG.getBuildVector(WidenVT, dl, Ops));
34378 return;
34379 }
34380 // With AVX512 there are some cases that can use a target specific
34381 // truncate node to go from 256/512 to less than 128 with zeros in the
34382 // upper elements of the 128 bit result.
34383 if (Subtarget.hasAVX512() && isTypeLegal(InVT)) {
34384 // We can use VTRUNC directly if for 256 bits with VLX or for any 512.
34385 if ((InBits == 256 && Subtarget.hasVLX()) || InBits == 512) {
34386 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
34387 return;
34388 }
34389 // There's one case we can widen to 512 bits and use VTRUNC.
34390 if (InVT == MVT::v4i64 && VT == MVT::v4i8 && isTypeLegal(MVT::v8i64)) {
34391 In = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i64, In,
34392 DAG.getUNDEF(MVT::v4i64));
34393 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
34394 return;
34395 }
34396 }
34397 if (Subtarget.hasVLX() && InVT == MVT::v8i64 && VT == MVT::v8i8 &&
34398 getTypeAction(*DAG.getContext(), InVT) == TypeSplitVector &&
34399 isTypeLegal(MVT::v4i64)) {
34400 // The input needs to be split and the output needs to be widened. Use two
34401 // VTRUNCs, and shuffle their results together into the wider type.
34402 SDValue Lo, Hi;
34403 std::tie(Lo, Hi) = DAG.SplitVector(In, dl);
34404
34405 Lo = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Lo);
34406 Hi = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Hi);
34407 SDValue Res = DAG.getVectorShuffle(MVT::v16i8, dl, Lo, Hi,
34408 { 0, 1, 2, 3, 16, 17, 18, 19,
34409 -1, -1, -1, -1, -1, -1, -1, -1 });
34410 Results.push_back(Res);
34411 return;
34412 }
34413
34414 return;
34415 }
34416 case ISD::ANY_EXTEND:
34417 // Right now, only MVT::v8i8 has Custom action for an illegal type.
34418 // It's intended to custom handle the input type.
34419 assert(N->getValueType(0) == MVT::v8i8 &&
34420 "Do not know how to legalize this Node");
34421 return;
34422 case ISD::SIGN_EXTEND:
34423 case ISD::ZERO_EXTEND: {
34424 EVT VT = N->getValueType(0);
34425 SDValue In = N->getOperand(0);
34426 EVT InVT = In.getValueType();
34427 if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
34428 (InVT == MVT::v4i16 || InVT == MVT::v4i8)){
34429 assert(getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector &&
34430 "Unexpected type action!");
34431 assert(N->getOpcode() == ISD::SIGN_EXTEND && "Unexpected opcode");
34432 // Custom split this so we can extend i8/i16->i32 invec. This is better
34433 // since sign_extend_inreg i8/i16->i64 requires an extend to i32 using
34434 // sra. Then extending from i32 to i64 using pcmpgt. By custom splitting
34435 // we allow the sra from the extend to i32 to be shared by the split.
34436 In = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, In);
34437
34438 // Fill a vector with sign bits for each element.
34439 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
34440 SDValue SignBits = DAG.getSetCC(dl, MVT::v4i32, Zero, In, ISD::SETGT);
34441
34442 // Create an unpackl and unpackh to interleave the sign bits then bitcast
34443 // to v2i64.
34444 SDValue Lo = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
34445 {0, 4, 1, 5});
34446 Lo = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Lo);
34447 SDValue Hi = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
34448 {2, 6, 3, 7});
34449 Hi = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Hi);
34450
34451 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
34452 Results.push_back(Res);
34453 return;
34454 }
34455
34456 if (VT == MVT::v16i32 || VT == MVT::v8i64) {
34457 if (!InVT.is128BitVector()) {
34458 // Not a 128 bit vector, but maybe type legalization will promote
34459 // it to 128 bits.
34460 if (getTypeAction(*DAG.getContext(), InVT) != TypePromoteInteger)
34461 return;
34462 InVT = getTypeToTransformTo(*DAG.getContext(), InVT);
34463 if (!InVT.is128BitVector())
34464 return;
34465
34466 // Promote the input to 128 bits. Type legalization will turn this into
34467 // zext_inreg/sext_inreg.
34468 In = DAG.getNode(N->getOpcode(), dl, InVT, In);
34469 }
34470
34471 // Perform custom splitting instead of the two stage extend we would get
34472 // by default.
34473 EVT LoVT, HiVT;
34474 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
34475 assert(isTypeLegal(LoVT) && "Split VT not legal?");
34476
34477 SDValue Lo = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, LoVT, In, DAG);
34478
34479 // We need to shift the input over by half the number of elements.
34480 unsigned NumElts = InVT.getVectorNumElements();
34481 unsigned HalfNumElts = NumElts / 2;
34482 SmallVector<int, 16> ShufMask(NumElts, SM_SentinelUndef);
34483 for (unsigned i = 0; i != HalfNumElts; ++i)
34484 ShufMask[i] = i + HalfNumElts;
34485
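// For example, for v16i8 -> v16i32 the low 8 bytes are extended in place for
// the low half, and the shuffle below moves bytes 8..15 down so the same
// *_EXTEND_VECTOR_INREG node can produce the high half.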
34486 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
34487 Hi = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, HiVT, Hi, DAG);
34488
34489 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
34490 Results.push_back(Res);
34491 }
34492 return;
34493 }
34494 case ISD::FP_TO_SINT:
34495 case ISD::STRICT_FP_TO_SINT:
34496 case ISD::FP_TO_UINT:
34497 case ISD::STRICT_FP_TO_UINT: {
34498 bool IsStrict = N->isStrictFPOpcode();
34499 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT ||
34500 N->getOpcode() == ISD::STRICT_FP_TO_SINT;
34501 EVT VT = N->getValueType(0);
34502 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34503 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
34504 EVT SrcVT = Src.getValueType();
34505
34506 SDValue Res;
34507 if (isSoftFP16(SrcVT)) {
34508 EVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
34509 if (IsStrict) {
34510 Res =
34511 DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
34512 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
34513 {NVT, MVT::Other}, {Chain, Src})});
34514 Chain = Res.getValue(1);
34515 } else {
34516 Res = DAG.getNode(N->getOpcode(), dl, VT,
34517 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
34518 }
34519 Results.push_back(Res);
34520 if (IsStrict)
34521 Results.push_back(Chain);
34522
34523 return;
34524 }
34525
34526 if (VT.isVector() && Subtarget.hasFP16() &&
34527 SrcVT.getVectorElementType() == MVT::f16) {
34528 EVT EleVT = VT.getVectorElementType();
34529 EVT ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
34530
34531 if (SrcVT != MVT::v8f16) {
34532 SDValue Tmp =
34533 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
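// Strict conversions must not raise FP exceptions on the padding lanes, so
// they are padded with 0.0 rather than undef.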
34534 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
34535 Ops[0] = Src;
34536 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
34537 }
34538
34539 if (IsStrict) {
34540 unsigned Opc =
34541 IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
34542 Res =
34543 DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {N->getOperand(0), Src});
34544 Chain = Res.getValue(1);
34545 } else {
34546 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
34547 Res = DAG.getNode(Opc, dl, ResVT, Src);
34548 }
34549
34550 // TODO: Need to add exception check code for strict FP.
34551 if (EleVT.getSizeInBits() < 16) {
34552 MVT TmpVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8);
34553 Res = DAG.getNode(ISD::TRUNCATE, dl, TmpVT, Res);
34554
34555 // Now widen to 128 bits.
34556 unsigned NumConcats = 128 / TmpVT.getSizeInBits();
34557 MVT ConcatVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8 * NumConcats);
34558 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(TmpVT));
34559 ConcatOps[0] = Res;
34560 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
34561 }
34562
34563 Results.push_back(Res);
34564 if (IsStrict)
34565 Results.push_back(Chain);
34566
34567 return;
34568 }
34569
34570 if (VT.isVector() && VT.getScalarSizeInBits() < 32) {
34571 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34572 "Unexpected type action!");
34573
34574 // Try to create a 128 bit vector, but don't exceed a 32 bit element.
34575 unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U);
34576 MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth),
34577 VT.getVectorNumElements());
34578 SDValue Res;
34579 SDValue Chain;
34580 if (IsStrict) {
34581 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {PromoteVT, MVT::Other},
34582 {N->getOperand(0), Src});
34583 Chain = Res.getValue(1);
34584 } else
34585 Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src);
34586
34587 // Preserve what we know about the size of the original result. If the
34588 // result is v2i32, we have to manually widen the assert.
34589 if (PromoteVT == MVT::v2i32)
34590 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
34591 DAG.getUNDEF(MVT::v2i32));
34592
34593 Res = DAG.getNode(!IsSigned ? ISD::AssertZext : ISD::AssertSext, dl,
34594 Res.getValueType(), Res,
34595 DAG.getValueType(VT.getVectorElementType()));
34596
34597 if (PromoteVT == MVT::v2i32)
34598 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
34599 DAG.getIntPtrConstant(0, dl));
34600
34601 // Truncate back to the original width.
34602 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
34603
34604 // Now widen to 128 bits.
34605 unsigned NumConcats = 128 / VT.getSizeInBits();
34606 MVT ConcatVT = MVT::getVectorVT(VT.getSimpleVT().getVectorElementType(),
34607 VT.getVectorNumElements() * NumConcats);
34608 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
34609 ConcatOps[0] = Res;
34610 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
34611 Results.push_back(Res);
34612 if (IsStrict)
34613 Results.push_back(Chain);
34614 return;
34615 }
34616
34617
34618 if (VT == MVT::v2i32) {
34619 assert((!IsStrict || IsSigned || Subtarget.hasAVX512()) &&
34620 "Strict unsigned conversion requires AVX512");
34621 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
34622 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34623 "Unexpected type action!");
34624 if (Src.getValueType() == MVT::v2f64) {
34625 if (!IsSigned && !Subtarget.hasAVX512()) {
34626 SDValue Res =
34627 expandFP_TO_UINT_SSE(MVT::v4i32, Src, dl, DAG, Subtarget);
34628 Results.push_back(Res);
34629 return;
34630 }
34631
34632 unsigned Opc;
34633 if (IsStrict)
34634 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
34635 else
34636 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
34637
34638        // If we have VLX we can emit a target specific FP_TO_UINT node.
34639 if (!IsSigned && !Subtarget.hasVLX()) {
34640 // Otherwise we can defer to the generic legalizer which will widen
34641 // the input as well. This will be further widened during op
34642 // legalization to v8i32<-v8f64.
34643 // For strict nodes we'll need to widen ourselves.
34644 // FIXME: Fix the type legalizer to safely widen strict nodes?
34645 if (!IsStrict)
34646 return;
34647 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64, Src,
34648 DAG.getConstantFP(0.0, dl, MVT::v2f64));
34649 Opc = N->getOpcode();
34650 }
34651 SDValue Res;
34652 SDValue Chain;
34653 if (IsStrict) {
34654 Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
34655 {N->getOperand(0), Src});
34656 Chain = Res.getValue(1);
34657 } else {
34658 Res = DAG.getNode(Opc, dl, MVT::v4i32, Src);
34659 }
34660 Results.push_back(Res);
34661 if (IsStrict)
34662 Results.push_back(Chain);
34663 return;
34664 }
34665
34666 // Custom widen strict v2f32->v2i32 by padding with zeros.
34667 // FIXME: Should generic type legalizer do this?
34668 if (Src.getValueType() == MVT::v2f32 && IsStrict) {
34669 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
34670 DAG.getConstantFP(0.0, dl, MVT::v2f32));
34671 SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4i32, MVT::Other},
34672 {N->getOperand(0), Src});
34673 Results.push_back(Res);
34674 Results.push_back(Res.getValue(1));
34675 return;
34676 }
34677
34678 // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
34679 // so early out here.
34680 return;
34681 }
34682
34683  assert(!VT.isVector() && "Vectors should have been handled above!");
34684
34685 if ((Subtarget.hasDQI() && VT == MVT::i64 &&
34686 (SrcVT == MVT::f32 || SrcVT == MVT::f64)) ||
34687 (Subtarget.hasFP16() && SrcVT == MVT::f16)) {
34688    assert(!Subtarget.is64Bit() && "i64 should be legal");
34689 unsigned NumElts = Subtarget.hasVLX() ? 2 : 8;
34690 // If we use a 128-bit result we might need to use a target specific node.
34691 unsigned SrcElts =
34692 std::max(NumElts, 128U / (unsigned)SrcVT.getSizeInBits());
34693 MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts);
34694 MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), SrcElts);
34695 unsigned Opc = N->getOpcode();
34696 if (NumElts != SrcElts) {
34697 if (IsStrict)
34698 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
34699 else
34700 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
34701 }
34702
34703 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
34704 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
34705 DAG.getConstantFP(0.0, dl, VecInVT), Src,
34706 ZeroIdx);
34707 SDValue Chain;
34708 if (IsStrict) {
34709 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
34710 Res = DAG.getNode(Opc, SDLoc(N), Tys, N->getOperand(0), Res);
34711 Chain = Res.getValue(1);
34712 } else
34713 Res = DAG.getNode(Opc, SDLoc(N), VecVT, Res);
34714 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx);
34715 Results.push_back(Res);
34716 if (IsStrict)
34717 Results.push_back(Chain);
34718 return;
34719 }
34720
34721 if (VT == MVT::i128 && Subtarget.isTargetWin64()) {
34722 SDValue Chain;
34723 SDValue V = LowerWin64_FP_TO_INT128(SDValue(N, 0), DAG, Chain);
34724 Results.push_back(V);
34725 if (IsStrict)
34726 Results.push_back(Chain);
34727 return;
34728 }
34729
34730 if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, Chain)) {
34731 Results.push_back(V);
34732 if (IsStrict)
34733 Results.push_back(Chain);
34734 }
34735 return;
34736 }
34737 case ISD::LRINT:
34738 case ISD::LLRINT: {
34739 if (SDValue V = LRINT_LLRINTHelper(N, DAG))
34740 Results.push_back(V);
34741 return;
34742 }
34743
34744 case ISD::SINT_TO_FP:
34745 case ISD::STRICT_SINT_TO_FP:
34746 case ISD::UINT_TO_FP:
34747 case ISD::STRICT_UINT_TO_FP: {
34748 bool IsStrict = N->isStrictFPOpcode();
34749 bool IsSigned = N->getOpcode() == ISD::SINT_TO_FP ||
34750 N->getOpcode() == ISD::STRICT_SINT_TO_FP;
34751 EVT VT = N->getValueType(0);
34752 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34753 if (VT.getVectorElementType() == MVT::f16 && Subtarget.hasFP16() &&
34754 Subtarget.hasVLX()) {
34755 if (Src.getValueType().getVectorElementType() == MVT::i16)
34756 return;
34757
34758 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2i32)
34759 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
34760 IsStrict ? DAG.getConstant(0, dl, MVT::v2i32)
34761 : DAG.getUNDEF(MVT::v2i32));
34762 if (IsStrict) {
34763 unsigned Opc =
34764 IsSigned ? X86ISD::STRICT_CVTSI2P : X86ISD::STRICT_CVTUI2P;
34765 SDValue Res = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
34766 {N->getOperand(0), Src});
34767 Results.push_back(Res);
34768 Results.push_back(Res.getValue(1));
34769 } else {
34770 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
34771 Results.push_back(DAG.getNode(Opc, dl, MVT::v8f16, Src));
34772 }
34773 return;
34774 }
34775 if (VT != MVT::v2f32)
34776 return;
34777 EVT SrcVT = Src.getValueType();
34778 if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
34779 if (IsStrict) {
34780 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTSI2P
34781 : X86ISD::STRICT_CVTUI2P;
34782 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
34783 {N->getOperand(0), Src});
34784 Results.push_back(Res);
34785 Results.push_back(Res.getValue(1));
34786 } else {
34787 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
34788 Results.push_back(DAG.getNode(Opc, dl, MVT::v4f32, Src));
34789 }
34790 return;
34791 }
34792 if (SrcVT == MVT::v2i64 && !IsSigned && Subtarget.is64Bit() &&
34793 Subtarget.hasSSE41() && !Subtarget.hasAVX512()) {
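      // Unsigned i64 -> f32 without AVX512: lanes with the sign bit set are
      // halved first (SRL by 1, with the dropped low bit ORed back in so the
      // final rounding stays correct), converted as signed, and then doubled
      // with an FADD; a vector select keeps the doubled result only for those
      // lanes.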
34794 SDValue Zero = DAG.getConstant(0, dl, SrcVT);
34795 SDValue One = DAG.getConstant(1, dl, SrcVT);
34796 SDValue Sign = DAG.getNode(ISD::OR, dl, SrcVT,
34797 DAG.getNode(ISD::SRL, dl, SrcVT, Src, One),
34798 DAG.getNode(ISD::AND, dl, SrcVT, Src, One));
34799 SDValue IsNeg = DAG.getSetCC(dl, MVT::v2i64, Src, Zero, ISD::SETLT);
34800 SDValue SignSrc = DAG.getSelect(dl, SrcVT, IsNeg, Sign, Src);
34801 SmallVector<SDValue, 4> SignCvts(4, DAG.getConstantFP(0.0, dl, MVT::f32));
34802 for (int i = 0; i != 2; ++i) {
34803 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
34804 SignSrc, DAG.getIntPtrConstant(i, dl));
34805 if (IsStrict)
34806 SignCvts[i] =
34807 DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {MVT::f32, MVT::Other},
34808 {N->getOperand(0), Elt});
34809 else
34810 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Elt);
34811 };
34812 SDValue SignCvt = DAG.getBuildVector(MVT::v4f32, dl, SignCvts);
34813 SDValue Slow, Chain;
34814 if (IsStrict) {
34815 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
34816 SignCvts[0].getValue(1), SignCvts[1].getValue(1));
34817 Slow = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v4f32, MVT::Other},
34818 {Chain, SignCvt, SignCvt});
34819 Chain = Slow.getValue(1);
34820 } else {
34821 Slow = DAG.getNode(ISD::FADD, dl, MVT::v4f32, SignCvt, SignCvt);
34822 }
34823 IsNeg = DAG.getBitcast(MVT::v4i32, IsNeg);
34824 IsNeg =
34825 DAG.getVectorShuffle(MVT::v4i32, dl, IsNeg, IsNeg, {1, 3, -1, -1});
34826 SDValue Cvt = DAG.getSelect(dl, MVT::v4f32, IsNeg, Slow, SignCvt);
34827 Results.push_back(Cvt);
34828 if (IsStrict)
34829 Results.push_back(Chain);
34830 return;
34831 }
34832
34833 if (SrcVT != MVT::v2i32)
34834 return;
34835
34836 if (IsSigned || Subtarget.hasAVX512()) {
34837 if (!IsStrict)
34838 return;
34839
34840 // Custom widen strict v2i32->v2f32 to avoid scalarization.
34841 // FIXME: Should generic type legalizer do this?
34842 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
34843 DAG.getConstant(0, dl, MVT::v2i32));
34844 SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
34845 {N->getOperand(0), Src});
34846 Results.push_back(Res);
34847 Results.push_back(Res.getValue(1));
34848 return;
34849 }
34850
34851    assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
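    // uint32 -> f64 via the 2^52 bias trick: OR the zero-extended value into
    // the mantissa of the double constant 0x4330000000000000 (2^52), subtract
    // that constant back out, then round the exact f64 result down to f32.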
34852 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
34853 SDValue VBias = DAG.getConstantFP(
34854 llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::v2f64);
34855 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
34856 DAG.getBitcast(MVT::v2i64, VBias));
34857 Or = DAG.getBitcast(MVT::v2f64, Or);
34858 if (IsStrict) {
34859 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},
34860 {N->getOperand(0), Or, VBias});
34861 SDValue Res = DAG.getNode(X86ISD::STRICT_VFPROUND, dl,
34862 {MVT::v4f32, MVT::Other},
34863 {Sub.getValue(1), Sub});
34864 Results.push_back(Res);
34865 Results.push_back(Res.getValue(1));
34866 } else {
34867 // TODO: Are there any fast-math-flags to propagate here?
34868 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
34869 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
34870 }
34871 return;
34872 }
34873 case ISD::STRICT_FP_ROUND:
34874 case ISD::FP_ROUND: {
34875 bool IsStrict = N->isStrictFPOpcode();
34876 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
34877 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34878 SDValue Rnd = N->getOperand(IsStrict ? 2 : 1);
34879 EVT SrcVT = Src.getValueType();
34880 EVT VT = N->getValueType(0);
34881 SDValue V;
34882 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2f32) {
34883 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f32)
34884 : DAG.getUNDEF(MVT::v2f32);
34885 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, Ext);
34886 }
34887 if (!Subtarget.hasFP16() && VT.getVectorElementType() == MVT::f16) {
34888      assert(Subtarget.hasF16C() && "Cannot widen f16 without F16C");
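      // With F16C but without AVX512-FP16, the f32 -> f16 round is emitted as
      // CVTPS2PH, which produces an integer v8i16 that is bitcast back to v8f16.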
34889 if (SrcVT.getVectorElementType() != MVT::f32)
34890 return;
34891
34892 if (IsStrict)
34893 V = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
34894 {Chain, Src, Rnd});
34895 else
34896 V = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Src, Rnd);
34897
34898 Results.push_back(DAG.getBitcast(MVT::v8f16, V));
34899 if (IsStrict)
34900 Results.push_back(V.getValue(1));
34901 return;
34902 }
34903 if (!isTypeLegal(Src.getValueType()))
34904 return;
34905 EVT NewVT = VT.getVectorElementType() == MVT::f16 ? MVT::v8f16 : MVT::v4f32;
34906 if (IsStrict)
34907 V = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {NewVT, MVT::Other},
34908 {Chain, Src});
34909 else
34910 V = DAG.getNode(X86ISD::VFPROUND, dl, NewVT, Src);
34911 Results.push_back(V);
34912 if (IsStrict)
34913 Results.push_back(V.getValue(1));
34914 return;
34915 }
34916 case ISD::FP_EXTEND:
34917 case ISD::STRICT_FP_EXTEND: {
34918 // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
34919 // No other ValueType for FP_EXTEND should reach this point.
34920    assert(N->getValueType(0) == MVT::v2f32 &&
34921           "Do not know how to legalize this Node");
34922 if (!Subtarget.hasFP16() || !Subtarget.hasVLX())
34923 return;
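    // Widen the v2f16 source to v4f16 (zero padded for strict ops, undef
    // otherwise) so the extend can be emitted as a v4f16 -> v4f32 node.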
34924 bool IsStrict = N->isStrictFPOpcode();
34925 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34926 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f16)
34927 : DAG.getUNDEF(MVT::v2f16);
34928 SDValue V = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f16, Src, Ext);
34929 if (IsStrict)
34930 V = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::v4f32, MVT::Other},
34931 {N->getOperand(0), V});
34932 else
34933 V = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, V);
34934 Results.push_back(V);
34935 if (IsStrict)
34936 Results.push_back(V.getValue(1));
34937 return;
34938 }
34939 case ISD::INTRINSIC_W_CHAIN: {
34940 unsigned IntNo = N->getConstantOperandVal(1);
34941 switch (IntNo) {
34942    default : llvm_unreachable("Do not know how to custom type "
34943                               "legalize this intrinsic operation!");
34944 case Intrinsic::x86_rdtsc:
34945 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget,
34946 Results);
34947 case Intrinsic::x86_rdtscp:
34948 return getReadTimeStampCounter(N, dl, X86::RDTSCP, DAG, Subtarget,
34949 Results);
34950 case Intrinsic::x86_rdpmc:
34951 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPMC, X86::ECX, Subtarget,
34952 Results);
34953 return;
34954 case Intrinsic::x86_rdpru:
34955 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPRU, X86::ECX, Subtarget,
34956 Results);
34957 return;
34958 case Intrinsic::x86_xgetbv:
34959 expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget,
34960 Results);
34961 return;
34962 }
34963 }
34964 case ISD::READCYCLECOUNTER: {
34965 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results);
34966 }
34967 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
34968 EVT T = N->getValueType(0);
34969    assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
34970 bool Regs64bit = T == MVT::i128;
34971    assert((!Regs64bit || Subtarget.canUseCMPXCHG16B()) &&
34972           "64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B");
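    // CMPXCHG8B/CMPXCHG16B expects the value to compare in EDX:EAX (RDX:RAX)
    // and the replacement value in ECX:EBX (RCX:RBX); ZF reports success.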
34973 MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
34974 SDValue cpInL, cpInH;
34975 std::tie(cpInL, cpInH) =
34976 DAG.SplitScalar(N->getOperand(2), dl, HalfT, HalfT);
34977 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
34978 Regs64bit ? X86::RAX : X86::EAX, cpInL, SDValue());
34979 cpInH =
34980 DAG.getCopyToReg(cpInL.getValue(0), dl, Regs64bit ? X86::RDX : X86::EDX,
34981 cpInH, cpInL.getValue(1));
34982 SDValue swapInL, swapInH;
34983 std::tie(swapInL, swapInH) =
34984 DAG.SplitScalar(N->getOperand(3), dl, HalfT, HalfT);
34985 swapInH =
34986 DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
34987 swapInH, cpInH.getValue(1));
34988
34989 // In 64-bit mode we might need the base pointer in RBX, but we can't know
34990 // until later. So we keep the RBX input in a vreg and use a custom
34991 // inserter.
34992          // Since RBX will be a reserved register, the register allocator will
34993          // not make sure its value is properly saved and restored around this
34994          // live-range.
34995 SDValue Result;
34996 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
34997 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
34998 if (Regs64bit) {
34999 SDValue Ops[] = {swapInH.getValue(0), N->getOperand(1), swapInL,
35000 swapInH.getValue(1)};
35001 Result =
35002 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG16_DAG, dl, Tys, Ops, T, MMO);
35003 } else {
35004 swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl, X86::EBX, swapInL,
35005 swapInH.getValue(1));
35006 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
35007 swapInL.getValue(1)};
35008 Result =
35009 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, T, MMO);
35010 }
35011
35012 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
35013 Regs64bit ? X86::RAX : X86::EAX,
35014 HalfT, Result.getValue(1));
35015 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
35016 Regs64bit ? X86::RDX : X86::EDX,
35017 HalfT, cpOutL.getValue(2));
35018 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
35019
35020 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
35021 MVT::i32, cpOutH.getValue(2));
35022 SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
35023 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
35024
35025 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
35026 Results.push_back(Success);
35027 Results.push_back(EFLAGS.getValue(1));
35028 return;
35029 }
35030 case ISD::ATOMIC_LOAD: {
35031    assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
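    // i64 is only illegal here on 32-bit targets, where type legalization would
    // otherwise split the load into two 32-bit loads and lose atomicity; instead
    // use a single 8-byte SSE or X87 load and extract the integer from it.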
35032 bool NoImplicitFloatOps =
35033 DAG.getMachineFunction().getFunction().hasFnAttribute(
35034 Attribute::NoImplicitFloat);
35035 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
35036 auto *Node = cast<AtomicSDNode>(N);
35037 if (Subtarget.hasSSE1()) {
35038 // Use a VZEXT_LOAD which will be selected as MOVQ or XORPS+MOVLPS.
35039 // Then extract the lower 64-bits.
35040 MVT LdVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
35041 SDVTList Tys = DAG.getVTList(LdVT, MVT::Other);
35042 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
35043 SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
35044 MVT::i64, Node->getMemOperand());
35045 if (Subtarget.hasSSE2()) {
35046 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
35047 DAG.getIntPtrConstant(0, dl));
35048 Results.push_back(Res);
35049 Results.push_back(Ld.getValue(1));
35050 return;
35051 }
35052 // We use an alternative sequence for SSE1 that extracts as v2f32 and
35053 // then casts to i64. This avoids a 128-bit stack temporary being
35054 // created by type legalization if we were to cast v4f32->v2i64.
35055 SDValue Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Ld,
35056 DAG.getIntPtrConstant(0, dl));
35057 Res = DAG.getBitcast(MVT::i64, Res);
35058 Results.push_back(Res);
35059 Results.push_back(Ld.getValue(1));
35060 return;
35061 }
35062 if (Subtarget.hasX87()) {
35063 // First load this into an 80-bit X87 register. This will put the whole
35064 // integer into the significand.
35065 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
35066 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
35067 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD,
35068 dl, Tys, Ops, MVT::i64,
35069 Node->getMemOperand());
35070 SDValue Chain = Result.getValue(1);
35071
35072 // Now store the X87 register to a stack temporary and convert to i64.
35073 // This store is not atomic and doesn't need to be.
35074 // FIXME: We don't need a stack temporary if the result of the load
35075 // is already being stored. We could just directly store there.
35076 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
35077 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
35078 MachinePointerInfo MPI =
35079 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
35080 SDValue StoreOps[] = { Chain, Result, StackPtr };
35081 Chain = DAG.getMemIntrinsicNode(
35082 X86ISD::FIST, dl, DAG.getVTList(MVT::Other), StoreOps, MVT::i64,
35083 MPI, std::nullopt /*Align*/, MachineMemOperand::MOStore);
35084
35085 // Finally load the value back from the stack temporary and return it.
35086 // This load is not atomic and doesn't need to be.
35087 // This load will be further type legalized.
35088 Result = DAG.getLoad(MVT::i64, dl, Chain, StackPtr, MPI);
35089 Results.push_back(Result);
35090 Results.push_back(Result.getValue(1));
35091 return;
35092 }
35093 }
35094 // TODO: Use MOVLPS when SSE1 is available?
35095 // Delegate to generic TypeLegalization. Situations we can really handle
35096 // should have already been dealt with by AtomicExpandPass.cpp.
35097 break;
35098 }
35099 case ISD::ATOMIC_SWAP:
35100 case ISD::ATOMIC_LOAD_ADD:
35101 case ISD::ATOMIC_LOAD_SUB:
35102 case ISD::ATOMIC_LOAD_AND:
35103 case ISD::ATOMIC_LOAD_OR:
35104 case ISD::ATOMIC_LOAD_XOR:
35105 case ISD::ATOMIC_LOAD_NAND:
35106 case ISD::ATOMIC_LOAD_MIN:
35107 case ISD::ATOMIC_LOAD_MAX:
35108 case ISD::ATOMIC_LOAD_UMIN:
35109 case ISD::ATOMIC_LOAD_UMAX:
35110 // Delegate to generic TypeLegalization. Situations we can really handle
35111 // should have already been dealt with by AtomicExpandPass.cpp.
35112 break;
35113
35114 case ISD::BITCAST: {
35115    assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
35116 EVT DstVT = N->getValueType(0);
35117 EVT SrcVT = N->getOperand(0).getValueType();
35118
35119 // If this is a bitcast from a v64i1 k-register to a i64 on a 32-bit target
35120 // we can split using the k-register rather than memory.
35121 if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) {
35122      assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
35123 SDValue Lo, Hi;
35124 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
35125 Lo = DAG.getBitcast(MVT::i32, Lo);
35126 Hi = DAG.getBitcast(MVT::i32, Hi);
35127 SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
35128 Results.push_back(Res);
35129 return;
35130 }
35131
35132 if (DstVT.isVector() && SrcVT == MVT::x86mmx) {
35133 // FIXME: Use v4f32 for SSE1?
35134      assert(Subtarget.hasSSE2() && "Requires SSE2");
35135      assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector &&
35136             "Unexpected type action!");
35137 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT);
35138 SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64,
35139 N->getOperand(0));
35140 Res = DAG.getBitcast(WideVT, Res);
35141 Results.push_back(Res);
35142 return;
35143 }
35144
35145 return;
35146 }
35147 case ISD::MGATHER: {
35148 EVT VT = N->getValueType(0);
35149 if ((VT == MVT::v2f32 || VT == MVT::v2i32) &&
35150 (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
35151 auto *Gather = cast<MaskedGatherSDNode>(N);
35152 SDValue Index = Gather->getIndex();
35153 if (Index.getValueType() != MVT::v2i64)
35154 return;
35155      assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
35156             "Unexpected type action!");
35157 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
35158 SDValue Mask = Gather->getMask();
35159      assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
35160 SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT,
35161 Gather->getPassThru(),
35162 DAG.getUNDEF(VT));
35163 if (!Subtarget.hasVLX()) {
35164 // We need to widen the mask, but the instruction will only use 2
35165 // of its elements. So we can use undef.
35166 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
35167 DAG.getUNDEF(MVT::v2i1));
35168 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
35169 }
35170 SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
35171 Gather->getBasePtr(), Index, Gather->getScale() };
35172 SDValue Res = DAG.getMemIntrinsicNode(
35173 X86ISD::MGATHER, dl, DAG.getVTList(WideVT, MVT::Other), Ops,
35174 Gather->getMemoryVT(), Gather->getMemOperand());
35175 Results.push_back(Res);
35176 Results.push_back(Res.getValue(1));
35177 return;
35178 }
35179 return;
35180 }
35181 case ISD::LOAD: {
35182 // Use an f64/i64 load and a scalar_to_vector for v2f32/v2i32 loads. This
35183    // avoids scalarizing in 32-bit mode. In 64-bit mode this avoids an int->fp
35184 // cast since type legalization will try to use an i64 load.
35185 MVT VT = N->getSimpleValueType(0);
35186    assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT");
35187    assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
35188           "Unexpected type action!");
35189 if (!ISD::isNON_EXTLoad(N))
35190 return;
35191 auto *Ld = cast<LoadSDNode>(N);
35192 if (Subtarget.hasSSE2()) {
35193 MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
35194 SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
35195 Ld->getPointerInfo(), Ld->getOriginalAlign(),
35196 Ld->getMemOperand()->getFlags());
35197 SDValue Chain = Res.getValue(1);
35198 MVT VecVT = MVT::getVectorVT(LdVT, 2);
35199 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Res);
35200 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
35201 Res = DAG.getBitcast(WideVT, Res);
35202 Results.push_back(Res);
35203 Results.push_back(Chain);
35204 return;
35205 }
35206    assert(Subtarget.hasSSE1() && "Expected SSE");
35207 SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other);
35208 SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
35209 SDValue Res = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
35210 MVT::i64, Ld->getMemOperand());
35211 Results.push_back(Res);
35212 Results.push_back(Res.getValue(1));
35213 return;
35214 }
35215 case ISD::ADDRSPACECAST: {
35216 SDValue V = LowerADDRSPACECAST(SDValue(N,0), DAG);
35217 Results.push_back(V);
35218 return;
35219 }
35220 case ISD::BITREVERSE: {
35221    assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
35222    assert(Subtarget.hasXOP() && "Expected XOP");
35223 // We can use VPPERM by copying to a vector register and back. We'll need
35224 // to move the scalar in two i32 pieces.
35225 Results.push_back(LowerBITREVERSE(SDValue(N, 0), Subtarget, DAG));
35226 return;
35227 }
35228 case ISD::EXTRACT_VECTOR_ELT: {
35229 // f16 = extract vXf16 %vec, i64 %idx
35230    assert(N->getSimpleValueType(0) == MVT::f16 &&
35231           "Unexpected Value type of EXTRACT_VECTOR_ELT!");
35232    assert(Subtarget.hasFP16() && "Expected FP16");
35233 SDValue VecOp = N->getOperand(0);
35234 EVT ExtVT = VecOp.getValueType().changeVectorElementTypeToInteger();
35235 SDValue Split = DAG.getBitcast(ExtVT, N->getOperand(0));
35236 Split = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Split,
35237 N->getOperand(1));
35238 Split = DAG.getBitcast(MVT::f16, Split);
35239 Results.push_back(Split);
35240 return;
35241 }
35242 }
35243}
35244
35245const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
35246 switch ((X86ISD::NodeType)Opcode) {
35247 case X86ISD::FIRST_NUMBER: break;
35248#define NODE_NAME_CASE(NODE) case X86ISD::NODE: return "X86ISD::" #NODE;
35249 NODE_NAME_CASE(BSF)
35250 NODE_NAME_CASE(BSR)
35251 NODE_NAME_CASE(FSHL)
35252 NODE_NAME_CASE(FSHR)
35253 NODE_NAME_CASE(FAND)
35254 NODE_NAME_CASE(FANDN)
35255 NODE_NAME_CASE(FOR)
35256 NODE_NAME_CASE(FXOR)
35257 NODE_NAME_CASE(FILD)
35258 NODE_NAME_CASE(FIST)
35259 NODE_NAME_CASE(FP_TO_INT_IN_MEM)
35260 NODE_NAME_CASE(FLD)
35261 NODE_NAME_CASE(FST)
35262 NODE_NAME_CASE(CALL)
35263 NODE_NAME_CASE(CALL_RVMARKER)
35264 NODE_NAME_CASE(BT)
35265 NODE_NAME_CASE(CMP)
35266 NODE_NAME_CASE(FCMP)
35267 NODE_NAME_CASE(STRICT_FCMP)
35268 NODE_NAME_CASE(STRICT_FCMPS)
35269 NODE_NAME_CASE(COMI)
35270 NODE_NAME_CASE(UCOMI)
35271 NODE_NAME_CASE(CMPM)
35272 NODE_NAME_CASE(CMPMM)
35273 NODE_NAME_CASE(STRICT_CMPM)
35274 NODE_NAME_CASE(CMPMM_SAE)
35275 NODE_NAME_CASE(SETCC)
35276 NODE_NAME_CASE(SETCC_CARRY)
35277 NODE_NAME_CASE(FSETCC)
35278 NODE_NAME_CASE(FSETCCM)
35279 NODE_NAME_CASE(FSETCCM_SAE)
35280 NODE_NAME_CASE(CMOV)
35281 NODE_NAME_CASE(BRCOND)
35282 NODE_NAME_CASE(RET_GLUE)
35283 NODE_NAME_CASE(IRET)
35284 NODE_NAME_CASE(REP_STOS)
35285 NODE_NAME_CASE(REP_MOVS)
35286 NODE_NAME_CASE(GlobalBaseReg)
35287 NODE_NAME_CASE(Wrapper)
35288 NODE_NAME_CASE(WrapperRIP)
35289 NODE_NAME_CASE(MOVQ2DQ)
35290 NODE_NAME_CASE(MOVDQ2Q)
35291 NODE_NAME_CASE(MMX_MOVD2W)
35292 NODE_NAME_CASE(MMX_MOVW2D)
35293 NODE_NAME_CASE(PEXTRB)
35294 NODE_NAME_CASE(PEXTRW)
35295 NODE_NAME_CASE(INSERTPS)
35296 NODE_NAME_CASE(PINSRB)
35297 NODE_NAME_CASE(PINSRW)
35298 NODE_NAME_CASE(PSHUFB)
35299 NODE_NAME_CASE(ANDNP)
35300 NODE_NAME_CASE(BLENDI)
35301 NODE_NAME_CASE(BLENDV)
35302 NODE_NAME_CASE(HADD)
35303 NODE_NAME_CASE(HSUB)
35304 NODE_NAME_CASE(FHADD)
35305 NODE_NAME_CASE(FHSUB)
35306 NODE_NAME_CASE(CONFLICT)
35307 NODE_NAME_CASE(FMAX)
35308 NODE_NAME_CASE(FMAXS)
35309 NODE_NAME_CASE(FMAX_SAE)
35310 NODE_NAME_CASE(FMAXS_SAE)
35311 NODE_NAME_CASE(FMIN)
35312 NODE_NAME_CASE(FMINS)
35313 NODE_NAME_CASE(FMIN_SAE)
35314 NODE_NAME_CASE(FMINS_SAE)
35315 NODE_NAME_CASE(FMAXC)
35316 NODE_NAME_CASE(FMINC)
35317 NODE_NAME_CASE(FRSQRT)
35318 NODE_NAME_CASE(FRCP)
35319 NODE_NAME_CASE(EXTRQI)
35320 NODE_NAME_CASE(INSERTQI)
35321 NODE_NAME_CASE(TLSADDR)
35322 NODE_NAME_CASE(TLSBASEADDR)
35323 NODE_NAME_CASE(TLSCALL)
35324 NODE_NAME_CASE(EH_SJLJ_SETJMP)
35325 NODE_NAME_CASE(EH_SJLJ_LONGJMP)
35326 NODE_NAME_CASE(EH_SJLJ_SETUP_DISPATCH)
35327 NODE_NAME_CASE(EH_RETURN)
35328 NODE_NAME_CASE(TC_RETURN)
35329 NODE_NAME_CASE(FNSTCW16m)
35330 NODE_NAME_CASE(FLDCW16m)
35331 NODE_NAME_CASE(LCMPXCHG_DAG)
35332 NODE_NAME_CASE(LCMPXCHG8_DAG)
35333 NODE_NAME_CASE(LCMPXCHG16_DAG)
35334 NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG)
35335 NODE_NAME_CASE(LADD)
35336 NODE_NAME_CASE(LSUB)
35337 NODE_NAME_CASE(LOR)
35338 NODE_NAME_CASE(LXOR)
35339 NODE_NAME_CASE(LAND)
35340 NODE_NAME_CASE(LBTS)
35341 NODE_NAME_CASE(LBTC)
35342 NODE_NAME_CASE(LBTR)
35343 NODE_NAME_CASE(LBTS_RM)
35344 NODE_NAME_CASE(LBTC_RM)
35345 NODE_NAME_CASE(LBTR_RM)
35346 NODE_NAME_CASE(AADD)
35347 NODE_NAME_CASE(AOR)
35348 NODE_NAME_CASE(AXOR)
35349 NODE_NAME_CASE(AAND)
35350 NODE_NAME_CASE(VZEXT_MOVL)
35351 NODE_NAME_CASE(VZEXT_LOAD)
35352 NODE_NAME_CASE(VEXTRACT_STORE)
35353 NODE_NAME_CASE(VTRUNC)
35354 NODE_NAME_CASE(VTRUNCS)
35355 NODE_NAME_CASE(VTRUNCUS)
35356 NODE_NAME_CASE(VMTRUNC)
35357 NODE_NAME_CASE(VMTRUNCS)
35358 NODE_NAME_CASE(VMTRUNCUS)
35359 NODE_NAME_CASE(VTRUNCSTORES)
35360 NODE_NAME_CASE(VTRUNCSTOREUS)
35361 NODE_NAME_CASE(VMTRUNCSTORES)
35362 NODE_NAME_CASE(VMTRUNCSTOREUS)
35363 NODE_NAME_CASE(VFPEXT)
35364 NODE_NAME_CASE(STRICT_VFPEXT)
35365 NODE_NAME_CASE(VFPEXT_SAE)
35366 NODE_NAME_CASE(VFPEXTS)
35367 NODE_NAME_CASE(VFPEXTS_SAE)
35368 NODE_NAME_CASE(VFPROUND)
35369 NODE_NAME_CASE(STRICT_VFPROUND)
35370 NODE_NAME_CASE(VMFPROUND)
35371 NODE_NAME_CASE(VFPROUND_RND)
35372 NODE_NAME_CASE(VFPROUNDS)
35373 NODE_NAME_CASE(VFPROUNDS_RND)
35374 NODE_NAME_CASE(VSHLDQ)
35375 NODE_NAME_CASE(VSRLDQ)
35376 NODE_NAME_CASE(VSHL)
35377 NODE_NAME_CASE(VSRL)
35378 NODE_NAME_CASE(VSRA)
35379 NODE_NAME_CASE(VSHLI)
35380 NODE_NAME_CASE(VSRLI)
35381 NODE_NAME_CASE(VSRAI)
35382 NODE_NAME_CASE(VSHLV)
35383 NODE_NAME_CASE(VSRLV)
35384 NODE_NAME_CASE(VSRAV)
35385 NODE_NAME_CASE(VROTLI)
35386 NODE_NAME_CASE(VROTRI)
35387 NODE_NAME_CASE(VPPERM)
35388 NODE_NAME_CASE(CMPP)
35389 NODE_NAME_CASE(STRICT_CMPP)
35390 NODE_NAME_CASE(PCMPEQ)
35391 NODE_NAME_CASE(PCMPGT)
35392 NODE_NAME_CASE(PHMINPOS)
35393 NODE_NAME_CASE(ADD)
35394 NODE_NAME_CASE(SUB)
35395 NODE_NAME_CASE(ADC)
35396 NODE_NAME_CASE(SBB)
35397 NODE_NAME_CASE(SMUL)
35398 NODE_NAME_CASE(UMUL)
35399 NODE_NAME_CASE(OR)
35400 NODE_NAME_CASE(XOR)
35401 NODE_NAME_CASE(AND)
35402 NODE_NAME_CASE(BEXTR)
35403 NODE_NAME_CASE(BEXTRI)
35404 NODE_NAME_CASE(BZHI)
35405 NODE_NAME_CASE(PDEP)
35406 NODE_NAME_CASE(PEXT)
35407 NODE_NAME_CASE(MUL_IMM)
35408 NODE_NAME_CASE(MOVMSK)
35409 NODE_NAME_CASE(PTEST)
35410 NODE_NAME_CASE(TESTP)
35411 NODE_NAME_CASE(KORTEST)
35412 NODE_NAME_CASE(KTEST)
35413 NODE_NAME_CASE(KADD)
35414 NODE_NAME_CASE(KSHIFTL)
35415 NODE_NAME_CASE(KSHIFTR)
35416 NODE_NAME_CASE(PACKSS)
35417 NODE_NAME_CASE(PACKUS)
35418 NODE_NAME_CASE(PALIGNR)
35419 NODE_NAME_CASE(VALIGN)
35420 NODE_NAME_CASE(VSHLD)
35421 NODE_NAME_CASE(VSHRD)
35422 NODE_NAME_CASE(VSHLDV)
35423 NODE_NAME_CASE(VSHRDV)
35424 NODE_NAME_CASE(PSHUFD)
35425 NODE_NAME_CASE(PSHUFHW)
35426 NODE_NAME_CASE(PSHUFLW)
35427 NODE_NAME_CASE(SHUFP)
35428 NODE_NAME_CASE(SHUF128)
35429 NODE_NAME_CASE(MOVLHPS)
35430 NODE_NAME_CASE(MOVHLPS)
35431 NODE_NAME_CASE(MOVDDUP)
35432 NODE_NAME_CASE(MOVSHDUP)
35433 NODE_NAME_CASE(MOVSLDUP)
35434 NODE_NAME_CASE(MOVSD)
35435 NODE_NAME_CASE(MOVSS)
35436 NODE_NAME_CASE(MOVSH)
35437 NODE_NAME_CASE(UNPCKL)
35438 NODE_NAME_CASE(UNPCKH)
35439 NODE_NAME_CASE(VBROADCAST)
35440 NODE_NAME_CASE(VBROADCAST_LOAD)
35441 NODE_NAME_CASE(VBROADCASTM)
35442 NODE_NAME_CASE(SUBV_BROADCAST_LOAD)
35443 NODE_NAME_CASE(VPERMILPV)
35444 NODE_NAME_CASE(VPERMILPI)
35445 NODE_NAME_CASE(VPERM2X128)
35446 NODE_NAME_CASE(VPERMV)
35447 NODE_NAME_CASE(VPERMV3)
35448 NODE_NAME_CASE(VPERMI)
35449 NODE_NAME_CASE(VPTERNLOG)
35450 NODE_NAME_CASE(VFIXUPIMM)
35451 NODE_NAME_CASE(VFIXUPIMM_SAE)
35452 NODE_NAME_CASE(VFIXUPIMMS)
35453 NODE_NAME_CASE(VFIXUPIMMS_SAE)
35454 NODE_NAME_CASE(VRANGE)
35455 NODE_NAME_CASE(VRANGE_SAE)
35456 NODE_NAME_CASE(VRANGES)
35457 NODE_NAME_CASE(VRANGES_SAE)
35458 NODE_NAME_CASE(PMULUDQ)
35459 NODE_NAME_CASE(PMULDQ)
35460 NODE_NAME_CASE(PSADBW)
35461 NODE_NAME_CASE(DBPSADBW)
35462 NODE_NAME_CASE(VASTART_SAVE_XMM_REGS)
35463 NODE_NAME_CASE(VAARG_64)
35464 NODE_NAME_CASE(VAARG_X32)
35465 NODE_NAME_CASE(DYN_ALLOCA)
35466 NODE_NAME_CASE(MFENCE)
35467 NODE_NAME_CASE(SEG_ALLOCA)
35468 NODE_NAME_CASE(PROBED_ALLOCA)
35469 NODE_NAME_CASE(RDRAND)
35470 NODE_NAME_CASE(RDSEED)
35471 NODE_NAME_CASE(RDPKRU)
35472 NODE_NAME_CASE(WRPKRU)
35473 NODE_NAME_CASE(VPMADDUBSW)
35474 NODE_NAME_CASE(VPMADDWD)
35475 NODE_NAME_CASE(VPSHA)
35476 NODE_NAME_CASE(VPSHL)
35477 NODE_NAME_CASE(VPCOM)
35478 NODE_NAME_CASE(VPCOMU)
35479 NODE_NAME_CASE(VPERMIL2)
35480 NODE_NAME_CASE(FMSUB)
35481 NODE_NAME_CASE(STRICT_FMSUB)
35482 NODE_NAME_CASE(FNMADD)
35483 NODE_NAME_CASE(STRICT_FNMADD)
35484 NODE_NAME_CASE(FNMSUB)
35485 NODE_NAME_CASE(STRICT_FNMSUB)
35486 NODE_NAME_CASE(FMADDSUB)
35487 NODE_NAME_CASE(FMSUBADD)
35488 NODE_NAME_CASE(FMADD_RND)
35489 NODE_NAME_CASE(FNMADD_RND)
35490 NODE_NAME_CASE(FMSUB_RND)
35491 NODE_NAME_CASE(FNMSUB_RND)
35492 NODE_NAME_CASE(FMADDSUB_RND)
35493 NODE_NAME_CASE(FMSUBADD_RND)
35494 NODE_NAME_CASE(VFMADDC)
35495 NODE_NAME_CASE(VFMADDC_RND)
35496 NODE_NAME_CASE(VFCMADDC)
35497 NODE_NAME_CASE(VFCMADDC_RND)
35498 NODE_NAME_CASE(VFMULC)
35499 NODE_NAME_CASE(VFMULC_RND)
35500 NODE_NAME_CASE(VFCMULC)
35501 NODE_NAME_CASE(VFCMULC_RND)
35502 NODE_NAME_CASE(VFMULCSH)
35503 NODE_NAME_CASE(VFMULCSH_RND)
35504 NODE_NAME_CASE(VFCMULCSH)
35505 NODE_NAME_CASE(VFCMULCSH_RND)
35506 NODE_NAME_CASE(VFMADDCSH)
35507 NODE_NAME_CASE(VFMADDCSH_RND)
35508 NODE_NAME_CASE(VFCMADDCSH)
35509 NODE_NAME_CASE(VFCMADDCSH_RND)
35510 NODE_NAME_CASE(VPMADD52H)
35511 NODE_NAME_CASE(VPMADD52L)
35512 NODE_NAME_CASE(VRNDSCALE)
35513 NODE_NAME_CASE(STRICT_VRNDSCALE)
35514 NODE_NAME_CASE(VRNDSCALE_SAE)
35515 NODE_NAME_CASE(VRNDSCALES)
35516 NODE_NAME_CASE(VRNDSCALES_SAE)
35517 NODE_NAME_CASE(VREDUCE)
35518 NODE_NAME_CASE(VREDUCE_SAE)
35519 NODE_NAME_CASE(VREDUCES)
35520 NODE_NAME_CASE(VREDUCES_SAE)
35521 NODE_NAME_CASE(VGETMANT)
35522 NODE_NAME_CASE(VGETMANT_SAE)
35523 NODE_NAME_CASE(VGETMANTS)
35524 NODE_NAME_CASE(VGETMANTS_SAE)
35525 NODE_NAME_CASE(PCMPESTR)
35526 NODE_NAME_CASE(PCMPISTR)
35527 NODE_NAME_CASE(XTEST)
35528 NODE_NAME_CASE(COMPRESS)
35529 NODE_NAME_CASE(EXPAND)
35530 NODE_NAME_CASE(SELECTS)
35531 NODE_NAME_CASE(ADDSUB)
35532 NODE_NAME_CASE(RCP14)
35533 NODE_NAME_CASE(RCP14S)
35534 NODE_NAME_CASE(RCP28)
35535 NODE_NAME_CASE(RCP28_SAE)
35536 NODE_NAME_CASE(RCP28S)
35537 NODE_NAME_CASE(RCP28S_SAE)
35538 NODE_NAME_CASE(EXP2)
35539 NODE_NAME_CASE(EXP2_SAE)
35540 NODE_NAME_CASE(RSQRT14)
35541 NODE_NAME_CASE(RSQRT14S)
35542 NODE_NAME_CASE(RSQRT28)
35543 NODE_NAME_CASE(RSQRT28_SAE)
35544 NODE_NAME_CASE(RSQRT28S)
35545 NODE_NAME_CASE(RSQRT28S_SAE)
35546 NODE_NAME_CASE(FADD_RND)
35547 NODE_NAME_CASE(FADDS)
35548 NODE_NAME_CASE(FADDS_RND)
35549 NODE_NAME_CASE(FSUB_RND)
35550 NODE_NAME_CASE(FSUBS)
35551 NODE_NAME_CASE(FSUBS_RND)
35552 NODE_NAME_CASE(FMUL_RND)
35553 NODE_NAME_CASE(FMULS)
35554 NODE_NAME_CASE(FMULS_RND)
35555 NODE_NAME_CASE(FDIV_RND)
35556 NODE_NAME_CASE(FDIVS)
35557 NODE_NAME_CASE(FDIVS_RND)
35558 NODE_NAME_CASE(FSQRT_RND)
35559 NODE_NAME_CASE(FSQRTS)
35560 NODE_NAME_CASE(FSQRTS_RND)
35561 NODE_NAME_CASE(FGETEXP)
35562 NODE_NAME_CASE(FGETEXP_SAE)
35563 NODE_NAME_CASE(FGETEXPS)
35564 NODE_NAME_CASE(FGETEXPS_SAE)
35565 NODE_NAME_CASE(SCALEF)
35566 NODE_NAME_CASE(SCALEF_RND)
35567 NODE_NAME_CASE(SCALEFS)
35568 NODE_NAME_CASE(SCALEFS_RND)
35569 NODE_NAME_CASE(MULHRS)
35570 NODE_NAME_CASE(SINT_TO_FP_RND)
35571 NODE_NAME_CASE(UINT_TO_FP_RND)
35572 NODE_NAME_CASE(CVTTP2SI)
35573 NODE_NAME_CASE(CVTTP2UI)
35574 NODE_NAME_CASE(STRICT_CVTTP2SI)
35575 NODE_NAME_CASE(STRICT_CVTTP2UI)
35576 NODE_NAME_CASE(MCVTTP2SI)
35577 NODE_NAME_CASE(MCVTTP2UI)
35578 NODE_NAME_CASE(CVTTP2SI_SAE)
35579 NODE_NAME_CASE(CVTTP2UI_SAE)
35580 NODE_NAME_CASE(CVTTS2SI)
35581 NODE_NAME_CASE(CVTTS2UI)
35582 NODE_NAME_CASE(CVTTS2SI_SAE)
35583 NODE_NAME_CASE(CVTTS2UI_SAE)
35584 NODE_NAME_CASE(CVTSI2P)
35585 NODE_NAME_CASE(CVTUI2P)
35586 NODE_NAME_CASE(STRICT_CVTSI2P)
35587 NODE_NAME_CASE(STRICT_CVTUI2P)
35588 NODE_NAME_CASE(MCVTSI2P)
35589 NODE_NAME_CASE(MCVTUI2P)
35590 NODE_NAME_CASE(VFPCLASS)
35591 NODE_NAME_CASE(VFPCLASSS)
35592 NODE_NAME_CASE(MULTISHIFT)
35593 NODE_NAME_CASE(SCALAR_SINT_TO_FP)
35594 NODE_NAME_CASE(SCALAR_SINT_TO_FP_RND)
35595 NODE_NAME_CASE(SCALAR_UINT_TO_FP)
35596 NODE_NAME_CASE(SCALAR_UINT_TO_FP_RND)
35597 NODE_NAME_CASE(CVTPS2PH)
35598 NODE_NAME_CASE(STRICT_CVTPS2PH)
35599 NODE_NAME_CASE(CVTPS2PH_SAE)
35600 NODE_NAME_CASE(MCVTPS2PH)
35601 NODE_NAME_CASE(MCVTPS2PH_SAE)
35602 NODE_NAME_CASE(CVTPH2PS)
35603 NODE_NAME_CASE(STRICT_CVTPH2PS)
35604 NODE_NAME_CASE(CVTPH2PS_SAE)
35605 NODE_NAME_CASE(CVTP2SI)
35606 NODE_NAME_CASE(CVTP2UI)
35607 NODE_NAME_CASE(MCVTP2SI)
35608 NODE_NAME_CASE(MCVTP2UI)
35609 NODE_NAME_CASE(CVTP2SI_RND)
35610 NODE_NAME_CASE(CVTP2UI_RND)
35611 NODE_NAME_CASE(CVTS2SI)
35612 NODE_NAME_CASE(CVTS2UI)
35613 NODE_NAME_CASE(CVTS2SI_RND)
35614 NODE_NAME_CASE(CVTS2UI_RND)
35615 NODE_NAME_CASE(CVTNE2PS2BF16)
35616 NODE_NAME_CASE(CVTNEPS2BF16)
35617 NODE_NAME_CASE(MCVTNEPS2BF16)
35618 NODE_NAME_CASE(DPBF16PS)
35619 NODE_NAME_CASE(LWPINS)
35620 NODE_NAME_CASE(MGATHER)
35621 NODE_NAME_CASE(MSCATTER)
35622 NODE_NAME_CASE(VPDPBUSD)
35623 NODE_NAME_CASE(VPDPBUSDS)
35624 NODE_NAME_CASE(VPDPWSSD)
35625 NODE_NAME_CASE(VPDPWSSDS)
35626 NODE_NAME_CASE(VPSHUFBITQMB)
35627 NODE_NAME_CASE(GF2P8MULB)
35628 NODE_NAME_CASE(GF2P8AFFINEQB)
35629 NODE_NAME_CASE(GF2P8AFFINEINVQB)
35630 NODE_NAME_CASE(NT_CALL)
35631 NODE_NAME_CASE(NT_BRIND)
35632 NODE_NAME_CASE(UMWAIT)
35633 NODE_NAME_CASE(TPAUSE)
35634 NODE_NAME_CASE(ENQCMD)
35635 NODE_NAME_CASE(ENQCMDS)
35636 NODE_NAME_CASE(VP2INTERSECT)
35637 NODE_NAME_CASE(VPDPBSUD)
35638 NODE_NAME_CASE(VPDPBSUDS)
35639 NODE_NAME_CASE(VPDPBUUD)
35640 NODE_NAME_CASE(VPDPBUUDS)
35641 NODE_NAME_CASE(VPDPBSSD)
35642 NODE_NAME_CASE(VPDPBSSDS)
35643 NODE_NAME_CASE(AESENC128KL)
35644 NODE_NAME_CASE(AESDEC128KL)
35645 NODE_NAME_CASE(AESENC256KL)
35646 NODE_NAME_CASE(AESDEC256KL)
35647 NODE_NAME_CASE(AESENCWIDE128KL)
35648 NODE_NAME_CASE(AESDECWIDE128KL)
35649 NODE_NAME_CASE(AESENCWIDE256KL)
35650 NODE_NAME_CASE(AESDECWIDE256KL)
35651 NODE_NAME_CASE(CMPCCXADD)
35652 NODE_NAME_CASE(TESTUI)
35653 NODE_NAME_CASE(FP80_ADD)
35654 NODE_NAME_CASE(STRICT_FP80_ADD)
35655 }
35656 return nullptr;
35657#undef NODE_NAME_CASE
35658}
35659
35660/// Return true if the addressing mode represented by AM is legal for this
35661/// target, for a load/store of the specified type.
35662bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
35663 const AddrMode &AM, Type *Ty,
35664 unsigned AS,
35665 Instruction *I) const {
35666 // X86 supports extremely general addressing modes.
35667 CodeModel::Model M = getTargetMachine().getCodeModel();
35668
35669 // X86 allows a sign-extended 32-bit immediate field as a displacement.
35670 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
35671 return false;
35672
35673 if (AM.BaseGV) {
35674 unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
35675
35676 // If a reference to this global requires an extra load, we can't fold it.
35677 if (isGlobalStubReference(GVFlags))
35678 return false;
35679
35680 // If BaseGV requires a register for the PIC base, we cannot also have a
35681 // BaseReg specified.
35682 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
35683 return false;
35684
35685 // If lower 4G is not available, then we must use rip-relative addressing.
35686 if ((M != CodeModel::Small || isPositionIndependent()) &&
35687 Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
35688 return false;
35689 }
35690
35691 switch (AM.Scale) {
35692 case 0:
35693 case 1:
35694 case 2:
35695 case 4:
35696 case 8:
35697 // These scales always work.
35698 break;
35699 case 3:
35700 case 5:
35701 case 9:
35702 // These scales are formed with basereg+scalereg. Only accept if there is
35703 // no basereg yet.
35704 if (AM.HasBaseReg)
35705 return false;
35706 break;
35707 default: // Other stuff never works.
35708 return false;
35709 }
35710
35711 return true;
35712}
35713
35714bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
35715 unsigned Bits = Ty->getScalarSizeInBits();
35716
35717 // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts.
35718 // Splitting for v32i8/v16i16 on XOP+AVX2 targets is still preferred.
35719 if (Subtarget.hasXOP() &&
35720 (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64))
35721 return false;
35722
35723 // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
35724 // shifts just as cheap as scalar ones.
35725 if (Subtarget.hasAVX2() && (Bits == 32 || Bits == 64))
35726 return false;
35727
35728 // AVX512BW has shifts such as vpsllvw.
35729 if (Subtarget.hasBWI() && Bits == 16)
35730 return false;
35731
35732 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
35733 // fully general vector.
35734 return true;
35735}
35736
35737bool X86TargetLowering::isBinOp(unsigned Opcode) const {
35738 switch (Opcode) {
35739 // These are non-commutative binops.
35740 // TODO: Add more X86ISD opcodes once we have test coverage.
35741 case X86ISD::ANDNP:
35742 case X86ISD::PCMPGT:
35743 case X86ISD::FMAX:
35744 case X86ISD::FMIN:
35745 case X86ISD::FANDN:
35746 case X86ISD::VPSHA:
35747 case X86ISD::VPSHL:
35748 case X86ISD::VSHLV:
35749 case X86ISD::VSRLV:
35750 case X86ISD::VSRAV:
35751 return true;
35752 }
35753
35754 return TargetLoweringBase::isBinOp(Opcode);
35755}
35756
35757bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const {
35758 switch (Opcode) {
35759 // TODO: Add more X86ISD opcodes once we have test coverage.
35760 case X86ISD::PCMPEQ:
35761 case X86ISD::PMULDQ:
35762 case X86ISD::PMULUDQ:
35763 case X86ISD::FMAXC:
35764 case X86ISD::FMINC:
35765 case X86ISD::FAND:
35766 case X86ISD::FOR:
35767 case X86ISD::FXOR:
35768 return true;
35769 }
35770
35771 return TargetLoweringBase::isCommutativeBinOp(Opcode);
35772}
35773
35774bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
35775 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
35776 return false;
35777 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
35778 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
35779 return NumBits1 > NumBits2;
35780}
35781
35782bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
35783 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
35784 return false;
35785
35786 if (!isTypeLegal(EVT::getEVT(Ty1)))
35787 return false;
35788
35789  assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
35790
35791 // Assuming the caller doesn't have a zeroext or signext return parameter,
35792 // truncation all the way down to i1 is valid.
35793 return true;
35794}
35795
35796bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
35797 return isInt<32>(Imm);
35798}
35799
35800bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
35801 // Can also use sub to handle negated immediates.
35802 return isInt<32>(Imm);
35803}
35804
35805bool X86TargetLowering::isLegalStoreImmediate(int64_t Imm) const {
35806 return isInt<32>(Imm);
35807}
35808
35809bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
35810 if (!VT1.isScalarInteger() || !VT2.isScalarInteger())
35811 return false;
35812 unsigned NumBits1 = VT1.getSizeInBits();
35813 unsigned NumBits2 = VT2.getSizeInBits();
35814 return NumBits1 > NumBits2;
35815}
35816
35817bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
35818 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
35819 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
35820}
35821
35822bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
35823 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
35824 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
35825}
35826
35827bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
35828 EVT VT1 = Val.getValueType();
35829 if (isZExtFree(VT1, VT2))
35830 return true;
35831
35832 if (Val.getOpcode() != ISD::LOAD)
35833 return false;
35834
35835 if (!VT1.isSimple() || !VT1.isInteger() ||
35836 !VT2.isSimple() || !VT2.isInteger())
35837 return false;
35838
35839 switch (VT1.getSimpleVT().SimpleTy) {
35840 default: break;
35841 case MVT::i8:
35842 case MVT::i16:
35843 case MVT::i32:
35844 // X86 has 8, 16, and 32-bit zero-extending loads.
35845 return true;
35846 }
35847
35848 return false;
35849}
35850
35851bool X86TargetLowering::shouldSinkOperands(Instruction *I,
35852 SmallVectorImpl<Use *> &Ops) const {
35853 using namespace llvm::PatternMatch;
35854
35855 FixedVectorType *VTy = dyn_cast<FixedVectorType>(I->getType());
35856 if (!VTy)
35857 return false;
35858
35859 if (I->getOpcode() == Instruction::Mul &&
35860 VTy->getElementType()->isIntegerTy(64)) {
35861 for (auto &Op : I->operands()) {
35862 // Make sure we are not already sinking this operand
35863 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
35864 continue;
35865
35866 // Look for PMULDQ pattern where the input is a sext_inreg from vXi32 or
35867 // the PMULUDQ pattern where the input is a zext_inreg from vXi32.
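      // At the IR level the sext_inreg appears as ashr(shl(x, 32), 32) and the
      // zext_inreg as and(x, 0xffffffff), which is what the matchers below
      // look for.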
35868 if (Subtarget.hasSSE41() &&
35869 match(Op.get(), m_AShr(m_Shl(m_Value(), m_SpecificInt(32)),
35870 m_SpecificInt(32)))) {
35871 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
35872 Ops.push_back(&Op);
35873 } else if (Subtarget.hasSSE2() &&
35874 match(Op.get(),
35875                       m_And(m_Value(), m_SpecificInt(UINT64_C(0xffffffff))))) {
35876 Ops.push_back(&Op);
35877 }
35878 }
35879
35880 return !Ops.empty();
35881 }
35882
35883 // A uniform shift amount in a vector shift or funnel shift may be much
35884 // cheaper than a generic variable vector shift, so make that pattern visible
35885 // to SDAG by sinking the shuffle instruction next to the shift.
35886 int ShiftAmountOpNum = -1;
35887 if (I->isShift())
35888 ShiftAmountOpNum = 1;
35889 else if (auto *II = dyn_cast<IntrinsicInst>(I)) {
35890 if (II->getIntrinsicID() == Intrinsic::fshl ||
35891 II->getIntrinsicID() == Intrinsic::fshr)
35892 ShiftAmountOpNum = 2;
35893 }
35894
35895 if (ShiftAmountOpNum == -1)
35896 return false;
35897
35898 auto *Shuf = dyn_cast<ShuffleVectorInst>(I->getOperand(ShiftAmountOpNum));
35899 if (Shuf && getSplatIndex(Shuf->getShuffleMask()) >= 0 &&
35900 isVectorShiftByScalarCheap(I->getType())) {
35901 Ops.push_back(&I->getOperandUse(ShiftAmountOpNum));
35902 return true;
35903 }
35904
35905 return false;
35906}
35907
35908bool X86TargetLowering::shouldConvertPhiType(Type *From, Type *To) const {
35909 if (!Subtarget.is64Bit())
35910 return false;
35911 return TargetLowering::shouldConvertPhiType(From, To);
35912}
35913
35914bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
35915 if (isa<MaskedLoadSDNode>(ExtVal.getOperand(0)))
35916 return false;
35917
35918 EVT SrcVT = ExtVal.getOperand(0).getValueType();
35919
35920 // There is no extending load for vXi1.
35921 if (SrcVT.getScalarType() == MVT::i1)
35922 return false;
35923
35924 return true;
35925}
35926
35927bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
35928 EVT VT) const {
35929 if (!Subtarget.hasAnyFMA())
35930 return false;
35931
35932 VT = VT.getScalarType();
35933
35934 if (!VT.isSimple())
35935 return false;
35936
35937 switch (VT.getSimpleVT().SimpleTy) {
35938 case MVT::f16:
35939 return Subtarget.hasFP16();
35940 case MVT::f32:
35941 case MVT::f64:
35942 return true;
35943 default:
35944 break;
35945 }
35946
35947 return false;
35948}
35949
35950bool X86TargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
35951 // i16 instructions are longer (0x66 prefix) and potentially slower.
35952 return !(SrcVT == MVT::i32 && DestVT == MVT::i16);
35953}
35954
35955bool X86TargetLowering::shouldFoldSelectWithIdentityConstant(unsigned Opcode,
35956 EVT VT) const {
35957 // TODO: This is too general. There are cases where pre-AVX512 codegen would
35958 // benefit. The transform may also be profitable for scalar code.
35959 if (!Subtarget.hasAVX512())
35960 return false;
35961 if (!Subtarget.hasVLX() && !VT.is512BitVector())
35962 return false;
35963 if (!VT.isVector() || VT.getScalarType() == MVT::i1)
35964 return false;
35965
35966 return true;
35967}
35968
35969/// Targets can use this to indicate that they only support *some*
35970/// VECTOR_SHUFFLE operations, those with specific masks.
35971/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
35972/// are assumed to be legal.
35973bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const {
35974 if (!VT.isSimple())
35975 return false;
35976
35977 // Not for i1 vectors
35978 if (VT.getSimpleVT().getScalarType() == MVT::i1)
35979 return false;
35980
35981 // Very little shuffling can be done for 64-bit vectors right now.
35982 if (VT.getSimpleVT().getSizeInBits() == 64)
35983 return false;
35984
35985 // We only care that the types being shuffled are legal. The lowering can
35986 // handle any possible shuffle mask that results.
35987 return isTypeLegal(VT.getSimpleVT());
35988}
35989
35990bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef<int> Mask,
35991 EVT VT) const {
35992 // Don't convert an 'and' into a shuffle that we don't directly support.
35993 // vpblendw and vpshufb for 256-bit vectors are not available on AVX1.
35994 if (!Subtarget.hasAVX2())
35995 if (VT == MVT::v32i8 || VT == MVT::v16i16)
35996 return false;
35997
35998 // Just delegate to the generic legality, clear masks aren't special.
35999 return isShuffleMaskLegal(Mask, VT);
36000}
36001
36002bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
36003 // If the subtarget is using thunks, we need to not generate jump tables.
36004 if (Subtarget.useIndirectThunkBranches())
36005 return false;
36006
36007 // Otherwise, fallback on the generic logic.
36008 return TargetLowering::areJTsAllowed(Fn);
36009}
36010
36011MVT X86TargetLowering::getPreferredSwitchConditionType(LLVMContext &Context,
36012 EVT ConditionVT) const {
36013 // Avoid 8 and 16 bit types because they increase the chance for unnecessary
36014 // zero-extensions.
36015 if (ConditionVT.getSizeInBits() < 32)
36016 return MVT::i32;
36017 return TargetLoweringBase::getPreferredSwitchConditionType(Context,
36018 ConditionVT);
36019}
36020
36021//===----------------------------------------------------------------------===//
36022// X86 Scheduler Hooks
36023//===----------------------------------------------------------------------===//
36024
36025 // Returns true if EFLAGS is consumed after this iterator in the rest of the
36026// basic block or any successors of the basic block.
36027static bool isEFLAGSLiveAfter(MachineBasicBlock::iterator Itr,
36028 MachineBasicBlock *BB) {
36029 // Scan forward through BB for a use/def of EFLAGS.
36030 for (const MachineInstr &mi : llvm::make_range(std::next(Itr), BB->end())) {
36031 if (mi.readsRegister(X86::EFLAGS))
36032 return true;
36033 // If we found a def, we can stop searching.
36034 if (mi.definesRegister(X86::EFLAGS))
36035 return false;
36036 }
36037
36038 // If we hit the end of the block, check whether EFLAGS is live into a
36039 // successor.
36040 for (MachineBasicBlock *Succ : BB->successors())
36041 if (Succ->isLiveIn(X86::EFLAGS))
36042 return true;
36043
36044 return false;
36045}
36046
36047/// Utility function to emit xbegin specifying the start of an RTM region.
36048static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
36049 const TargetInstrInfo *TII) {
36050 const DebugLoc &DL = MI.getDebugLoc();
36051
36052 const BasicBlock *BB = MBB->getBasicBlock();
36053 MachineFunction::iterator I = ++MBB->getIterator();
36054
36055 // For the v = xbegin(), we generate
36056 //
36057 // thisMBB:
36058 // xbegin sinkMBB
36059 //
36060 // mainMBB:
36061 // s0 = -1
36062 //
36063 // fallBB:
36064 // eax = # XABORT_DEF
36065 // s1 = eax
36066 //
36067 // sinkMBB:
36068 // v = phi(s0/mainBB, s1/fallBB)
36069
36070 MachineBasicBlock *thisMBB = MBB;
36071 MachineFunction *MF = MBB->getParent();
36072 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
36073 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
36074 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
36075 MF->insert(I, mainMBB);
36076 MF->insert(I, fallMBB);
36077 MF->insert(I, sinkMBB);
36078
36079 if (isEFLAGSLiveAfter(MI, MBB)) {
36080 mainMBB->addLiveIn(X86::EFLAGS);
36081 fallMBB->addLiveIn(X86::EFLAGS);
36082 sinkMBB->addLiveIn(X86::EFLAGS);
36083 }
36084
36085 // Transfer the remainder of BB and its successor edges to sinkMBB.
36086 sinkMBB->splice(sinkMBB->begin(), MBB,
36087 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
36088 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
36089
36090 MachineRegisterInfo &MRI = MF->getRegInfo();
36091 Register DstReg = MI.getOperand(0).getReg();
36092 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
36093 Register mainDstReg = MRI.createVirtualRegister(RC);
36094 Register fallDstReg = MRI.createVirtualRegister(RC);
36095
36096 // thisMBB:
36097 // xbegin fallMBB
36098 // # fallthrough to mainMBB
36099 // # on abort, branch to fallMBB
36100 BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
36101 thisMBB->addSuccessor(mainMBB);
36102 thisMBB->addSuccessor(fallMBB);
36103
36104 // mainMBB:
36105 // mainDstReg := -1
36106 BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
36107 BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
36108 mainMBB->addSuccessor(sinkMBB);
36109
36110 // fallMBB:
36111 // ; pseudo instruction to model hardware's definition from XABORT
36112 // EAX := XABORT_DEF
36113 // fallDstReg := EAX
36114 BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF));
36115 BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg)
36116 .addReg(X86::EAX);
36117 fallMBB->addSuccessor(sinkMBB);
36118
36119 // sinkMBB:
36120 // DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
36121 BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg)
36122 .addReg(mainDstReg).addMBB(mainMBB)
36123 .addReg(fallDstReg).addMBB(fallMBB);
36124
36125 MI.eraseFromParent();
36126 return sinkMBB;
36127}
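
For reference, the user-level intrinsic whose lowering this implements behaves roughly as below (a sketch assuming an RTM-capable CPU and compilation with -mrtm; this is application code, not part of the backend). _XBEGIN_STARTED is ~0u, i.e. the "-1" materialized in mainMBB, and the abort status is the EAX value copied in fallMBB.

#include <immintrin.h>

unsigned tryTransactionSketch(int &Value) {
  unsigned Status = _xbegin();        // lowers to XBEGIN <fallback label>
  if (Status == _XBEGIN_STARTED) {    // fall-through path: transaction started
    Value = 42;                       // transactional work
    _xend();                          // commit
    return 0;
  }
  return Status;                      // abort path: status delivered in EAX
}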
36128
36129MachineBasicBlock *
36130X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
36131 MachineBasicBlock *MBB) const {
36132 // Emit va_arg instruction on X86-64.
36133
36134 // Operands to this pseudo-instruction:
36135 // 0 ) Output : destination address (reg)
36136 // 1-5) Input : va_list address (addr, i64mem)
36137 // 6 ) ArgSize : Size (in bytes) of vararg type
36138 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
36139 // 8 ) Align : Alignment of type
36140 // 9 ) EFLAGS (implicit-def)
36141
36142 assert(MI.getNumOperands() == 10 && "VAARG should have 10 operands!");
36143 static_assert(X86::AddrNumOperands == 5, "VAARG assumes 5 address operands");
36144
36145 Register DestReg = MI.getOperand(0).getReg();
36146 MachineOperand &Base = MI.getOperand(1);
36147 MachineOperand &Scale = MI.getOperand(2);
36148 MachineOperand &Index = MI.getOperand(3);
36149 MachineOperand &Disp = MI.getOperand(4);
36150 MachineOperand &Segment = MI.getOperand(5);
36151 unsigned ArgSize = MI.getOperand(6).getImm();
36152 unsigned ArgMode = MI.getOperand(7).getImm();
36153 Align Alignment = Align(MI.getOperand(8).getImm());
36154
36155 MachineFunction *MF = MBB->getParent();
36156
36157 // Memory Reference
36158 assert(MI.hasOneMemOperand() && "Expected VAARG to have one memoperand");
36159
36160 MachineMemOperand *OldMMO = MI.memoperands().front();
36161
36162 // Clone the MMO into two separate MMOs for loading and storing
36163 MachineMemOperand *LoadOnlyMMO = MF->getMachineMemOperand(
36164 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOStore);
36165 MachineMemOperand *StoreOnlyMMO = MF->getMachineMemOperand(
36166 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOLoad);
36167
36168 // Machine Information
36169 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36170 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
36171 const TargetRegisterClass *AddrRegClass =
36172 getRegClassFor(getPointerTy(MBB->getParent()->getDataLayout()));
36173 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
36174 const DebugLoc &DL = MI.getDebugLoc();
36175
36176 // struct va_list {
36177 // i32 gp_offset
36178 // i32 fp_offset
36179 // i64 overflow_area (address)
36180 // i64 reg_save_area (address)
36181 // }
36182 // sizeof(va_list) = 24
36183 // alignment(va_list) = 8
36184
36185 unsigned TotalNumIntRegs = 6;
36186 unsigned TotalNumXMMRegs = 8;
36187 bool UseGPOffset = (ArgMode == 1);
36188 bool UseFPOffset = (ArgMode == 2);
36189 unsigned MaxOffset = TotalNumIntRegs * 8 +
36190 (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
36191
36192 /* Align ArgSize to a multiple of 8 */
36193 unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
36194 bool NeedsAlign = (Alignment > 8);
36195
36196 MachineBasicBlock *thisMBB = MBB;
36197 MachineBasicBlock *overflowMBB;
36198 MachineBasicBlock *offsetMBB;
36199 MachineBasicBlock *endMBB;
36200
36201 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
36202 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
36203 unsigned OffsetReg = 0;
36204
36205 if (!UseGPOffset && !UseFPOffset) {
36206 // If we only pull from the overflow region, we don't create a branch.
36207 // We don't need to alter control flow.
36208 OffsetDestReg = 0; // unused
36209 OverflowDestReg = DestReg;
36210
36211 offsetMBB = nullptr;
36212 overflowMBB = thisMBB;
36213 endMBB = thisMBB;
36214 } else {
36215 // First emit code to check if gp_offset (or fp_offset) is below the bound.
36216 // If so, pull the argument from reg_save_area. (branch to offsetMBB)
36217 // If not, pull from overflow_area. (branch to overflowMBB)
36218 //
36219 // thisMBB
36220 // | .
36221 // | .
36222 // offsetMBB overflowMBB
36223 // | .
36224 // | .
36225 // endMBB
36226
36227 // Registers for the PHI in endMBB
36228 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
36229 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
36230
36231 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
36232 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36233 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36234 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36235
36236 MachineFunction::iterator MBBIter = ++MBB->getIterator();
36237
36238 // Insert the new basic blocks
36239 MF->insert(MBBIter, offsetMBB);
36240 MF->insert(MBBIter, overflowMBB);
36241 MF->insert(MBBIter, endMBB);
36242
36243 // Transfer the remainder of MBB and its successor edges to endMBB.
36244 endMBB->splice(endMBB->begin(), thisMBB,
36245 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
36246 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
36247
36248 // Make offsetMBB and overflowMBB successors of thisMBB
36249 thisMBB->addSuccessor(offsetMBB);
36250 thisMBB->addSuccessor(overflowMBB);
36251
36252 // endMBB is a successor of both offsetMBB and overflowMBB
36253 offsetMBB->addSuccessor(endMBB);
36254 overflowMBB->addSuccessor(endMBB);
36255
36256 // Load the offset value into a register
36257 OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
36258 BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
36259 .add(Base)
36260 .add(Scale)
36261 .add(Index)
36262 .addDisp(Disp, UseFPOffset ? 4 : 0)
36263 .add(Segment)
36264 .setMemRefs(LoadOnlyMMO);
36265
36266 // Check if there is enough room left to pull this argument.
36267 BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
36268 .addReg(OffsetReg)
36269 .addImm(MaxOffset + 8 - ArgSizeA8);
36270
36271 // Branch to "overflowMBB" if offset >= max
36272 // Fall through to "offsetMBB" otherwise
36273 BuildMI(thisMBB, DL, TII->get(X86::JCC_1))
36274 .addMBB(overflowMBB).addImm(X86::COND_AE);
36275 }
36276
36277 // In offsetMBB, emit code to use the reg_save_area.
36278 if (offsetMBB) {
36279 assert(OffsetReg != 0);
36280
36281 // Read the reg_save_area address.
36282 Register RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
36283 BuildMI(
36284 offsetMBB, DL,
36285 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
36286 RegSaveReg)
36287 .add(Base)
36288 .add(Scale)
36289 .add(Index)
36290 .addDisp(Disp, Subtarget.isTarget64BitLP64() ? 16 : 12)
36291 .add(Segment)
36292 .setMemRefs(LoadOnlyMMO);
36293
36294 if (Subtarget.isTarget64BitLP64()) {
36295 // Zero-extend the offset
36296 Register OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
36297 BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
36298 .addImm(0)
36299 .addReg(OffsetReg)
36300 .addImm(X86::sub_32bit);
36301
36302 // Add the offset to the reg_save_area to get the final address.
36303 BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
36304 .addReg(OffsetReg64)
36305 .addReg(RegSaveReg);
36306 } else {
36307 // Add the offset to the reg_save_area to get the final address.
36308 BuildMI(offsetMBB, DL, TII->get(X86::ADD32rr), OffsetDestReg)
36309 .addReg(OffsetReg)
36310 .addReg(RegSaveReg);
36311 }
36312
36313 // Compute the offset for the next argument
36314 Register NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
36315 BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
36316 .addReg(OffsetReg)
36317 .addImm(UseFPOffset ? 16 : 8);
36318
36319 // Store it back into the va_list.
36320 BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
36321 .add(Base)
36322 .add(Scale)
36323 .add(Index)
36324 .addDisp(Disp, UseFPOffset ? 4 : 0)
36325 .add(Segment)
36326 .addReg(NextOffsetReg)
36327 .setMemRefs(StoreOnlyMMO);
36328
36329 // Jump to endMBB
36330 BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
36331 .addMBB(endMBB);
36332 }
36333
36334 //
36335 // Emit code to use overflow area
36336 //
36337
36338 // Load the overflow_area address into a register.
36339 Register OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
36340 BuildMI(overflowMBB, DL,
36341 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
36342 OverflowAddrReg)
36343 .add(Base)
36344 .add(Scale)
36345 .add(Index)
36346 .addDisp(Disp, 8)
36347 .add(Segment)
36348 .setMemRefs(LoadOnlyMMO);
36349
36350 // If we need to align it, do so. Otherwise, just copy the address
36351 // to OverflowDestReg.
36352 if (NeedsAlign) {
36353 // Align the overflow address
36354 Register TmpReg = MRI.createVirtualRegister(AddrRegClass);
36355
36356 // aligned_addr = (addr + (align-1)) & ~(align-1)
36357 BuildMI(
36358 overflowMBB, DL,
36359 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
36360 TmpReg)
36361 .addReg(OverflowAddrReg)
36362 .addImm(Alignment.value() - 1);
36363
36364 BuildMI(
36365 overflowMBB, DL,
36366 TII->get(Subtarget.isTarget64BitLP64() ? X86::AND64ri32 : X86::AND32ri),
36367 OverflowDestReg)
36368 .addReg(TmpReg)
36369 .addImm(~(uint64_t)(Alignment.value() - 1));
36370 } else {
36371 BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
36372 .addReg(OverflowAddrReg);
36373 }
36374
36375 // Compute the next overflow address after this argument.
36376 // (the overflow address should be kept 8-byte aligned)
36377 Register NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
36378 BuildMI(
36379 overflowMBB, DL,
36380 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
36381 NextAddrReg)
36382 .addReg(OverflowDestReg)
36383 .addImm(ArgSizeA8);
36384
36385 // Store the new overflow address.
36386 BuildMI(overflowMBB, DL,
36387 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64mr : X86::MOV32mr))
36388 .add(Base)
36389 .add(Scale)
36390 .add(Index)
36391 .addDisp(Disp, 8)
36392 .add(Segment)
36393 .addReg(NextAddrReg)
36394 .setMemRefs(StoreOnlyMMO);
36395
36396 // If we branched, emit the PHI to the front of endMBB.
36397 if (offsetMBB) {
36398 BuildMI(*endMBB, endMBB->begin(), DL,
36399 TII->get(X86::PHI), DestReg)
36400 .addReg(OffsetDestReg).addMBB(offsetMBB)
36401 .addReg(OverflowDestReg).addMBB(overflowMBB);
36402 }
36403
36404 // Erase the pseudo instruction
36405 MI.eraseFromParent();
36406
36407 return endMBB;
36408}
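
A simplified C++-level sketch of the gp_offset branch built above, for a single general-purpose-register argument (an illustration under that assumption; the field names follow the va_list comment earlier in this function, and the real lowering also handles fp_offset and over-aligned types):

struct VaListSketch {
  unsigned gp_offset;       // byte offset of the next unused GP slot
  unsigned fp_offset;       // byte offset of the next unused XMM slot
  void *overflow_arg_area;  // stack area for arguments that spilled
  void *reg_save_area;      // 6 GP slots (48 bytes), then 8 XMM slots
};

static void *vaArgGPSketch(VaListSketch &ap, unsigned argSize) {
  unsigned sizeA8 = (argSize + 7) & ~7u;            // align the size to 8 bytes
  if (argSize <= 8 && ap.gp_offset < 6 * 8) {       // room left in reg_save_area
    void *addr = static_cast<char *>(ap.reg_save_area) + ap.gp_offset;
    ap.gp_offset += 8;                              // "offsetMBB": bump offset
    return addr;
  }
  void *addr = ap.overflow_arg_area;                // "overflowMBB" path
  ap.overflow_arg_area = static_cast<char *>(ap.overflow_arg_area) + sizeA8;
  return addr;
}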
36409
36410// The EFLAGS operand of SelectItr might be missing a kill marker
36411// because there were multiple uses of EFLAGS, and ISel didn't know
36412// which to mark. Figure out whether SelectItr should have had a
36413// kill marker, and set it if it should. Returns the correct kill
36414// marker value.
36415static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
36416 MachineBasicBlock* BB,
36417 const TargetRegisterInfo* TRI) {
36418 if (isEFLAGSLiveAfter(SelectItr, BB))
36419 return false;
36420
36421 // We found a def, or hit the end of the basic block and EFLAGS wasn't live
36422 // out. SelectMI should have a kill flag on EFLAGS.
36423 SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
36424 return true;
36425}
36426
36427// Return true if it is OK for this CMOV pseudo-opcode to be cascaded
36428// together with other CMOV pseudo-opcodes into a single basic-block with
36429// conditional jump around it.
36430static bool isCMOVPseudo(MachineInstr &MI) {
36431 switch (MI.getOpcode()) {
36432 case X86::CMOV_FR16:
36433 case X86::CMOV_FR16X:
36434 case X86::CMOV_FR32:
36435 case X86::CMOV_FR32X:
36436 case X86::CMOV_FR64:
36437 case X86::CMOV_FR64X:
36438 case X86::CMOV_GR8:
36439 case X86::CMOV_GR16:
36440 case X86::CMOV_GR32:
36441 case X86::CMOV_RFP32:
36442 case X86::CMOV_RFP64:
36443 case X86::CMOV_RFP80:
36444 case X86::CMOV_VR64:
36445 case X86::CMOV_VR128:
36446 case X86::CMOV_VR128X:
36447 case X86::CMOV_VR256:
36448 case X86::CMOV_VR256X:
36449 case X86::CMOV_VR512:
36450 case X86::CMOV_VK1:
36451 case X86::CMOV_VK2:
36452 case X86::CMOV_VK4:
36453 case X86::CMOV_VK8:
36454 case X86::CMOV_VK16:
36455 case X86::CMOV_VK32:
36456 case X86::CMOV_VK64:
36457 return true;
36458
36459 default:
36460 return false;
36461 }
36462}
36463
36464// Helper function that inserts PHI functions into SinkMBB:
36465// %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
36466// where %FalseValue(i) and %TrueValue(i) are taken from the consecutive CMOVs
36467// in the [MIItBegin, MIItEnd) range. It returns the MachineInstrBuilder for the
36468// last PHI inserted.
36469static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
36470 MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
36471 MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
36472 MachineBasicBlock *SinkMBB) {
36473 MachineFunction *MF = TrueMBB->getParent();
36474 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
36475 const DebugLoc &DL = MIItBegin->getDebugLoc();
36476
36477 X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
36478 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
36479
36480 MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
36481
36482 // As we are creating the PHIs, we have to be careful if there is more than
36483 // one. Later CMOVs may reference the results of earlier CMOVs, but later
36484 // PHIs have to reference the individual true/false inputs from earlier PHIs.
36485 // That also means that PHI construction must work forward from earlier to
36486 // later, and that the code must maintain a mapping from each earlier PHI's
36487 // destination register to the registers that went into that PHI.
36488 DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
36489 MachineInstrBuilder MIB;
36490
36491 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
36492 Register DestReg = MIIt->getOperand(0).getReg();
36493 Register Op1Reg = MIIt->getOperand(1).getReg();
36494 Register Op2Reg = MIIt->getOperand(2).getReg();
36495
36496 // If this CMOV we are generating is the opposite condition from
36497 // the jump we generated, then we have to swap the operands for the
36498 // PHI that is going to be generated.
36499 if (MIIt->getOperand(3).getImm() == OppCC)
36500 std::swap(Op1Reg, Op2Reg);
36501
36502 if (RegRewriteTable.contains(Op1Reg))
36503 Op1Reg = RegRewriteTable[Op1Reg].first;
36504
36505 if (RegRewriteTable.contains(Op2Reg))
36506 Op2Reg = RegRewriteTable[Op2Reg].second;
36507
36508 MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg)
36509 .addReg(Op1Reg)
36510 .addMBB(FalseMBB)
36511 .addReg(Op2Reg)
36512 .addMBB(TrueMBB);
36513
36514 // Add this PHI to the rewrite table.
36515 RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
36516 }
36517
36518 return MIB;
36519}
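
The renaming performed through RegRewriteTable can be sketched with plain containers (hypothetical virtual register numbers, for illustration only): once an earlier CMOV's result has become a PHI, a later CMOV that read that result must instead read the value it had on the corresponding edge.

#include <cassert>
#include <map>
#include <utility>

static void rewriteTableSketch() {
  // DestReg -> (value on the FalseMBB edge, value on the TrueMBB edge)
  std::map<unsigned, std::pair<unsigned, unsigned>> RewriteTable;

  // First CMOV:  %2 = CMOV %1, %10  becomes  %2 = PHI [%1, False], [%10, True]
  RewriteTable[2] = {1, 10};

  // Second CMOV: %3 = CMOV %2, %11. On the FalseMBB edge %2 is really %1, so
  // the next PHI must read %1 there rather than the later-defined %2.
  unsigned Op1 = 2, Op2 = 11;
  if (RewriteTable.count(Op1))
    Op1 = RewriteTable[Op1].first;    // %2 -> %1 on the FalseMBB edge
  if (RewriteTable.count(Op2))
    Op2 = RewriteTable[Op2].second;
  assert(Op1 == 1 && Op2 == 11);      // %3 = PHI [%1, False], [%11, True]
}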
36520
36521// Lower cascaded selects of the form (SecondCMOV (FirstCMOV F, T, cc1), T, cc2).
36522MachineBasicBlock *
36523X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
36524 MachineInstr &SecondCascadedCMOV,
36525 MachineBasicBlock *ThisMBB) const {
36526 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36527 const DebugLoc &DL = FirstCMOV.getDebugLoc();
36528
36529 // We lower cascaded CMOVs such as
36530 //
36531 // (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
36532 //
36533 // to two successive branches.
36534 //
36535 // Without this, we would add a PHI between the two jumps, which ends up
36536 // creating a few copies all around. For instance, for
36537 //
36538 // (sitofp (zext (fcmp une)))
36539 //
36540 // we would generate:
36541 //
36542 // ucomiss %xmm1, %xmm0
36543 // movss <1.0f>, %xmm0
36544 // movaps %xmm0, %xmm1
36545 // jne .LBB5_2
36546 // xorps %xmm1, %xmm1
36547 // .LBB5_2:
36548 // jp .LBB5_4
36549 // movaps %xmm1, %xmm0
36550 // .LBB5_4:
36551 // retq
36552 //
36553 // because this custom-inserter would have generated:
36554 //
36555 // A
36556 // | \
36557 // | B
36558 // | /
36559 // C
36560 // | \
36561 // | D
36562 // | /
36563 // E
36564 //
36565 // A: X = ...; Y = ...
36566 // B: empty
36567 // C: Z = PHI [X, A], [Y, B]
36568 // D: empty
36569 // E: PHI [X, C], [Z, D]
36570 //
36571 // If we lower both CMOVs in a single step, we can instead generate:
36572 //
36573 // A
36574 // | \
36575 // | C
36576 // | /|
36577 // |/ |
36578 // | |
36579 // | D
36580 // | /
36581 // E
36582 //
36583 // A: X = ...; Y = ...
36584 // D: empty
36585 // E: PHI [X, A], [X, C], [Y, D]
36586 //
36587 // Which, in our sitofp/fcmp example, gives us something like:
36588 //
36589 // ucomiss %xmm1, %xmm0
36590 // movss <1.0f>, %xmm0
36591 // jne .LBB5_4
36592 // jp .LBB5_4
36593 // xorps %xmm0, %xmm0
36594 // .LBB5_4:
36595 // retq
36596 //
36597
36598 // We lower cascaded CMOV into two successive branches to the same block.
36599 // EFLAGS is used by both, so mark it as live in the second.
36600 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
36601 MachineFunction *F = ThisMBB->getParent();
36602 MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
36603 MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
36604 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
36605
36606 MachineFunction::iterator It = ++ThisMBB->getIterator();
36607 F->insert(It, FirstInsertedMBB);
36608 F->insert(It, SecondInsertedMBB);
36609 F->insert(It, SinkMBB);
36610
36611 // For a cascaded CMOV, we lower it to two successive branches to
36612 // the same block (SinkMBB). EFLAGS is used by both, so mark it as live in
36613 // the FirstInsertedMBB.
36614 FirstInsertedMBB->addLiveIn(X86::EFLAGS);
36615
36616 // If the EFLAGS register isn't dead in the terminator, then claim that it's
36617 // live into the sink and copy blocks.
36618 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
36619 if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS) &&
36620 !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
36621 SecondInsertedMBB->addLiveIn(X86::EFLAGS);
36622 SinkMBB->addLiveIn(X86::EFLAGS);
36623 }
36624
36625 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
36626 SinkMBB->splice(SinkMBB->begin(), ThisMBB,
36627 std::next(MachineBasicBlock::iterator(FirstCMOV)),
36628 ThisMBB->end());
36629 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
36630
36631 // Fallthrough block for ThisMBB.
36632 ThisMBB->addSuccessor(FirstInsertedMBB);
36633 // The true block target of the first branch is always SinkMBB.
36634 ThisMBB->addSuccessor(SinkMBB);
36635 // Fallthrough block for FirstInsertedMBB.
36636 FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
36637 // The true block for the branch of FirstInsertedMBB.
36638 FirstInsertedMBB->addSuccessor(SinkMBB);
36639 // This is fallthrough.
36640 SecondInsertedMBB->addSuccessor(SinkMBB);
36641
36642 // Create the conditional branch instructions.
36643 X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
36644 BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(FirstCC);
36645
36646 X86::CondCode SecondCC =
36647 X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
36648 BuildMI(FirstInsertedMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(SecondCC);
36649
36650 // SinkMBB:
36651 // %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
36652 Register DestReg = SecondCascadedCMOV.getOperand(0).getReg();
36653 Register Op1Reg = FirstCMOV.getOperand(1).getReg();
36654 Register Op2Reg = FirstCMOV.getOperand(2).getReg();
36655 MachineInstrBuilder MIB =
36656 BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(X86::PHI), DestReg)
36657 .addReg(Op1Reg)
36658 .addMBB(SecondInsertedMBB)
36659 .addReg(Op2Reg)
36660 .addMBB(ThisMBB);
36661
36662 // SecondInsertedMBB provides the same incoming value as FirstInsertedMBB
36663 // (the True operand of the SELECT_CC/CMOV nodes).
36664 MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
36665
36666 // Now remove the CMOVs.
36667 FirstCMOV.eraseFromParent();
36668 SecondCascadedCMOV.eraseFromParent();
36669
36670 return SinkMBB;
36671}
36672
36673MachineBasicBlock *
36674X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
36675 MachineBasicBlock *ThisMBB) const {
36676 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36677 const DebugLoc &DL = MI.getDebugLoc();
36678
36679 // To "insert" a SELECT_CC instruction, we actually have to insert the
36680 // diamond control-flow pattern. The incoming instruction knows the
36681 // destination vreg to set, the condition code register to branch on, the
36682 // true/false values to select between and a branch opcode to use.
36683
36684 // ThisMBB:
36685 // ...
36686 // TrueVal = ...
36687 // cmpTY ccX, r1, r2
36688 // bCC copy1MBB
36689 // fallthrough --> FalseMBB
36690
36691 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
36692 // as described above, by inserting a BB, and then making a PHI at the join
36693 // point to select the true and false operands of the CMOV in the PHI.
36694 //
36695 // The code also handles two different cases of multiple CMOV opcodes
36696 // in a row.
36697 //
36698 // Case 1:
36699 // There are multiple CMOVs in a row, all of which are based on the same
36700 // condition setting (or the exact opposite condition setting). In this case
36701 // we can lower all the CMOVs using a single inserted BB, and then make a
36702 // number of PHIs at the join point to model the CMOVs. The only trickiness
36703 // here is that in a case like:
36704 //
36705 // t2 = CMOV cond1 t1, f1
36706 // t3 = CMOV cond1 t2, f2
36707 //
36708 // when rewriting this into PHIs, we have to perform some renaming on the
36709 // temps since you cannot have a PHI operand refer to a PHI result earlier
36710 // in the same block. The "simple" but wrong lowering would be:
36711 //
36712 // t2 = PHI t1(BB1), f1(BB2)
36713 // t3 = PHI t2(BB1), f2(BB2)
36714 //
36715 // but clearly t2 is not defined in BB1, so that is incorrect. The proper
36716 // renaming is to note that on the path through BB1, t2 is really just a
36717 // copy of t1, and do that renaming, properly generating:
36718 //
36719 // t2 = PHI t1(BB1), f1(BB2)
36720 // t3 = PHI t1(BB1), f2(BB2)
36721 //
36722 // Case 2:
36723 // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
36724 // function - EmitLoweredCascadedSelect.
36725
36726 X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
36727 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
36728 MachineInstr *LastCMOV = &MI;
36729 MachineBasicBlock::iterator NextMIIt = MachineBasicBlock::iterator(MI);
36730
36731 // Check for case 1, where there are multiple CMOVs with the same condition
36732 // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
36733 // number of jumps the most.
36734
36735 if (isCMOVPseudo(MI)) {
36736 // See if we have a string of CMOVS with the same condition. Skip over
36737 // intervening debug insts.
36738 while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
36739 (NextMIIt->getOperand(3).getImm() == CC ||
36740 NextMIIt->getOperand(3).getImm() == OppCC)) {
36741 LastCMOV = &*NextMIIt;
36742 NextMIIt = next_nodbg(NextMIIt, ThisMBB->end());
36743 }
36744 }
36745
36746 // Now check for case 2, but only if we didn't already find case 1, as
36747 // indicated by LastCMOV == &MI.
36748 if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
36749 NextMIIt->getOpcode() == MI.getOpcode() &&
36750 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
36751 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
36752 NextMIIt->getOperand(1).isKill()) {
36753 return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
36754 }
36755
36756 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
36757 MachineFunction *F = ThisMBB->getParent();
36758 MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
36759 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
36760
36761 MachineFunction::iterator It = ++ThisMBB->getIterator();
36762 F->insert(It, FalseMBB);
36763 F->insert(It, SinkMBB);
36764
36765 // If the EFLAGS register isn't dead in the terminator, then claim that it's
36766 // live into the sink and copy blocks.
36767 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
36768 if (!LastCMOV->killsRegister(X86::EFLAGS) &&
36769 !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
36770 FalseMBB->addLiveIn(X86::EFLAGS);
36771 SinkMBB->addLiveIn(X86::EFLAGS);
36772 }
36773
36774 // Transfer any debug instructions inside the CMOV sequence to the sunk block.
36775 auto DbgRange = llvm::make_range(MachineBasicBlock::iterator(MI),
36776 MachineBasicBlock::iterator(LastCMOV));
36777 for (MachineInstr &MI : llvm::make_early_inc_range(DbgRange))
36778 if (MI.isDebugInstr())
36779 SinkMBB->push_back(MI.removeFromParent());
36780
36781 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
36782 SinkMBB->splice(SinkMBB->end(), ThisMBB,
36783 std::next(MachineBasicBlock::iterator(LastCMOV)),
36784 ThisMBB->end());
36785 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
36786
36787 // Fallthrough block for ThisMBB.
36788 ThisMBB->addSuccessor(FalseMBB);
36789 // The true block target of the first (or only) branch is always SinkMBB.
36790 ThisMBB->addSuccessor(SinkMBB);
36791 // Fallthrough block for FalseMBB.
36792 FalseMBB->addSuccessor(SinkMBB);
36793
36794 // Create the conditional branch instruction.
36795 BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);
36796
36797 // SinkMBB:
36798 // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
36799 // ...
36800 MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
36801 MachineBasicBlock::iterator MIItEnd =
36802 std::next(MachineBasicBlock::iterator(LastCMOV));
36803 createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);
36804
36805 // Now remove the CMOV(s).
36806 ThisMBB->erase(MIItBegin, MIItEnd);
36807
36808 return SinkMBB;
36809}
36810
36811static unsigned getSUBriOpcode(bool IsLP64, int64_t Imm) {
36812 if (IsLP64) {
36813 if (isInt<8>(Imm))
36814 return X86::SUB64ri8;
36815 return X86::SUB64ri32;
36816 } else {
36817 if (isInt<8>(Imm))
36818 return X86::SUB32ri8;
36819 return X86::SUB32ri;
36820 }
36821}
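
A quick sanity sketch of the immediate-width choice above (plain C++; the check mirrors what llvm::isInt<8> answers, and the opcodes in the comments are the ones returned by the helper):

static bool fitsInSignedInt8(long long Imm) {
  return Imm >= -128 && Imm <= 127;   // the isInt<8>(Imm) condition
}
// fitsInSignedInt8(16)   -> true   => SUB64ri8 / SUB32ri8
// fitsInSignedInt8(4096) -> false  => SUB64ri32 / SUB32ri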
36822
36823MachineBasicBlock *
36824X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI,
36825 MachineBasicBlock *MBB) const {
36826 MachineFunction *MF = MBB->getParent();
36827 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36828 const X86FrameLowering &TFI = *Subtarget.getFrameLowering();
36829 const DebugLoc &DL = MI.getDebugLoc();
36830 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
36831
36832 const unsigned ProbeSize = getStackProbeSize(*MF);
36833
36834 MachineRegisterInfo &MRI = MF->getRegInfo();
36835 MachineBasicBlock *testMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36836 MachineBasicBlock *tailMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36837 MachineBasicBlock *blockMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36838
36839 MachineFunction::iterator MBBIter = ++MBB->getIterator();
36840 MF->insert(MBBIter, testMBB);
36841 MF->insert(MBBIter, blockMBB);
36842 MF->insert(MBBIter, tailMBB);
36843
36844 Register sizeVReg = MI.getOperand(1).getReg();
36845
36846 Register physSPReg = TFI.Uses64BitFramePtr ? X86::RSP : X86::ESP;
36847
36848 Register TmpStackPtr = MRI.createVirtualRegister(
36849 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
36850 Register FinalStackPtr = MRI.createVirtualRegister(
36851 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
36852
36853 BuildMI(*MBB, {MI}, DL, TII->get(TargetOpcode::COPY), TmpStackPtr)
36854 .addReg(physSPReg);
36855 {
36856 const unsigned Opc = TFI.Uses64BitFramePtr ? X86::SUB64rr : X86::SUB32rr;
36857 BuildMI(*MBB, {MI}, DL, TII->get(Opc), FinalStackPtr)
36858 .addReg(TmpStackPtr)
36859 .addReg(sizeVReg);
36860 }
36861
36862 // test rsp size
36863
36864 BuildMI(testMBB, DL,
36865 TII->get(TFI.Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
36866 .addReg(FinalStackPtr)
36867 .addReg(physSPReg);
36868
36869 BuildMI(testMBB, DL, TII->get(X86::JCC_1))
36870 .addMBB(tailMBB)
36871 .addImm(X86::COND_GE);
36872 testMBB->addSuccessor(blockMBB);
36873 testMBB->addSuccessor(tailMBB);
36874
36875 // Touch the block, then extend it. This is the opposite order from a static
36876 // probe, where we allocate and then touch; it avoids having to probe the
36877 // tail of the static alloca. Possible scenarios are:
36878 //
36879 // + ---- <- ------------ <- ------------- <- ------------ +
36880 // | |
36881 // [free probe] -> [page alloc] -> [alloc probe] -> [tail alloc] + -> [dyn probe] -> [page alloc] -> [dyn probe] -> [tail alloc] +
36882 // | |
36883 // + <- ----------- <- ------------ <- ----------- <- ------------ +
36884 //
36885 // The property we want to enforce is to never have more than [page alloc] between two probes.
36886
36887 const unsigned XORMIOpc =
36888 TFI.Uses64BitFramePtr ? X86::XOR64mi8 : X86::XOR32mi8;
36889 addRegOffset(BuildMI(blockMBB, DL, TII->get(XORMIOpc)), physSPReg, false, 0)
36890 .addImm(0);
36891
36892 BuildMI(blockMBB, DL,
36893 TII->get(getSUBriOpcode(TFI.Uses64BitFramePtr, ProbeSize)), physSPReg)
36894 .addReg(physSPReg)
36895 .addImm(ProbeSize);
36896
36897
36898 BuildMI(blockMBB, DL, TII->get(X86::JMP_1)).addMBB(testMBB);
36899 blockMBB->addSuccessor(testMBB);
36900
36901 // Replace original instruction by the expected stack ptr
36902 BuildMI(tailMBB, DL, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
36903 .addReg(FinalStackPtr);
36904
36905 tailMBB->splice(tailMBB->end(), MBB,
36906 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
36907 tailMBB->transferSuccessorsAndUpdatePHIs(MBB);
36908 MBB->addSuccessor(testMBB);
36909
36910 // Delete the original pseudo instruction.
36911 MI.eraseFromParent();
36912
36913 // And we're done.
36914 return tailMBB;
36915}
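
A C++-level model of the loop assembled above (a sketch, assuming the allocation size does not exceed the current stack pointer value; it only counts the page touches instead of performing them):

static unsigned countDynAllocaProbes(long long SP, long long Size,
                                     long long ProbeSize) {
  long long FinalSP = SP - Size;    // FinalStackPtr = SP - allocation size
  unsigned Probes = 0;
  while (SP > FinalSP) {            // testMBB: stop once SP reaches the target
    ++Probes;                       // blockMBB: xor [SP], 0 touches a page
    SP -= ProbeSize;                //           then extend by ProbeSize
  }
  return Probes;                    // tailMBB copies FinalStackPtr to the result
}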
36916
36917MachineBasicBlock *
36918X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
36919 MachineBasicBlock *BB) const {
36920 MachineFunction *MF = BB->getParent();
36921 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36922 const DebugLoc &DL = MI.getDebugLoc();
36923 const BasicBlock *LLVM_BB = BB->getBasicBlock();
36924
36925 assert(MF->shouldSplitStack());
36926
36927 const bool Is64Bit = Subtarget.is64Bit();
36928 const bool IsLP64 = Subtarget.isTarget64BitLP64();
36929
36930 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
36931 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
36932
36933 // BB:
36934 // ... [Till the alloca]
36935 // If stacklet is not large enough, jump to mallocMBB
36936 //
36937 // bumpMBB:
36938 // Allocate by subtracting from RSP
36939 // Jump to continueMBB
36940 //
36941 // mallocMBB:
36942 // Allocate by call to runtime
36943 //
36944 // continueMBB:
36945 // ...
36946 // [rest of original BB]
36947 //
36948
36949 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36950 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36951 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36952
36953 MachineRegisterInfo &MRI = MF->getRegInfo();
36954 const TargetRegisterClass *AddrRegClass =
36955 getRegClassFor(getPointerTy(MF->getDataLayout()));
36956
36957 Register mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
36958 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
36959 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
36960 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
36961 sizeVReg = MI.getOperand(1).getReg(),
36962 physSPReg =
36963 IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
36964
36965 MachineFunction::iterator MBBIter = ++BB->getIterator();
36966
36967 MF->insert(MBBIter, bumpMBB);
36968 MF->insert(MBBIter, mallocMBB);
36969 MF->insert(MBBIter, continueMBB);
36970
36971 continueMBB->splice(continueMBB->begin(), BB,
36972 std::next(MachineBasicBlock::iterator(MI)), BB->end());
36973 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
36974
36975 // Add code to the main basic block to check if the stack limit has been hit,
36976 // and if so, jump to mallocMBB otherwise to bumpMBB.
36977 BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
36978 BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
36979 .addReg(tmpSPVReg).addReg(sizeVReg);
36980 BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
36981 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
36982 .addReg(SPLimitVReg);
36983 BuildMI(BB, DL, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G);
36984
36985 // bumpMBB simply decreases the stack pointer, since we know the current
36986 // stacklet has enough space.
36987 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
36988 .addReg(SPLimitVReg);
36989 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
36990 .addReg(SPLimitVReg);
36991 BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
36992
36993 // Calls into a routine in libgcc to allocate more space from the heap.
36994 const uint32_t *RegMask =
36995 Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
36996 if (IsLP64) {
36997 BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
36998 .addReg(sizeVReg);
36999 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
37000 .addExternalSymbol("__morestack_allocate_stack_space")
37001 .addRegMask(RegMask)
37002 .addReg(X86::RDI, RegState::Implicit)
37003 .addReg(X86::RAX, RegState::ImplicitDefine);
37004 } else if (Is64Bit) {
37005 BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
37006 .addReg(sizeVReg);
37007 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
37008 .addExternalSymbol("__morestack_allocate_stack_space")
37009 .addRegMask(RegMask)
37010 .addReg(X86::EDI, RegState::Implicit)
37011 .addReg(X86::EAX, RegState::ImplicitDefine);
37012 } else {
37013 BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
37014 .addImm(12);
37015 BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
37016 BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
37017 .addExternalSymbol("__morestack_allocate_stack_space")
37018 .addRegMask(RegMask)
37019 .addReg(X86::EAX, RegState::ImplicitDefine);
37020 }
37021
37022 if (!Is64Bit)
37023 BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
37024 .addImm(16);
37025
37026 BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
37027 .addReg(IsLP64 ? X86::RAX : X86::EAX);
37028 BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
37029
37030 // Set up the CFG correctly.
37031 BB->addSuccessor(bumpMBB);
37032 BB->addSuccessor(mallocMBB);
37033 mallocMBB->addSuccessor(continueMBB);
37034 bumpMBB->addSuccessor(continueMBB);
37035
37036 // Take care of the PHI nodes.
37037 BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
37038 MI.getOperand(0).getReg())
37039 .addReg(mallocPtrVReg)
37040 .addMBB(mallocMBB)
37041 .addReg(bumpSPPtrVReg)
37042 .addMBB(bumpMBB);
37043
37044 // Delete the original pseudo instruction.
37045 MI.eraseFromParent();
37046
37047 // And we're done.
37048 return continueMBB;
37049}
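
A rough C++ model of the decision implemented above (a sketch; morestack_alloc is a hypothetical stand-in for the __morestack_allocate_stack_space runtime call, and the TLS stack-limit load is passed in as a plain argument):

#include <cstddef>

extern "C" void *morestack_alloc(std::size_t Size);  // hypothetical stand-in

static void *segAllocaSketch(char *&SP, char *StackLimit, std::size_t Size) {
  char *NewSP = SP - Size;
  if (StackLimit <= NewSP) {       // current stacklet has room: "bumpMBB" path
    SP = NewSP;                    // just move the stack pointer
    return NewSP;
  }
  return morestack_alloc(Size);    // otherwise grow via the runtime: "mallocMBB"
}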
37050
37051MachineBasicBlock *
37052X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
37053 MachineBasicBlock *BB) const {
37054 MachineFunction *MF = BB->getParent();
37055 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
37056 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
37057 const DebugLoc &DL = MI.getDebugLoc();
37058
37059 assert(!isAsynchronousEHPersonality(
37060            classifyEHPersonality(MF->getFunction().getPersonalityFn())) &&
37061        "SEH does not use catchret!");
37062
37063 // Only 32-bit EH needs to worry about manually restoring stack pointers.
37064 if (!Subtarget.is32Bit())
37065 return BB;
37066
37067 // C++ EH creates a new target block to hold the restore code, and wires up
37068 // the new block to the return destination with a normal JMP_4.
37069 MachineBasicBlock *RestoreMBB =
37070 MF->CreateMachineBasicBlock(BB->getBasicBlock());
37071 assert(BB->succ_size() == 1);
37072 MF->insert(std::next(BB->getIterator()), RestoreMBB);
37073 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
37074 BB->addSuccessor(RestoreMBB);
37075 MI.getOperand(0).setMBB(RestoreMBB);
37076
37077 // Marking this as an EH pad but not a funclet entry block causes PEI to
37078 // restore stack pointers in the block.
37079 RestoreMBB->setIsEHPad(true);
37080
37081 auto RestoreMBBI = RestoreMBB->begin();
37082 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
37083 return BB;
37084}
37085
37086MachineBasicBlock *
37087X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
37088 MachineBasicBlock *BB) const {
37089 // So, here we replace TLSADDR with the sequence:
37090 //   adjust_stackdown -> TLSADDR -> adjust_stackup.
37091 // We need this because TLSADDR is lowered into calls
37092 // inside MC; therefore, without the two markers, shrink-wrapping
37093 // may push the prologue/epilogue past them.
37094 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
37095 const DebugLoc &DL = MI.getDebugLoc();
37096 MachineFunction &MF = *BB->getParent();
37097
37098 // Emit CALLSEQ_START right before the instruction.
37099 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
37100 MachineInstrBuilder CallseqStart =
37101 BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
37102 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
37103
37104 // Emit CALLSEQ_END right after the instruction.
37105 // We don't call erase from parent because we want to keep the
37106 // original instruction around.
37107 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
37108 MachineInstrBuilder CallseqEnd =
37109 BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0);
37110 BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
37111
37112 return BB;
37113}
37114
37115MachineBasicBlock *
37116X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
37117 MachineBasicBlock *BB) const {
37118 // This is pretty easy. We're taking the value that we received from
37119 // our load from the relocation, sticking it in either RDI (x86-64)
37120 // or EAX and doing an indirect call. The return value will then
37121 // be in the normal return register.
37122 MachineFunction *F = BB->getParent();
37123 const X86InstrInfo *TII = Subtarget.getInstrInfo();
37124 const DebugLoc &DL = MI.getDebugLoc();
37125
37126 assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
37127 assert(MI.getOperand(3).isGlobal() && "This should be a global");
37128
37129 // Get a register mask for the lowered call.
37130 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
37131 // proper register mask.
37132 const uint32_t *RegMask =
37133 Subtarget.is64Bit() ?
37134 Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
37135 Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
37136 if (Subtarget.is64Bit()) {
37137 MachineInstrBuilder MIB =
37138 BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI)
37139 .addReg(X86::RIP)
37140 .addImm(0)
37141 .addReg(0)
37142 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
37143 MI.getOperand(3).getTargetFlags())
37144 .addReg(0);
37145 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
37146 addDirectMem(MIB, X86::RDI);
37147 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
37148 } else if (!isPositionIndependent()) {
37149 MachineInstrBuilder MIB =
37150 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
37151 .addReg(0)
37152 .addImm(0)
37153 .addReg(0)
37154 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
37155 MI.getOperand(3).getTargetFlags())
37156 .addReg(0);
37157 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
37158 addDirectMem(MIB, X86::EAX);
37159 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
37160 } else {
37161 MachineInstrBuilder MIB =
37162 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
37163 .addReg(TII->getGlobalBaseReg(F))
37164 .addImm(0)
37165 .addReg(0)
37166 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
37167 MI.getOperand(3).getTargetFlags())
37168 .addReg(0);
37169 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
37170 addDirectMem(MIB, X86::EAX);
37171 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
37172 }
37173
37174 MI.eraseFromParent(); // The pseudo instruction is gone now.
37175 return BB;
37176}
37177
37178static unsigned getOpcodeForIndirectThunk(unsigned RPOpc) {
37179 switch (RPOpc) {
37180 case X86::INDIRECT_THUNK_CALL32:
37181 return X86::CALLpcrel32;
37182 case X86::INDIRECT_THUNK_CALL64:
37183 return X86::CALL64pcrel32;
37184 case X86::INDIRECT_THUNK_TCRETURN32:
37185 return X86::TCRETURNdi;
37186 case X86::INDIRECT_THUNK_TCRETURN64:
37187 return X86::TCRETURNdi64;
37188 }
37189 llvm_unreachable("not indirect thunk opcode")::llvm::llvm_unreachable_internal("not indirect thunk opcode"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 37189)
;
37190}
37191
37192static const char *getIndirectThunkSymbol(const X86Subtarget &Subtarget,
37193 unsigned Reg) {
37194 if (Subtarget.useRetpolineExternalThunk()) {
37195 // When using an external thunk for retpolines, we pick names that match the
37196 // names GCC happens to use as well. This helps simplify the implementation
37197 // of the thunks for kernels where they have no easy ability to create
37198 // aliases and are doing non-trivial configuration of the thunk's body. For
37199 // example, the Linux kernel will do boot-time hot patching of the thunk
37200 // bodies and cannot easily export aliases of these to loaded modules.
37201 //
37202 // Note that at any point in the future, we may need to change the semantics
37203 // of how we implement retpolines and at that time will likely change the
37204 // name of the called thunk. Essentially, there is no hard guarantee that
37205 // LLVM will generate calls to specific thunks, we merely make a best-effort
37206 // attempt to help out kernels and other systems where duplicating the
37207 // thunks is costly.
37208 switch (Reg) {
37209 case X86::EAX:
37210 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
37211 return "__x86_indirect_thunk_eax";
37212 case X86::ECX:
37213 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
37214 return "__x86_indirect_thunk_ecx";
37215 case X86::EDX:
37216 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
37217 return "__x86_indirect_thunk_edx";
37218 case X86::EDI:
37219 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
37220 return "__x86_indirect_thunk_edi";
37221 case X86::R11:
37222 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
37223 return "__x86_indirect_thunk_r11";
37224 }
37225 llvm_unreachable("unexpected reg for external indirect thunk")::llvm::llvm_unreachable_internal("unexpected reg for external indirect thunk"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 37225)
;
37226 }
37227
37228 if (Subtarget.useRetpolineIndirectCalls() ||
37229 Subtarget.useRetpolineIndirectBranches()) {
37230 // When targeting an internal COMDAT thunk use an LLVM-specific name.
37231 switch (Reg) {
37232 case X86::EAX:
37233 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
37234 return "__llvm_retpoline_eax";
37235 case X86::ECX:
37236 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
37237 return "__llvm_retpoline_ecx";
37238 case X86::EDX:
37239 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
37240 return "__llvm_retpoline_edx";
37241 case X86::EDI:
37242 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
37243 return "__llvm_retpoline_edi";
37244 case X86::R11:
37245 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
37246 return "__llvm_retpoline_r11";
37247 }
37248 llvm_unreachable("unexpected reg for retpoline")::llvm::llvm_unreachable_internal("unexpected reg for retpoline"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 37248)
;
37249 }
37250
37251 if (Subtarget.useLVIControlFlowIntegrity()) {
37252 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
37253 return "__llvm_lvi_thunk_r11";
37254 }
37255 llvm_unreachable("getIndirectThunkSymbol() invoked without thunk feature")::llvm::llvm_unreachable_internal("getIndirectThunkSymbol() invoked without thunk feature"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 37255)
;
37256}
37257
37258MachineBasicBlock *
37259X86TargetLowering::EmitLoweredIndirectThunk(MachineInstr &MI,
37260 MachineBasicBlock *BB) const {
37261 // Copy the virtual register into the R11 physical register and
37262 // call the retpoline thunk.
37263 const DebugLoc &DL = MI.getDebugLoc();
37264 const X86InstrInfo *TII = Subtarget.getInstrInfo();
37265 Register CalleeVReg = MI.getOperand(0).getReg();
37266 unsigned Opc = getOpcodeForIndirectThunk(MI.getOpcode());
37267
37268 // Find an available scratch register to hold the callee. On 64-bit, we can
37269 // just use R11, but we scan for uses anyway to ensure we don't generate
37270 // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
37271 // already a register use operand to the call to hold the callee. If none
37272 // are available, use EDI instead. EDI is chosen because EBX is the PIC base
37273 // register and ESI is the base pointer to realigned stack frames with VLAs.
37274 SmallVector<unsigned, 3> AvailableRegs;
37275 if (Subtarget.is64Bit())
37276 AvailableRegs.push_back(X86::R11);
37277 else
37278 AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});
37279
37280 // Zero out any registers that are already used.
37281 for (const auto &MO : MI.operands()) {
37282 if (MO.isReg() && MO.isUse())
37283 for (unsigned &Reg : AvailableRegs)
37284 if (Reg == MO.getReg())
37285 Reg = 0;
37286 }
37287
37288 // Choose the first remaining non-zero available register.
37289 unsigned AvailableReg = 0;
37290 for (unsigned MaybeReg : AvailableRegs) {
37291 if (MaybeReg) {
37292 AvailableReg = MaybeReg;
37293 break;
37294 }
37295 }
37296 if (!AvailableReg)
37297 report_fatal_error("calling convention incompatible with retpoline, no "
37298 "available registers");
37299
37300 const char *Symbol = getIndirectThunkSymbol(Subtarget, AvailableReg);
37301
37302 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg)
37303 .addReg(CalleeVReg);
37304 MI.getOperand(0).ChangeToES(Symbol);
37305 MI.setDesc(TII->get(Opc));
37306 MachineInstrBuilder(*BB->getParent(), &MI)
37307 .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
37308 return BB;
37309}
37310
37311/// SetJmp implies future control flow change upon calling the corresponding
37312/// LongJmp.
37313/// Instead of using the 'return' instruction, the long jump fixes the stack and
37314/// performs an indirect branch. To do so it uses the registers that were stored
37315/// in the jump buffer (when calling SetJmp).
37316/// In case the shadow stack is enabled we need to fix it as well, because some
37317/// return addresses will be skipped.
37318/// The function will save the SSP for future fixing in the function
37319/// emitLongJmpShadowStackFix.
37320/// \sa emitLongJmpShadowStackFix
37321/// \param [in] MI The temporary Machine Instruction for the builtin.
37322/// \param [in] MBB The Machine Basic Block that will be modified.
37323void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
37324 MachineBasicBlock *MBB) const {
37325 const DebugLoc &DL = MI.getDebugLoc();
37326 MachineFunction *MF = MBB->getParent();
37327 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37328 MachineRegisterInfo &MRI = MF->getRegInfo();
37329 MachineInstrBuilder MIB;
37330
37331 // Memory Reference.
37332 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
37333 MI.memoperands_end());
37334
37335 // Initialize a register with zero.
37336 MVT PVT = getPointerTy(MF->getDataLayout());
37337 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
37338 Register ZReg = MRI.createVirtualRegister(PtrRC);
37339 unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
37340 BuildMI(*MBB, MI, DL, TII->get(XorRROpc))
37341 .addDef(ZReg)
37342 .addReg(ZReg, RegState::Undef)
37343 .addReg(ZReg, RegState::Undef);
37344
37345 // Read the current SSP Register value to the zeroed register.
37346 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
37347 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
37348 BuildMI(*MBB, MI, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
37349
37350 // Write the SSP register value to offset 3 in input memory buffer.
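// (The jump buffer layout used by this lowering is: slot 0 = frame pointer,
// slot 1 = resume IP, slot 2 = stack pointer, slot 3 = shadow stack pointer,
// each slot PVT.getStoreSize() bytes wide; see emitEHSjLjLongJmp below.)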
37351 unsigned PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
37352 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrStoreOpc));
37353 const int64_t SSPOffset = 3 * PVT.getStoreSize();
37354 const unsigned MemOpndSlot = 1;
37355 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37356 if (i == X86::AddrDisp)
37357 MIB.addDisp(MI.getOperand(MemOpndSlot + i), SSPOffset);
37358 else
37359 MIB.add(MI.getOperand(MemOpndSlot + i));
37360 }
37361 MIB.addReg(SSPCopyReg);
37362 MIB.setMemRefs(MMOs);
37363}
37364
37365MachineBasicBlock *
37366X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
37367 MachineBasicBlock *MBB) const {
37368 const DebugLoc &DL = MI.getDebugLoc();
37369 MachineFunction *MF = MBB->getParent();
37370 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37371 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
37372 MachineRegisterInfo &MRI = MF->getRegInfo();
37373
37374 const BasicBlock *BB = MBB->getBasicBlock();
37375 MachineFunction::iterator I = ++MBB->getIterator();
37376
37377 // Memory Reference
37378 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
37379 MI.memoperands_end());
37380
37381 unsigned DstReg;
37382 unsigned MemOpndSlot = 0;
37383
37384 unsigned CurOp = 0;
37385
37386 DstReg = MI.getOperand(CurOp++).getReg();
37387 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
37388 assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
37389 (void)TRI;
37390 Register mainDstReg = MRI.createVirtualRegister(RC);
37391 Register restoreDstReg = MRI.createVirtualRegister(RC);
37392
37393 MemOpndSlot = CurOp;
37394
37395 MVT PVT = getPointerTy(MF->getDataLayout());
37396 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
37397        "Invalid Pointer Size!");
37398
37399 // For v = setjmp(buf), we generate
37400 //
37401 // thisMBB:
37402 // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
37403 // SjLjSetup restoreMBB
37404 //
37405 // mainMBB:
37406 // v_main = 0
37407 //
37408 // sinkMBB:
37409 // v = phi(main, restore)
37410 //
37411 // restoreMBB:
37412 // if base pointer being used, load it from frame
37413 // v_restore = 1
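// The PHI in sinkMBB therefore yields 0 when setjmp returns directly and 1
// when control re-enters through restoreMBB via the matching longjmp.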
37414
37415 MachineBasicBlock *thisMBB = MBB;
37416 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
37417 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
37418 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
37419 MF->insert(I, mainMBB);
37420 MF->insert(I, sinkMBB);
37421 MF->push_back(restoreMBB);
37422 restoreMBB->setMachineBlockAddressTaken();
37423
37424 MachineInstrBuilder MIB;
37425
37426 // Transfer the remainder of BB and its successor edges to sinkMBB.
37427 sinkMBB->splice(sinkMBB->begin(), MBB,
37428 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
37429 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
37430
37431 // thisMBB:
37432 unsigned PtrStoreOpc = 0;
37433 unsigned LabelReg = 0;
37434 const int64_t LabelOffset = 1 * PVT.getStoreSize();
37435 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
37436 !isPositionIndependent();
37437
37438 // Prepare IP either in reg or imm.
37439 if (!UseImmLabel) {
37440 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
37441 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
37442 LabelReg = MRI.createVirtualRegister(PtrRC);
37443 if (Subtarget.is64Bit()) {
37444 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
37445 .addReg(X86::RIP)
37446 .addImm(0)
37447 .addReg(0)
37448 .addMBB(restoreMBB)
37449 .addReg(0);
37450 } else {
37451 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
37452 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
37453 .addReg(XII->getGlobalBaseReg(MF))
37454 .addImm(0)
37455 .addReg(0)
37456 .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
37457 .addReg(0);
37458 }
37459 } else
37460 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
37461 // Store IP
37462 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
37463 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37464 if (i == X86::AddrDisp)
37465 MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
37466 else
37467 MIB.add(MI.getOperand(MemOpndSlot + i));
37468 }
37469 if (!UseImmLabel)
37470 MIB.addReg(LabelReg);
37471 else
37472 MIB.addMBB(restoreMBB);
37473 MIB.setMemRefs(MMOs);
37474
37475 if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
37476 emitSetJmpShadowStackFix(MI, thisMBB);
37477 }
37478
37479 // Setup
37480 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
37481 .addMBB(restoreMBB);
37482
37483 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
37484 MIB.addRegMask(RegInfo->getNoPreservedMask());
37485 thisMBB->addSuccessor(mainMBB);
37486 thisMBB->addSuccessor(restoreMBB);
37487
37488 // mainMBB:
37489 // EAX = 0
37490 BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
37491 mainMBB->addSuccessor(sinkMBB);
37492
37493 // sinkMBB:
37494 BuildMI(*sinkMBB, sinkMBB->begin(), DL,
37495 TII->get(X86::PHI), DstReg)
37496 .addReg(mainDstReg).addMBB(mainMBB)
37497 .addReg(restoreDstReg).addMBB(restoreMBB);
37498
37499 // restoreMBB:
37500 if (RegInfo->hasBasePointer(*MF)) {
37501 const bool Uses64BitFramePtr =
37502 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
37503 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
37504 X86FI->setRestoreBasePointer(MF);
37505 Register FramePtr = RegInfo->getFrameRegister(*MF);
37506 Register BasePtr = RegInfo->getBaseRegister();
37507 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
37508 addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
37509 FramePtr, true, X86FI->getRestoreBasePointerOffset())
37510 .setMIFlag(MachineInstr::FrameSetup);
37511 }
37512 BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
37513 BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
37514 restoreMBB->addSuccessor(sinkMBB);
37515
37516 MI.eraseFromParent();
37517 return sinkMBB;
37518}
37519
37520/// Fix the shadow stack using the previously saved SSP pointer.
37521/// \sa emitSetJmpShadowStackFix
37522/// \param [in] MI The temporary Machine Instruction for the builtin.
37523/// \param [in] MBB The Machine Basic Block that will be modified.
37524/// \return The sink MBB that will perform the future indirect branch.
37525MachineBasicBlock *
37526X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
37527 MachineBasicBlock *MBB) const {
37528 const DebugLoc &DL = MI.getDebugLoc();
37529 MachineFunction *MF = MBB->getParent();
37530 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37531 MachineRegisterInfo &MRI = MF->getRegInfo();
37532
37533 // Memory Reference
37534 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
37535 MI.memoperands_end());
37536
37537 MVT PVT = getPointerTy(MF->getDataLayout());
37538 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
37539
37540 // checkSspMBB:
37541 // xor vreg1, vreg1
37542 // rdssp vreg1
37543 // test vreg1, vreg1
37544 // je sinkMBB # Jump if Shadow Stack is not supported
37545 // fallMBB:
37546 // mov buf+24/12(%rip), vreg2
37547 // sub vreg1, vreg2
37548 // jbe sinkMBB # No need to fix the Shadow Stack
37549 // fixShadowMBB:
37550 // shr 3/2, vreg2
37551 // incssp vreg2 # fix the SSP according to the lower 8 bits
37552 // shr 8, vreg2
37553 // je sinkMBB
37554 // fixShadowLoopPrepareMBB:
37555 // shl vreg2
37556 // mov 128, vreg3
37557 // fixShadowLoopMBB:
37558 // incssp vreg3
37559 // dec vreg2
37560 // jne fixShadowLoopMBB # Iterate until you finish fixing
37561 // # the Shadow Stack
37562 // sinkMBB:
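// Worked example (64-bit): if the saved SSP is 0x48010 bytes above the
// current SSP, vreg2 = 0x48010 >> 3 = 0x9002 slots. The first incssp consumes
// the low 8 bits (2 slots), 0x9002 >> 8 = 0x90 remains, and the single shl
// doubles it to 0x120 iterations of incssp 128, i.e. 0x120 * 128 = 0x9000
// slots, giving the required total of 0x9002 slots.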
37563
37564 MachineFunction::iterator I = ++MBB->getIterator();
37565 const BasicBlock *BB = MBB->getBasicBlock();
37566
37567 MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB);
37568 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
37569 MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB);
37570 MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB);
37571 MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB);
37572 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
37573 MF->insert(I, checkSspMBB);
37574 MF->insert(I, fallMBB);
37575 MF->insert(I, fixShadowMBB);
37576 MF->insert(I, fixShadowLoopPrepareMBB);
37577 MF->insert(I, fixShadowLoopMBB);
37578 MF->insert(I, sinkMBB);
37579
37580 // Transfer the remainder of BB and its successor edges to sinkMBB.
37581 sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI),
37582 MBB->end());
37583 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
37584
37585 MBB->addSuccessor(checkSspMBB);
37586
37587 // Initialize a register with zero.
37588 Register ZReg = MRI.createVirtualRegister(&X86::GR32RegClass);
37589 BuildMI(checkSspMBB, DL, TII->get(X86::MOV32r0), ZReg);
37590
37591 if (PVT == MVT::i64) {
37592 Register TmpZReg = MRI.createVirtualRegister(PtrRC);
37593 BuildMI(checkSspMBB, DL, TII->get(X86::SUBREG_TO_REG), TmpZReg)
37594 .addImm(0)
37595 .addReg(ZReg)
37596 .addImm(X86::sub_32bit);
37597 ZReg = TmpZReg;
37598 }
37599
37600 // Read the current SSP Register value to the zeroed register.
37601 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
37602 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
37603 BuildMI(checkSspMBB, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
37604
37605 // Check whether the result of the SSP register is zero and jump directly
37606 // to the sink.
37607 unsigned TestRROpc = (PVT == MVT::i64) ? X86::TEST64rr : X86::TEST32rr;
37608 BuildMI(checkSspMBB, DL, TII->get(TestRROpc))
37609 .addReg(SSPCopyReg)
37610 .addReg(SSPCopyReg);
37611 BuildMI(checkSspMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E);
37612 checkSspMBB->addSuccessor(sinkMBB);
37613 checkSspMBB->addSuccessor(fallMBB);
37614
37615 // Reload the previously saved SSP register value.
37616 Register PrevSSPReg = MRI.createVirtualRegister(PtrRC);
37617 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
37618 const int64_t SPPOffset = 3 * PVT.getStoreSize();
37619 MachineInstrBuilder MIB =
37620 BuildMI(fallMBB, DL, TII->get(PtrLoadOpc), PrevSSPReg);
37621 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37622 const MachineOperand &MO = MI.getOperand(i);
37623 if (i == X86::AddrDisp)
37624 MIB.addDisp(MO, SPPOffset);
37625 else if (MO.isReg()) // Don't add the whole operand, we don't want to
37626 // preserve kill flags.
37627 MIB.addReg(MO.getReg());
37628 else
37629 MIB.add(MO);
37630 }
37631 MIB.setMemRefs(MMOs);
37632
37633 // Subtract the current SSP from the previous SSP.
37634 Register SspSubReg = MRI.createVirtualRegister(PtrRC);
37635 unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr;
37636 BuildMI(fallMBB, DL, TII->get(SubRROpc), SspSubReg)
37637 .addReg(PrevSSPReg)
37638 .addReg(SSPCopyReg);
37639
37640 // Jump to sink in case PrevSSPReg <= SSPCopyReg.
37641 BuildMI(fallMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_BE);
37642 fallMBB->addSuccessor(sinkMBB);
37643 fallMBB->addSuccessor(fixShadowMBB);
37644
37645 // Shift right by 2/3 for 32/64 because incssp multiplies the argument by 4/8.
37646 unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri;
37647 unsigned Offset = (PVT == MVT::i64) ? 3 : 2;
37648 Register SspFirstShrReg = MRI.createVirtualRegister(PtrRC);
37649 BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspFirstShrReg)
37650 .addReg(SspSubReg)
37651 .addImm(Offset);
37652
37653 // Increase the SSP, looking only at the lower 8 bits of the delta.
37654 unsigned IncsspOpc = (PVT == MVT::i64) ? X86::INCSSPQ : X86::INCSSPD;
37655 BuildMI(fixShadowMBB, DL, TII->get(IncsspOpc)).addReg(SspFirstShrReg);
37656
37657 // Reset the lower 8 bits.
37658 Register SspSecondShrReg = MRI.createVirtualRegister(PtrRC);
37659 BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspSecondShrReg)
37660 .addReg(SspFirstShrReg)
37661 .addImm(8);
37662
37663 // Jump if the result of the shift is zero.
37664 BuildMI(fixShadowMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E);
37665 fixShadowMBB->addSuccessor(sinkMBB);
37666 fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB);
37667
37668 // Do a single shift left.
37669 unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64r1 : X86::SHL32r1;
37670 Register SspAfterShlReg = MRI.createVirtualRegister(PtrRC);
37671 BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(ShlR1Opc), SspAfterShlReg)
37672 .addReg(SspSecondShrReg);
37673
37674 // Save the value 128 to a register (will be used next with incssp).
37675 Register Value128InReg = MRI.createVirtualRegister(PtrRC);
37676 unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri;
37677 BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(MovRIOpc), Value128InReg)
37678 .addImm(128);
37679 fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB);
37680
37681 // Since incssp only looks at the lower 8 bits, we might need to do several
37682 // iterations of incssp until we finish fixing the shadow stack.
37683 Register DecReg = MRI.createVirtualRegister(PtrRC);
37684 Register CounterReg = MRI.createVirtualRegister(PtrRC);
37685 BuildMI(fixShadowLoopMBB, DL, TII->get(X86::PHI), CounterReg)
37686 .addReg(SspAfterShlReg)
37687 .addMBB(fixShadowLoopPrepareMBB)
37688 .addReg(DecReg)
37689 .addMBB(fixShadowLoopMBB);
37690
37691 // Every iteration we increase the SSP by 128.
37692 BuildMI(fixShadowLoopMBB, DL, TII->get(IncsspOpc)).addReg(Value128InReg);
37693
37694 // Every iteration we decrement the counter by 1.
37695 unsigned DecROpc = (PVT == MVT::i64) ? X86::DEC64r : X86::DEC32r;
37696 BuildMI(fixShadowLoopMBB, DL, TII->get(DecROpc), DecReg).addReg(CounterReg);
37697
37698 // Jump if the counter is not zero yet.
37699 BuildMI(fixShadowLoopMBB, DL, TII->get(X86::JCC_1)).addMBB(fixShadowLoopMBB).addImm(X86::COND_NE);
37700 fixShadowLoopMBB->addSuccessor(sinkMBB);
37701 fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB);
37702
37703 return sinkMBB;
37704}
37705
37706MachineBasicBlock *
37707X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
37708 MachineBasicBlock *MBB) const {
37709 const DebugLoc &DL = MI.getDebugLoc();
37710 MachineFunction *MF = MBB->getParent();
37711 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37712 MachineRegisterInfo &MRI = MF->getRegInfo();
37713
37714 // Memory Reference
37715 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
37716 MI.memoperands_end());
37717
37718 MVT PVT = getPointerTy(MF->getDataLayout());
37719 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
37720        "Invalid Pointer Size!");
37721
37722 const TargetRegisterClass *RC =
37723 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
37724 Register Tmp = MRI.createVirtualRegister(RC);
37725 // Since FP is only updated here but NOT referenced, it's treated as GPR.
37726 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
37727 Register FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
37728 Register SP = RegInfo->getStackRegister();
37729
37730 MachineInstrBuilder MIB;
37731
37732 const int64_t LabelOffset = 1 * PVT.getStoreSize();
37733 const int64_t SPOffset = 2 * PVT.getStoreSize();
37734
37735 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
37736 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
37737
37738 MachineBasicBlock *thisMBB = MBB;
37739
37740 // When CET and the shadow stack are enabled, we need to fix the Shadow Stack.
37741 if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
37742 thisMBB = emitLongJmpShadowStackFix(MI, thisMBB);
37743 }
37744
37745 // Reload FP
37746 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), FP);
37747 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37748 const MachineOperand &MO = MI.getOperand(i);
37749 if (MO.isReg()) // Don't add the whole operand, we don't want to
37750 // preserve kill flags.
37751 MIB.addReg(MO.getReg());
37752 else
37753 MIB.add(MO);
37754 }
37755 MIB.setMemRefs(MMOs);
37756
37757 // Reload IP
37758 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
37759 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37760 const MachineOperand &MO = MI.getOperand(i);
37761 if (i == X86::AddrDisp)
37762 MIB.addDisp(MO, LabelOffset);
37763 else if (MO.isReg()) // Don't add the whole operand, we don't want to
37764 // preserve kill flags.
37765 MIB.addReg(MO.getReg());
37766 else
37767 MIB.add(MO);
37768 }
37769 MIB.setMemRefs(MMOs);
37770
37771 // Reload SP
37772 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), SP);
37773 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37774 if (i == X86::AddrDisp)
37775 MIB.addDisp(MI.getOperand(i), SPOffset);
37776 else
37777 MIB.add(MI.getOperand(i)); // We can preserve the kill flags here, it's
37778 // the last instruction of the expansion.
37779 }
37780 MIB.setMemRefs(MMOs);
37781
37782 // Jump
37783 BuildMI(*thisMBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
37784
37785 MI.eraseFromParent();
37786 return thisMBB;
37787}
37788
37789void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
37790 MachineBasicBlock *MBB,
37791 MachineBasicBlock *DispatchBB,
37792 int FI) const {
37793 const DebugLoc &DL = MI.getDebugLoc();
37794 MachineFunction *MF = MBB->getParent();
37795 MachineRegisterInfo *MRI = &MF->getRegInfo();
37796 const X86InstrInfo *TII = Subtarget.getInstrInfo();
37797
37798 MVT PVT = getPointerTy(MF->getDataLayout());
37799 assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
37800
37801 unsigned Op = 0;
37802 unsigned VR = 0;
37803
37804 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
37805 !isPositionIndependent();
37806
37807 if (UseImmLabel) {
37808 Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
37809 } else {
37810 const TargetRegisterClass *TRC =
37811 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
37812 VR = MRI->createVirtualRegister(TRC);
37813 Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
37814
37815 if (Subtarget.is64Bit())
37816 BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
37817 .addReg(X86::RIP)
37818 .addImm(1)
37819 .addReg(0)
37820 .addMBB(DispatchBB)
37821 .addReg(0);
37822 else
37823 BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
37824 .addReg(0) /* TII->getGlobalBaseReg(MF) */
37825 .addImm(1)
37826 .addReg(0)
37827 .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
37828 .addReg(0);
37829 }
37830
37831 MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
37832 addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
37833 if (UseImmLabel)
37834 MIB.addMBB(DispatchBB);
37835 else
37836 MIB.addReg(VR);
37837}
37838
37839MachineBasicBlock *
37840X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
37841 MachineBasicBlock *BB) const {
37842 const DebugLoc &DL = MI.getDebugLoc();
37843 MachineFunction *MF = BB->getParent();
37844 MachineRegisterInfo *MRI = &MF->getRegInfo();
37845 const X86InstrInfo *TII = Subtarget.getInstrInfo();
37846 int FI = MF->getFrameInfo().getFunctionContextIndex();
37847
37848 // Get a mapping of the call site numbers to all of the landing pads they're
37849 // associated with.
37850 DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
37851 unsigned MaxCSNum = 0;
37852 for (auto &MBB : *MF) {
37853 if (!MBB.isEHPad())
37854 continue;
37855
37856 MCSymbol *Sym = nullptr;
37857 for (const auto &MI : MBB) {
37858 if (MI.isDebugInstr())
37859 continue;
37860
37861 assert(MI.isEHLabel() && "expected EH_LABEL");
37862 Sym = MI.getOperand(0).getMCSymbol();
37863 break;
37864 }
37865
37866 if (!MF->hasCallSiteLandingPad(Sym))
37867 continue;
37868
37869 for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
37870 CallSiteNumToLPad[CSI].push_back(&MBB);
37871 MaxCSNum = std::max(MaxCSNum, CSI);
37872 }
37873 }
37874
37875 // Get an ordered list of the machine basic blocks for the jump table.
37876 std::vector<MachineBasicBlock *> LPadList;
37877 SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
37878 LPadList.reserve(CallSiteNumToLPad.size());
37879
37880 for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
37881 for (auto &LP : CallSiteNumToLPad[CSI]) {
37882 LPadList.push_back(LP);
37883 InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
37884 }
37885 }
37886
37887 assert(!LPadList.empty() &&
37888        "No landing pad destinations for the dispatch jump table!");
37889
37890 // Create the MBBs for the dispatch code.
37891
37892 // Shove the dispatch's address into the return slot in the function context.
37893 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
37894 DispatchBB->setIsEHPad(true);
37895
37896 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
37897 BuildMI(TrapBB, DL, TII->get(X86::TRAP));
37898 DispatchBB->addSuccessor(TrapBB);
37899
37900 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
37901 DispatchBB->addSuccessor(DispContBB);
37902
37903 // Insert MBBs.
37904 MF->push_back(DispatchBB);
37905 MF->push_back(DispContBB);
37906 MF->push_back(TrapBB);
37907
37908 // Insert code into the entry block that creates and registers the function
37909 // context.
37910 SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
37911
37912 // Create the jump table and associated information
37913 unsigned JTE = getJumpTableEncoding();
37914 MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
37915 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
37916
37917 const X86RegisterInfo &RI = TII->getRegisterInfo();
37918 // Add a register mask with no preserved registers. This results in all
37919 // registers being marked as clobbered.
37920 if (RI.hasBasePointer(*MF)) {
37921 const bool FPIs64Bit =
37922 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
37923 X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
37924 MFI->setRestoreBasePointer(MF);
37925
37926 Register FP = RI.getFrameRegister(*MF);
37927 Register BP = RI.getBaseRegister();
37928 unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
37929 addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
37930 MFI->getRestoreBasePointerOffset())
37931 .addRegMask(RI.getNoPreservedMask());
37932 } else {
37933 BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
37934 .addRegMask(RI.getNoPreservedMask());
37935 }
37936
37937 // IReg is used as an index in a memory operand and therefore can't be SP
37938 Register IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
37939 addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
37940 Subtarget.is64Bit() ? 8 : 4);
37941 BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
37942 .addReg(IReg)
37943 .addImm(LPadList.size());
37944 BuildMI(DispatchBB, DL, TII->get(X86::JCC_1)).addMBB(TrapBB).addImm(X86::COND_AE);
37945
37946 if (Subtarget.is64Bit()) {
37947 Register BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
37948 Register IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
37949
37950 // leaq .LJTI0_0(%rip), BReg
37951 BuildMI(DispContBB, DL, TII->get(X86::LEA64r), BReg)
37952 .addReg(X86::RIP)
37953 .addImm(1)
37954 .addReg(0)
37955 .addJumpTableIndex(MJTI)
37956 .addReg(0);
37957 // movzx IReg64, IReg
37958 BuildMI(DispContBB, DL, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
37959 .addImm(0)
37960 .addReg(IReg)
37961 .addImm(X86::sub_32bit);
37962
37963 switch (JTE) {
37964 case MachineJumpTableInfo::EK_BlockAddress:
37965 // jmpq *(BReg,IReg64,8)
37966 BuildMI(DispContBB, DL, TII->get(X86::JMP64m))
37967 .addReg(BReg)
37968 .addImm(8)
37969 .addReg(IReg64)
37970 .addImm(0)
37971 .addReg(0);
37972 break;
37973 case MachineJumpTableInfo::EK_LabelDifference32: {
37974 Register OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
37975 Register OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
37976 Register TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
37977
37978 // movl (BReg,IReg64,4), OReg
37979 BuildMI(DispContBB, DL, TII->get(X86::MOV32rm), OReg)
37980 .addReg(BReg)
37981 .addImm(4)
37982 .addReg(IReg64)
37983 .addImm(0)
37984 .addReg(0);
37985 // movsx OReg64, OReg
37986 BuildMI(DispContBB, DL, TII->get(X86::MOVSX64rr32), OReg64).addReg(OReg);
37987 // addq BReg, OReg64, TReg
37988 BuildMI(DispContBB, DL, TII->get(X86::ADD64rr), TReg)
37989 .addReg(OReg64)
37990 .addReg(BReg);
37991 // jmpq *TReg
37992 BuildMI(DispContBB, DL, TII->get(X86::JMP64r)).addReg(TReg);
37993 break;
37994 }
37995 default:
37996 llvm_unreachable("Unexpected jump table encoding");
37997 }
37998 } else {
37999 // jmpl *.LJTI0_0(,IReg,4)
38000 BuildMI(DispContBB, DL, TII->get(X86::JMP32m))
38001 .addReg(0)
38002 .addImm(4)
38003 .addReg(IReg)
38004 .addJumpTableIndex(MJTI)
38005 .addReg(0);
38006 }
38007
38008 // Add the jump table entries as successors to the MBB.
38009 SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
38010 for (auto &LP : LPadList)
38011 if (SeenMBBs.insert(LP).second)
38012 DispContBB->addSuccessor(LP);
38013
38014 // N.B. the order the invoke BBs are processed in doesn't matter here.
38015 SmallVector<MachineBasicBlock *, 64> MBBLPads;
38016 const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
38017 for (MachineBasicBlock *MBB : InvokeBBs) {
38018 // Remove the landing pad successor from the invoke block and replace it
38019 // with the new dispatch block.
38020 // Keep a copy of Successors since it's modified inside the loop.
38021 SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
38022 MBB->succ_rend());
38023 // FIXME: Avoid quadratic complexity.
38024 for (auto *MBBS : Successors) {
38025 if (MBBS->isEHPad()) {
38026 MBB->removeSuccessor(MBBS);
38027 MBBLPads.push_back(MBBS);
38028 }
38029 }
38030
38031 MBB->addSuccessor(DispatchBB);
38032
38033 // Find the invoke call and mark all of the callee-saved registers as
38034 // 'implicit defined' so that they're spilled. This prevents code from
38035 // moving instructions to before the EH block, where they will never be
38036 // executed.
38037 for (auto &II : reverse(*MBB)) {
38038 if (!II.isCall())
38039 continue;
38040
38041 DenseMap<unsigned, bool> DefRegs;
38042 for (auto &MOp : II.operands())
38043 if (MOp.isReg())
38044 DefRegs[MOp.getReg()] = true;
38045
38046 MachineInstrBuilder MIB(*MF, &II);
38047 for (unsigned RegIdx = 0; SavedRegs[RegIdx]; ++RegIdx) {
38048 unsigned Reg = SavedRegs[RegIdx];
38049 if (!DefRegs[Reg])
38050 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
38051 }
38052
38053 break;
38054 }
38055 }
38056
38057 // Mark all former landing pads as non-landing pads. The dispatch is the only
38058 // landing pad now.
38059 for (auto &LP : MBBLPads)
38060 LP->setIsEHPad(false);
38061
38062 // The instruction is gone now.
38063 MI.eraseFromParent();
38064 return BB;
38065}
38066
38067MachineBasicBlock *
38068X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
38069 MachineBasicBlock *BB) const {
38070 MachineFunction *MF = BB->getParent();
38071 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
38072 const DebugLoc &DL = MI.getDebugLoc();
38073
38074 auto TMMImmToTMMReg = [](unsigned Imm) {
38075 assert (Imm < 8 && "Illegal tmm index");
38076 return X86::TMM0 + Imm;
38077 };
38078 switch (MI.getOpcode()) {
38079 default: llvm_unreachable("Unexpected instr type to insert");
38080 case X86::TLS_addr32:
38081 case X86::TLS_addr64:
38082 case X86::TLS_addrX32:
38083 case X86::TLS_base_addr32:
38084 case X86::TLS_base_addr64:
38085 case X86::TLS_base_addrX32:
38086 return EmitLoweredTLSAddr(MI, BB);
38087 case X86::INDIRECT_THUNK_CALL32:
38088 case X86::INDIRECT_THUNK_CALL64:
38089 case X86::INDIRECT_THUNK_TCRETURN32:
38090 case X86::INDIRECT_THUNK_TCRETURN64:
38091 return EmitLoweredIndirectThunk(MI, BB);
38092 case X86::CATCHRET:
38093 return EmitLoweredCatchRet(MI, BB);
38094 case X86::SEG_ALLOCA_32:
38095 case X86::SEG_ALLOCA_64:
38096 return EmitLoweredSegAlloca(MI, BB);
38097 case X86::PROBED_ALLOCA_32:
38098 case X86::PROBED_ALLOCA_64:
38099 return EmitLoweredProbedAlloca(MI, BB);
38100 case X86::TLSCall_32:
38101 case X86::TLSCall_64:
38102 return EmitLoweredTLSCall(MI, BB);
38103 case X86::CMOV_FR16:
38104 case X86::CMOV_FR16X:
38105 case X86::CMOV_FR32:
38106 case X86::CMOV_FR32X:
38107 case X86::CMOV_FR64:
38108 case X86::CMOV_FR64X:
38109 case X86::CMOV_GR8:
38110 case X86::CMOV_GR16:
38111 case X86::CMOV_GR32:
38112 case X86::CMOV_RFP32:
38113 case X86::CMOV_RFP64:
38114 case X86::CMOV_RFP80:
38115 case X86::CMOV_VR64:
38116 case X86::CMOV_VR128:
38117 case X86::CMOV_VR128X:
38118 case X86::CMOV_VR256:
38119 case X86::CMOV_VR256X:
38120 case X86::CMOV_VR512:
38121 case X86::CMOV_VK1:
38122 case X86::CMOV_VK2:
38123 case X86::CMOV_VK4:
38124 case X86::CMOV_VK8:
38125 case X86::CMOV_VK16:
38126 case X86::CMOV_VK32:
38127 case X86::CMOV_VK64:
38128 return EmitLoweredSelect(MI, BB);
38129
38130 case X86::FP80_ADDr:
38131 case X86::FP80_ADDm32: {
38132 // Change the floating point control register to use double extended
38133 // precision when performing the addition.
38134 int OrigCWFrameIdx =
38135 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
38136 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FNSTCW16m)),
38137 OrigCWFrameIdx);
38138
38139 // Load the old value of the control word...
38140 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
38141 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOVZX32rm16), OldCW),
38142 OrigCWFrameIdx);
38143
38144 // OR 0b11 into bit 8 and 9. 0b11 is the encoding for double extended
38145 // precision.
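// (Bits 9:8 of the control word form the x87 precision-control field:
// 00 = 24-bit, 10 = 53-bit, 11 = 64-bit double extended, so ORing in 0x300
// selects the extended encoding.)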
38146 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
38147 BuildMI(*BB, MI, DL, TII->get(X86::OR32ri), NewCW)
38148 .addReg(OldCW, RegState::Kill)
38149 .addImm(0x300);
38150
38151 // Extract to 16 bits.
38152 Register NewCW16 =
38153 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
38154 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), NewCW16)
38155 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
38156
38157 // Prepare memory for FLDCW.
38158 int NewCWFrameIdx =
38159 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
38160 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)),
38161 NewCWFrameIdx)
38162 .addReg(NewCW16, RegState::Kill);
38163
38164 // Reload the modified control word now...
38165 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FLDCW16m)),
38166 NewCWFrameIdx);
38167
38168 // Do the addition.
38169 if (MI.getOpcode() == X86::FP80_ADDr) {
38170 BuildMI(*BB, MI, DL, TII->get(X86::ADD_Fp80))
38171 .add(MI.getOperand(0))
38172 .add(MI.getOperand(1))
38173 .add(MI.getOperand(2));
38174 } else {
38175 BuildMI(*BB, MI, DL, TII->get(X86::ADD_Fp80m32))
38176 .add(MI.getOperand(0))
38177 .add(MI.getOperand(1))
38178 .add(MI.getOperand(2))
38179 .add(MI.getOperand(3))
38180 .add(MI.getOperand(4))
38181 .add(MI.getOperand(5))
38182 .add(MI.getOperand(6));
38183 }
38184
38185 // Reload the original control word now.
38186 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FLDCW16m)),
38187 OrigCWFrameIdx);
38188
38189 MI.eraseFromParent(); // The pseudo instruction is gone now.
38190 return BB;
38191 }
38192
38193 case X86::FP32_TO_INT16_IN_MEM:
38194 case X86::FP32_TO_INT32_IN_MEM:
38195 case X86::FP32_TO_INT64_IN_MEM:
38196 case X86::FP64_TO_INT16_IN_MEM:
38197 case X86::FP64_TO_INT32_IN_MEM:
38198 case X86::FP64_TO_INT64_IN_MEM:
38199 case X86::FP80_TO_INT16_IN_MEM:
38200 case X86::FP80_TO_INT32_IN_MEM:
38201 case X86::FP80_TO_INT64_IN_MEM: {
38202 // Change the floating point control register to use "round towards zero"
38203 // mode when truncating to an integer value.
38204 int OrigCWFrameIdx =
38205 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
38206 addFrameReference(BuildMI(*BB, MI, DL,
38207 TII->get(X86::FNSTCW16m)), OrigCWFrameIdx);
38208
38209 // Load the old value of the control word...
38210 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
38211 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOVZX32rm16), OldCW),
38212 OrigCWFrameIdx);
38213
38214 // OR 0b11 into bit 10 and 11. 0b11 is the encoding for round toward zero.
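// (Bits 11:10 of the control word form the x87 rounding-control field:
// 00 = nearest, 01 = down, 10 = up, 11 = truncate toward zero, so ORing in
// 0xC00 selects truncation for the integer store.)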
38215 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
38216 BuildMI(*BB, MI, DL, TII->get(X86::OR32ri), NewCW)
38217 .addReg(OldCW, RegState::Kill).addImm(0xC00);
38218
38219 // Extract to 16 bits.
38220 Register NewCW16 =
38221 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
38222 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), NewCW16)
38223 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
38224
38225 // Prepare memory for FLDCW.
38226 int NewCWFrameIdx =
38227 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
38228 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)),
38229 NewCWFrameIdx)
38230 .addReg(NewCW16, RegState::Kill);
38231
38232 // Reload the modified control word now...
38233 addFrameReference(BuildMI(*BB, MI, DL,
38234 TII->get(X86::FLDCW16m)), NewCWFrameIdx);
38235
38236 // Get the X86 opcode to use.
38237 unsigned Opc;
38238 switch (MI.getOpcode()) {
38239 default: llvm_unreachable("illegal opcode!");
38240 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
38241 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
38242 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
38243 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
38244 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
38245 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
38246 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
38247 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
38248 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
38249 }
38250
38251 X86AddressMode AM = getAddressFromInstr(&MI, 0);
38252 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
38253 .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
38254
38255 // Reload the original control word now.
38256 addFrameReference(BuildMI(*BB, MI, DL,
38257 TII->get(X86::FLDCW16m)), OrigCWFrameIdx);
38258
38259 MI.eraseFromParent(); // The pseudo instruction is gone now.
38260 return BB;
38261 }
38262
38263 // xbegin
38264 case X86::XBEGIN:
38265 return emitXBegin(MI, BB, Subtarget.getInstrInfo());
38266
38267 case X86::VAARG_64:
38268 case X86::VAARG_X32:
38269 return EmitVAARGWithCustomInserter(MI, BB);
38270
38271 case X86::EH_SjLj_SetJmp32:
38272 case X86::EH_SjLj_SetJmp64:
38273 return emitEHSjLjSetJmp(MI, BB);
38274
38275 case X86::EH_SjLj_LongJmp32:
38276 case X86::EH_SjLj_LongJmp64:
38277 return emitEHSjLjLongJmp(MI, BB);
38278
38279 case X86::Int_eh_sjlj_setup_dispatch:
38280 return EmitSjLjDispatchBlock(MI, BB);
38281
38282 case TargetOpcode::STATEPOINT:
38283 // As an implementation detail, STATEPOINT shares the STACKMAP format at
38284 // this point in the process. We diverge later.
38285 return emitPatchPoint(MI, BB);
38286
38287 case TargetOpcode::STACKMAP:
38288 case TargetOpcode::PATCHPOINT:
38289 return emitPatchPoint(MI, BB);
38290
38291 case TargetOpcode::PATCHABLE_EVENT_CALL:
38292 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
38293 return BB;
38294
38295 case X86::LCMPXCHG8B: {
38296 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
38297 // In addition to the four E[ABCD] registers implied by its encoding,
38298 // CMPXCHG8B requires a memory operand. If the current architecture is i686
38299 // and the current function needs a base pointer
38300 // - which is ESI on i686 - the register allocator would not be able to
38301 // allocate registers for an address of the form X(%reg, %reg, Y):
38302 // there would never be enough unreserved registers during regalloc
38303 // (without the need for a base pointer the only option would be
38304 // X(%edi, %esi, Y)). We give the register allocator a hand by precomputing
38305 // the address in a new vreg using LEA.
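// For example, an operand of the form 8(%ebp,%esi,4) is first materialized
// with leal 8(%ebp,%esi,4), %vreg, after which CMPXCHG8B only needs the one
// register holding %vreg for its (%vreg) memory operand.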
38306
38307 // If it is not i686 or there is no base pointer - nothing to do here.
38308 if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
38309 return BB;
38310
38311 // Even though this code does not necessarily need the base pointer to
38312 // be ESI, we check for that. The reason: if this assert fails, some
38313 // changes have happened in the compiler's base pointer handling, which most
38314 // probably have to be addressed here as well.
38315 assert(TRI->getBaseRegister() == X86::ESI &&
38316        "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
38317        "base pointer in mind");
38318
38319 MachineRegisterInfo &MRI = MF->getRegInfo();
38320 MVT SPTy = getPointerTy(MF->getDataLayout());
38321 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
38322 Register computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
38323
38324 X86AddressMode AM = getAddressFromInstr(&MI, 0);
38325 // Regalloc does not need any help when the memory operand of CMPXCHG8B
38326 // does not use an index register.
38327 if (AM.IndexReg == X86::NoRegister)
38328 return BB;
38329
38330 // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
38331 // four operand definitions that are E[ABCD] registers. We skip them and
38332 // then insert the LEA.
38333 MachineBasicBlock::reverse_iterator RMBBI(MI.getReverseIterator());
38334 while (RMBBI != BB->rend() && (RMBBI->definesRegister(X86::EAX) ||
38335 RMBBI->definesRegister(X86::EBX) ||
38336 RMBBI->definesRegister(X86::ECX) ||
38337 RMBBI->definesRegister(X86::EDX))) {
38338 ++RMBBI;
38339 }
38340 MachineBasicBlock::iterator MBBI(RMBBI);
38341 addFullAddress(
38342 BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);
38343
38344 setDirectAddressInInstr(&MI, 0, computedAddrVReg);
38345
38346 return BB;
38347 }
38348 case X86::LCMPXCHG16B_NO_RBX: {
38349 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
38350 Register BasePtr = TRI->getBaseRegister();
38351 if (TRI->hasBasePointer(*MF) &&
38352 (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
38353 if (!BB->isLiveIn(BasePtr))
38354 BB->addLiveIn(BasePtr);
38355 // Save RBX into a virtual register.
38356 Register SaveRBX =
38357 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38358 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), SaveRBX)
38359 .addReg(X86::RBX);
38360 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38361 MachineInstrBuilder MIB =
38362 BuildMI(*BB, MI, DL, TII->get(X86::LCMPXCHG16B_SAVE_RBX), Dst);
38363 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
38364 MIB.add(MI.getOperand(Idx));
38365 MIB.add(MI.getOperand(X86::AddrNumOperands));
38366 MIB.addReg(SaveRBX);
38367 } else {
38368 // Simple case, just copy the virtual register to RBX.
38369 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::RBX)
38370 .add(MI.getOperand(X86::AddrNumOperands));
38371 MachineInstrBuilder MIB =
38372 BuildMI(*BB, MI, DL, TII->get(X86::LCMPXCHG16B));
38373 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
38374 MIB.add(MI.getOperand(Idx));
38375 }
38376 MI.eraseFromParent();
38377 return BB;
38378 }
38379 case X86::MWAITX: {
38380 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
38381 Register BasePtr = TRI->getBaseRegister();
38382 bool IsRBX = (BasePtr == X86::RBX || BasePtr == X86::EBX);
38383 // If there is no need to save the base pointer, we generate MWAITXrrr;
38384 // otherwise we generate the pseudo MWAITX_SAVE_RBX.
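// (MWAITX takes its timeout value in EBX, which is exactly the register
// reserved as the base pointer here, so RBX is parked in a virtual register
// and restored by the pseudo after the instruction.)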
38385 if (!IsRBX || !TRI->hasBasePointer(*MF)) {
38386 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::ECX)
38387 .addReg(MI.getOperand(0).getReg());
38388 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EAX)
38389 .addReg(MI.getOperand(1).getReg());
38390 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EBX)
38391 .addReg(MI.getOperand(2).getReg());
38392 BuildMI(*BB, MI, DL, TII->get(X86::MWAITXrrr));
38393 MI.eraseFromParent();
38394 } else {
38395 if (!BB->isLiveIn(BasePtr)) {
38396 BB->addLiveIn(BasePtr);
38397 }
38398 // Parameters can be copied into ECX and EAX but not EBX yet.
38399 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::ECX)
38400 .addReg(MI.getOperand(0).getReg());
38401 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EAX)
38402 .addReg(MI.getOperand(1).getReg());
38403 assert(Subtarget.is64Bit() && "Expected 64-bit mode!");
38404 // Save RBX into a virtual register.
38405 Register SaveRBX =
38406 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38407 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), SaveRBX)
38408 .addReg(X86::RBX);
38409 // Generate mwaitx pseudo.
38410 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38411 BuildMI(*BB, MI, DL, TII->get(X86::MWAITX_SAVE_RBX))
38412 .addDef(Dst) // Destination tied in with SaveRBX.
38413 .addReg(MI.getOperand(2).getReg()) // input value of EBX.
38414 .addUse(SaveRBX); // Save of base pointer.
38415 MI.eraseFromParent();
38416 }
38417 return BB;
38418 }
38419 case TargetOpcode::PREALLOCATED_SETUP: {
38420 assert(Subtarget.is32Bit() && "preallocated only used in 32-bit");
38421 auto MFI = MF->getInfo<X86MachineFunctionInfo>();
38422 MFI->setHasPreallocatedCall(true);
38423 int64_t PreallocatedId = MI.getOperand(0).getImm();
38424 size_t StackAdjustment = MFI->getPreallocatedStackSize(PreallocatedId);
38425 assert(StackAdjustment != 0 && "0 stack adjustment");
38426 LLVM_DEBUG(dbgs() << "PREALLOCATED_SETUP stack adjustment "
38427                   << StackAdjustment << "\n");
38428 BuildMI(*BB, MI, DL, TII->get(X86::SUB32ri), X86::ESP)
38429 .addReg(X86::ESP)
38430 .addImm(StackAdjustment);
38431 MI.eraseFromParent();
38432 return BB;
38433 }
38434 case TargetOpcode::PREALLOCATED_ARG: {
38435 assert(Subtarget.is32Bit() && "preallocated calls only used in 32-bit");
38436 int64_t PreallocatedId = MI.getOperand(1).getImm();
38437 int64_t ArgIdx = MI.getOperand(2).getImm();
38438 auto MFI = MF->getInfo<X86MachineFunctionInfo>();
38439 size_t ArgOffset = MFI->getPreallocatedArgOffsets(PreallocatedId)[ArgIdx];
38440 LLVM_DEBUG(dbgs() << "PREALLOCATED_ARG arg index " << ArgIdx
38441                   << ", arg offset " << ArgOffset << "\n");
38442 // stack pointer + offset
38443 addRegOffset(
38444 BuildMI(*BB, MI, DL, TII->get(X86::LEA32r), MI.getOperand(0).getReg()),
38445 X86::ESP, false, ArgOffset);
38446 MI.eraseFromParent();
38447 return BB;
38448 }
38449 case X86::PTDPBSSD:
38450 case X86::PTDPBSUD:
38451 case X86::PTDPBUSD:
38452 case X86::PTDPBUUD:
38453 case X86::PTDPBF16PS:
38454 case X86::PTDPFP16PS: {
38455 unsigned Opc;
38456 switch (MI.getOpcode()) {
38457 default: llvm_unreachable("illegal opcode!");
38458 case X86::PTDPBSSD: Opc = X86::TDPBSSD; break;
38459 case X86::PTDPBSUD: Opc = X86::TDPBSUD; break;
38460 case X86::PTDPBUSD: Opc = X86::TDPBUSD; break;
38461 case X86::PTDPBUUD: Opc = X86::TDPBUUD; break;
38462 case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break;
38463 case X86::PTDPFP16PS: Opc = X86::TDPFP16PS; break;
38464 }
38465
38466 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38467 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
38468 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
38469 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
38470 MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
38471
38472 MI.eraseFromParent(); // The pseudo is gone now.
38473 return BB;
38474 }
38475 case X86::PTILEZERO: {
38476 unsigned Imm = MI.getOperand(0).getImm();
38477 BuildMI(*BB, MI, DL, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm));
38478 MI.eraseFromParent(); // The pseudo is gone now.
38479 return BB;
38480 }
38481 case X86::PTILELOADD:
38482 case X86::PTILELOADDT1:
38483 case X86::PTILESTORED: {
38484 unsigned Opc;
38485 switch (MI.getOpcode()) {
38486 default: llvm_unreachable("illegal opcode!");
38487 case X86::PTILELOADD: Opc = X86::TILELOADD; break;
38488 case X86::PTILELOADDT1: Opc = X86::TILELOADDT1; break;
38489 case X86::PTILESTORED: Opc = X86::TILESTORED; break;
38490 }
38491
38492 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38493 unsigned CurOp = 0;
38494 if (Opc != X86::TILESTORED)
38495 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
38496 RegState::Define);
38497
38498 MIB.add(MI.getOperand(CurOp++)); // base
38499 MIB.add(MI.getOperand(CurOp++)); // scale
38500 MIB.add(MI.getOperand(CurOp++)); // index -- stride
38501 MIB.add(MI.getOperand(CurOp++)); // displacement
38502 MIB.add(MI.getOperand(CurOp++)); // segment
38503
38504 if (Opc == X86::TILESTORED)
38505 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
38506 RegState::Undef);
38507
38508 MI.eraseFromParent(); // The pseudo is gone now.
38509 return BB;
38510 }
38511 case X86::PTCMMIMFP16PS:
38512 case X86::PTCMMRLFP16PS: {
38513 const DebugLoc &DL = MI.getDebugLoc();
38514 unsigned Opc;
38515 switch (MI.getOpcode()) {
38516 default: llvm_unreachable("Unexpected instruction!");
38517 case X86::PTCMMIMFP16PS: Opc = X86::TCMMIMFP16PS; break;
38518 case X86::PTCMMRLFP16PS: Opc = X86::TCMMRLFP16PS; break;
38519 }
38520 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38521 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
38522 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
38523 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
38524 MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
38525 MI.eraseFromParent(); // The pseudo is gone now.
38526 return BB;
38527 }
38528 }
38529}
38530
38531//===----------------------------------------------------------------------===//
38532// X86 Optimization Hooks
38533//===----------------------------------------------------------------------===//
38534
38535bool
38536X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
38537 const APInt &DemandedBits,
38538 const APInt &DemandedElts,
38539 TargetLoweringOpt &TLO) const {
38540 EVT VT = Op.getValueType();
38541 unsigned Opcode = Op.getOpcode();
38542 unsigned EltSize = VT.getScalarSizeInBits();
38543
38544 if (VT.isVector()) {
38545 // If the constant is all sign bits within the active bits, then we should
38546 // extend it to the entire constant so that it can act as a boolean constant
38547 // vector.
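// For example, with only the low 16 bits of each i32 element demanded, an OR
// with the splat constant 0x0000FFFF is rewritten as an OR with 0xFFFFFFFF
// (sign-extended from i16), which then behaves as an all-ones boolean vector.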
38548 auto NeedsSignExtension = [&](SDValue V, unsigned ActiveBits) {
38549 if (!ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
38550 return false;
38551 for (unsigned i = 0, e = V.getNumOperands(); i != e; ++i) {
38552 if (!DemandedElts[i] || V.getOperand(i).isUndef())
38553 continue;
38554 const APInt &Val = V.getConstantOperandAPInt(i);
38555 if (Val.getBitWidth() > Val.getNumSignBits() &&
38556 Val.trunc(ActiveBits).getNumSignBits() == ActiveBits)
38557 return true;
38558 }
38559 return false;
38560 };
38561 // For vectors - if we have a constant, then try to sign extend.
38562 // TODO: Handle AND/ANDN cases.
38563 unsigned ActiveBits = DemandedBits.getActiveBits();
38564 if (EltSize > ActiveBits && EltSize > 1 && isTypeLegal(VT) &&
38565 (Opcode == ISD::OR || Opcode == ISD::XOR) &&
38566 NeedsSignExtension(Op.getOperand(1), ActiveBits)) {
38567 EVT ExtSVT = EVT::getIntegerVT(*TLO.DAG.getContext(), ActiveBits);
38568 EVT ExtVT = EVT::getVectorVT(*TLO.DAG.getContext(), ExtSVT,
38569 VT.getVectorNumElements());
38570 SDValue NewC =
38571 TLO.DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(Op), VT,
38572 Op.getOperand(1), TLO.DAG.getValueType(ExtVT));
38573 SDValue NewOp =
38574 TLO.DAG.getNode(Opcode, SDLoc(Op), VT, Op.getOperand(0), NewC);
38575 return TLO.CombineTo(Op, NewOp);
38576 }
38577 return false;
38578 }
38579
38580 // Only optimize Ands to prevent shrinking a constant that could be
38581 // matched by movzx.
38582 if (Opcode != ISD::AND)
38583 return false;
38584
38585 // Make sure the RHS really is a constant.
38586 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
38587 if (!C)
38588 return false;
38589
38590 const APInt &Mask = C->getAPIntValue();
38591
38592 // Clear all non-demanded bits initially.
38593 APInt ShrunkMask = Mask & DemandedBits;
38594
38595 // Find the width of the shrunk mask.
38596 unsigned Width = ShrunkMask.getActiveBits();
38597
38598 // If the mask is all 0s there's nothing to do here.
38599 if (Width == 0)
38600 return false;
38601
38602 // Find the next power of 2 width, rounding up to a byte.
38603 Width = llvm::bit_ceil(std::max(Width, 8U));
38604 // Truncate the width to the element size to handle illegal types.
38605 Width = std::min(Width, EltSize);
38606
38607 // Calculate a possible zero extend mask for this constant.
38608 APInt ZeroExtendMask = APInt::getLowBitsSet(EltSize, Width);
38609
38610 // If we aren't changing the mask, just return true to keep it and prevent
38611 // the caller from optimizing.
38612 if (ZeroExtendMask == Mask)
38613 return true;
38614
38615 // Make sure the new mask can be represented by a combination of mask bits
38616 // and non-demanded bits.
38617 if (!ZeroExtendMask.isSubsetOf(Mask | ~DemandedBits))
38618 return false;
38619
38620 // Replace the constant with the zero extend mask.
38621 SDLoc DL(Op);
38622 SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT);
38623 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
38624 return TLO.CombineTo(Op, NewOp);
38625}
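
A minimal standalone sketch of the AND-mask shrinking arithmetic above, using plain 32-bit integers and the C++20 <bit> helpers instead of APInt; it only reports whether a widened zero-extend mask exists rather than driving TLO.CombineTo, and the sample mask/demanded-bits values are made up for illustration.

#include <algorithm>
#include <bit>
#include <cstdint>
#include <cstdio>

// Keep only the demanded bits, round the active width up to a byte-sized
// power of two, and check that the resulting zero-extend mask is still
// compatible with the original constant (differs only in non-demanded bits).
static bool shrinkAndMask(uint32_t Mask, uint32_t Demanded, uint32_t &NewMask) {
  uint32_t Shrunk = Mask & Demanded;                         // clear non-demanded bits
  if (Shrunk == 0)
    return false;                                            // nothing demanded
  unsigned Width = 32u - unsigned(std::countl_zero(Shrunk)); // active bits
  Width = std::min(32u, std::bit_ceil(std::max(Width, 8u)));
  uint32_t ZeroExtendMask = Width == 32 ? ~0u : ((1u << Width) - 1);
  if (ZeroExtendMask == Mask)
    return false;                                            // mask already fine
  if ((ZeroExtendMask & ~(Mask | ~Demanded)) != 0)
    return false;                                            // not representable
  NewMask = ZeroExtendMask;
  return true;
}

int main() {
  uint32_t NewMask;
  if (shrinkAndMask(0xFFFu, 0xFFu, NewMask))
    std::printf("0xFFF -> 0x%X\n", NewMask);                 // prints 0xFFF -> 0xFF
}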
38626
38627void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
38628 KnownBits &Known,
38629 const APInt &DemandedElts,
38630 const SelectionDAG &DAG,
38631 unsigned Depth) const {
38632 unsigned BitWidth = Known.getBitWidth();
38633 unsigned NumElts = DemandedElts.getBitWidth();
38634 unsigned Opc = Op.getOpcode();
38635 EVT VT = Op.getValueType();
38636 assert((Opc >= ISD::BUILTIN_OP_END ||
38637 Opc == ISD::INTRINSIC_WO_CHAIN ||
38638 Opc == ISD::INTRINSIC_W_CHAIN ||
38639 Opc == ISD::INTRINSIC_VOID) &&
38640 "Should use MaskedValueIsZero if you don't know whether Op"
38641 " is a target node!");
38642
38643 Known.resetAll();
38644 switch (Opc) {
38645 default: break;
38646 case X86ISD::MUL_IMM: {
38647 KnownBits Known2;
38648 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38649 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38650 Known = KnownBits::mul(Known, Known2);
38651 break;
38652 }
38653 case X86ISD::SETCC:
38654 Known.Zero.setBitsFrom(1);
38655 break;
38656 case X86ISD::MOVMSK: {
38657 unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
38658 Known.Zero.setBitsFrom(NumLoBits);
38659 break;
38660 }
38661 case X86ISD::PEXTRB:
38662 case X86ISD::PEXTRW: {
38663 SDValue Src = Op.getOperand(0);
38664 EVT SrcVT = Src.getValueType();
38665 APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
38666 Op.getConstantOperandVal(1));
38667 Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1);
38668 Known = Known.anyextOrTrunc(BitWidth);
38669 Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
38670 break;
38671 }
38672 case X86ISD::VSRAI:
38673 case X86ISD::VSHLI:
38674 case X86ISD::VSRLI: {
38675 unsigned ShAmt = Op.getConstantOperandVal(1);
38676 if (ShAmt >= VT.getScalarSizeInBits()) {
38677 // Out of range logical bit shifts are guaranteed to be zero.
38678 // Out of range arithmetic bit shifts splat the sign bit.
38679 if (Opc != X86ISD::VSRAI) {
38680 Known.setAllZero();
38681 break;
38682 }
38683
38684 ShAmt = VT.getScalarSizeInBits() - 1;
38685 }
38686
38687 Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38688 if (Opc == X86ISD::VSHLI) {
38689 Known.Zero <<= ShAmt;
38690 Known.One <<= ShAmt;
38691 // Low bits are known zero.
38692 Known.Zero.setLowBits(ShAmt);
38693 } else if (Opc == X86ISD::VSRLI) {
38694 Known.Zero.lshrInPlace(ShAmt);
38695 Known.One.lshrInPlace(ShAmt);
38696 // High bits are known zero.
38697 Known.Zero.setHighBits(ShAmt);
38698 } else {
38699 Known.Zero.ashrInPlace(ShAmt);
38700 Known.One.ashrInPlace(ShAmt);
38701 }
38702 break;
38703 }
38704 case X86ISD::PACKUS: {
38705 // PACKUS is just a truncation if the upper half is zero.
38706 APInt DemandedLHS, DemandedRHS;
38707 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
38708
38709 Known.One = APInt::getAllOnes(BitWidth * 2);
38710 Known.Zero = APInt::getAllOnes(BitWidth * 2);
38711
38712 KnownBits Known2;
38713 if (!!DemandedLHS) {
38714 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedLHS, Depth + 1);
38715 Known = KnownBits::commonBits(Known, Known2);
38716 }
38717 if (!!DemandedRHS) {
38718 Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedRHS, Depth + 1);
38719 Known = KnownBits::commonBits(Known, Known2);
38720 }
38721
38722 if (Known.countMinLeadingZeros() < BitWidth)
38723 Known.resetAll();
38724 Known = Known.trunc(BitWidth);
38725 break;
38726 }
38727 case X86ISD::VBROADCAST: {
38728 SDValue Src = Op.getOperand(0);
38729 if (!Src.getSimpleValueType().isVector()) {
38730 Known = DAG.computeKnownBits(Src, Depth + 1);
38731 return;
38732 }
38733 break;
38734 }
38735 case X86ISD::AND: {
38736 if (Op.getResNo() == 0) {
38737 KnownBits Known2;
38738 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38739 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38740 Known &= Known2;
38741 }
38742 break;
38743 }
38744 case X86ISD::ANDNP: {
38745 KnownBits Known2;
38746 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38747 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38748
38749 // ANDNP = (~X & Y);
38750 Known.One &= Known2.Zero;
38751 Known.Zero |= Known2.One;
38752 break;
38753 }
38754 case X86ISD::FOR: {
38755 KnownBits Known2;
38756 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38757 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38758
38759 Known |= Known2;
38760 break;
38761 }
38762 case X86ISD::PSADBW: {
38763 assert(VT.getScalarType() == MVT::i64 &&
38764 Op.getOperand(0).getValueType().getScalarType() == MVT::i8 &&
38765 "Unexpected PSADBW types");
38766
38767 // PSADBW - fills low 16 bits and zeros upper 48 bits of each i64 result.
38768 Known.Zero.setBitsFrom(16);
38769 break;
38770 }
38771 case X86ISD::PCMPGT:
38772 case X86ISD::PCMPEQ: {
38773 KnownBits KnownLhs =
38774 DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38775 KnownBits KnownRhs =
38776 DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38777 std::optional<bool> Res = Opc == X86ISD::PCMPEQ
38778 ? KnownBits::eq(KnownLhs, KnownRhs)
38779 : KnownBits::sgt(KnownLhs, KnownRhs);
38780 if (Res) {
38781 if (*Res)
38782 Known.setAllOnes();
38783 else
38784 Known.setAllZero();
38785 }
38786 break;
38787 }
38788 case X86ISD::PMULUDQ: {
38789 KnownBits Known2;
38790 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38791 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38792
38793 Known = Known.trunc(BitWidth / 2).zext(BitWidth);
38794 Known2 = Known2.trunc(BitWidth / 2).zext(BitWidth);
38795 Known = KnownBits::mul(Known, Known2);
38796 break;
38797 }
38798 case X86ISD::CMOV: {
38799 Known = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
38800 // If we don't know any bits, early out.
38801 if (Known.isUnknown())
38802 break;
38803 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
38804
38805 // Only known if known in both the LHS and RHS.
38806 Known = KnownBits::commonBits(Known, Known2);
38807 break;
38808 }
38809 case X86ISD::BEXTR:
38810 case X86ISD::BEXTRI: {
38811 SDValue Op0 = Op.getOperand(0);
38812 SDValue Op1 = Op.getOperand(1);
38813
38814 if (auto* Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
38815 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
38816 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
38817
38818 // If the length is 0, the result is 0.
38819 if (Length == 0) {
38820 Known.setAllZero();
38821 break;
38822 }
38823
38824 if ((Shift + Length) <= BitWidth) {
38825 Known = DAG.computeKnownBits(Op0, Depth + 1);
38826 Known = Known.extractBits(Length, Shift);
38827 Known = Known.zextOrTrunc(BitWidth);
38828 }
38829 }
38830 break;
38831 }
38832 case X86ISD::PDEP: {
38833 KnownBits Known2;
38834 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38835 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38836 // Zeros are retained from the mask operand. But not ones.
38837 Known.One.clearAllBits();
38838 // The result will have at least as many trailing zeros as the non-mask
38839 // operand since bits can only map to the same or higher bit position.
38840 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
38841 break;
38842 }
38843 case X86ISD::PEXT: {
38844 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38845 // The result has as many leading zeros as the number of zeroes in the mask.
38846 unsigned Count = Known.Zero.popcount();
38847 Known.Zero = APInt::getHighBitsSet(BitWidth, Count);
38848 Known.One.clearAllBits();
38849 break;
38850 }
38851 case X86ISD::VTRUNC:
38852 case X86ISD::VTRUNCS:
38853 case X86ISD::VTRUNCUS:
38854 case X86ISD::CVTSI2P:
38855 case X86ISD::CVTUI2P:
38856 case X86ISD::CVTP2SI:
38857 case X86ISD::CVTP2UI:
38858 case X86ISD::MCVTP2SI:
38859 case X86ISD::MCVTP2UI:
38860 case X86ISD::CVTTP2SI:
38861 case X86ISD::CVTTP2UI:
38862 case X86ISD::MCVTTP2SI:
38863 case X86ISD::MCVTTP2UI:
38864 case X86ISD::MCVTSI2P:
38865 case X86ISD::MCVTUI2P:
38866 case X86ISD::VFPROUND:
38867 case X86ISD::VMFPROUND:
38868 case X86ISD::CVTPS2PH:
38869 case X86ISD::MCVTPS2PH: {
38870 // Truncations/Conversions - upper elements are known zero.
38871 EVT SrcVT = Op.getOperand(0).getValueType();
38872 if (SrcVT.isVector()) {
38873 unsigned NumSrcElts = SrcVT.getVectorNumElements();
38874 if (NumElts > NumSrcElts && DemandedElts.countr_zero() >= NumSrcElts)
38875 Known.setAllZero();
38876 }
38877 break;
38878 }
38879 case X86ISD::STRICT_CVTTP2SI:
38880 case X86ISD::STRICT_CVTTP2UI:
38881 case X86ISD::STRICT_CVTSI2P:
38882 case X86ISD::STRICT_CVTUI2P:
38883 case X86ISD::STRICT_VFPROUND:
38884 case X86ISD::STRICT_CVTPS2PH: {
38885 // Strict Conversions - upper elements are known zero.
38886 EVT SrcVT = Op.getOperand(1).getValueType();
38887 if (SrcVT.isVector()) {
38888 unsigned NumSrcElts = SrcVT.getVectorNumElements();
38889 if (NumElts > NumSrcElts && DemandedElts.countr_zero() >= NumSrcElts)
38890 Known.setAllZero();
38891 }
38892 break;
38893 }
38894 case X86ISD::MOVQ2DQ: {
38895 // Move from MMX to XMM. Upper half of XMM should be 0.
38896 if (DemandedElts.countr_zero() >= (NumElts / 2))
38897 Known.setAllZero();
38898 break;
38899 }
38900 case X86ISD::VBROADCAST_LOAD: {
38901 APInt UndefElts;
38902 SmallVector<APInt, 16> EltBits;
38903 if (getTargetConstantBitsFromNode(Op, BitWidth, UndefElts, EltBits,
38904 /*AllowWholeUndefs*/ false,
38905 /*AllowPartialUndefs*/ false)) {
38906 Known.Zero.setAllBits();
38907 Known.One.setAllBits();
38908 for (unsigned I = 0; I != NumElts; ++I) {
38909 if (!DemandedElts[I])
38910 continue;
38911 if (UndefElts[I]) {
38912 Known.resetAll();
38913 break;
38914 }
38915 KnownBits Known2 = KnownBits::makeConstant(EltBits[I]);
38916 Known = KnownBits::commonBits(Known, Known2);
38917 }
38918 return;
38919 }
38920 break;
38921 }
38922 }
38923
38924 // Handle target shuffles.
38925 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
38926 if (isTargetShuffle(Opc)) {
38927 SmallVector<int, 64> Mask;
38928 SmallVector<SDValue, 2> Ops;
38929 if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask)) {
38930 unsigned NumOps = Ops.size();
38931 unsigned NumElts = VT.getVectorNumElements();
38932 if (Mask.size() == NumElts) {
38933 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
38934 Known.Zero.setAllBits(); Known.One.setAllBits();
38935 for (unsigned i = 0; i != NumElts; ++i) {
38936 if (!DemandedElts[i])
38937 continue;
38938 int M = Mask[i];
38939 if (M == SM_SentinelUndef) {
38940 // For UNDEF elements, we don't know anything about the common state
38941 // of the shuffle result.
38942 Known.resetAll();
38943 break;
38944 }
38945 if (M == SM_SentinelZero) {
38946 Known.One.clearAllBits();
38947 continue;
38948 }
38949 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
38950 "Shuffle index out of range");
38951
38952 unsigned OpIdx = (unsigned)M / NumElts;
38953 unsigned EltIdx = (unsigned)M % NumElts;
38954 if (Ops[OpIdx].getValueType() != VT) {
38955 // TODO - handle target shuffle ops with different value types.
38956 Known.resetAll();
38957 break;
38958 }
38959 DemandedOps[OpIdx].setBit(EltIdx);
38960 }
38961 // Known bits are the values that are shared by every demanded element.
38962 for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) {
38963 if (!DemandedOps[i])
38964 continue;
38965 KnownBits Known2 =
38966 DAG.computeKnownBits(Ops[i], DemandedOps[i], Depth + 1);
38967 Known = KnownBits::commonBits(Known, Known2);
38968 }
38969 }
38970 }
38971 }
38972}
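
A self-contained sketch of the VSHLI/VSRLI/VSRAI handling above, applied to plain 16-bit known-zero/known-one masks; KnownBits16, ShiftKind and the sample values are illustrative stand-ins rather than LLVM's KnownBits API.

#include <cstdint>
#include <cstdio>

struct KnownBits16 { uint16_t Zero = 0, One = 0; }; // set bit => bit is known

enum class ShiftKind { LogicalLeft, LogicalRight, ArithmeticRight };

static KnownBits16 shiftKnown(KnownBits16 K, unsigned Amt, ShiftKind Kind) {
  const unsigned Bits = 16;
  if (Amt >= Bits) {
    if (Kind != ShiftKind::ArithmeticRight)
      return {0xFFFF, 0};                 // out-of-range logical shift: all zero
    Amt = Bits - 1;                       // arithmetic shift splats the sign bit
  }
  if (Kind == ShiftKind::LogicalLeft) {
    K.Zero = uint16_t(K.Zero << Amt) | uint16_t((1u << Amt) - 1);   // low bits zero
    K.One  = uint16_t(K.One << Amt);
  } else if (Kind == ShiftKind::LogicalRight) {
    K.Zero = uint16_t(K.Zero >> Amt) | uint16_t(~(0xFFFFu >> Amt)); // high bits zero
    K.One  = uint16_t(K.One >> Amt);
  } else { // arithmetic right: sign-extend both masks, as ashrInPlace does
    K.Zero = uint16_t(int16_t(K.Zero) >> Amt);
    K.One  = uint16_t(int16_t(K.One) >> Amt);
  }
  return K;
}

int main() {
  KnownBits16 K{0x00FF, 0x0F00}; // low byte known zero, next nibble known one
  KnownBits16 R = shiftKnown(K, 4, ShiftKind::LogicalLeft);
  std::printf("Zero=0x%04X One=0x%04X\n", R.Zero, R.One); // Zero=0x0FFF One=0xF000
}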
38973
38974unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
38975 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
38976 unsigned Depth) const {
38977 EVT VT = Op.getValueType();
38978 unsigned VTBits = VT.getScalarSizeInBits();
38979 unsigned Opcode = Op.getOpcode();
38980 switch (Opcode) {
38981 case X86ISD::SETCC_CARRY:
38982 // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
38983 return VTBits;
38984
38985 case X86ISD::VTRUNC: {
38986 SDValue Src = Op.getOperand(0);
38987 MVT SrcVT = Src.getSimpleValueType();
38988 unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
38989 assert(VTBits < NumSrcBits && "Illegal truncation input type");
38990 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
38991 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedSrc, Depth + 1);
38992 if (Tmp > (NumSrcBits - VTBits))
38993 return Tmp - (NumSrcBits - VTBits);
38994 return 1;
38995 }
38996
38997 case X86ISD::PACKSS: {
38998 // PACKSS is just a truncation if the sign bits extend to the packed size.
38999 APInt DemandedLHS, DemandedRHS;
39000 getPackDemandedElts(Op.getValueType(), DemandedElts, DemandedLHS,
39001 DemandedRHS);
39002
39003 unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
39004 unsigned Tmp0 = SrcBits, Tmp1 = SrcBits;
39005 if (!!DemandedLHS)
39006 Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), DemandedLHS, Depth + 1);
39007 if (!!DemandedRHS)
39008 Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), DemandedRHS, Depth + 1);
39009 unsigned Tmp = std::min(Tmp0, Tmp1);
39010 if (Tmp > (SrcBits - VTBits))
39011 return Tmp - (SrcBits - VTBits);
39012 return 1;
39013 }
39014
39015 case X86ISD::VBROADCAST: {
39016 SDValue Src = Op.getOperand(0);
39017 if (!Src.getSimpleValueType().isVector())
39018 return DAG.ComputeNumSignBits(Src, Depth + 1);
39019 break;
39020 }
39021
39022 case X86ISD::VSHLI: {
39023 SDValue Src = Op.getOperand(0);
39024 const APInt &ShiftVal = Op.getConstantOperandAPInt(1);
39025 if (ShiftVal.uge(VTBits))
39026 return VTBits; // Shifted all bits out --> zero.
39027 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
39028 if (ShiftVal.uge(Tmp))
39029 return 1; // Shifted all sign bits out --> unknown.
39030 return Tmp - ShiftVal.getZExtValue();
39031 }
39032
39033 case X86ISD::VSRAI: {
39034 SDValue Src = Op.getOperand(0);
39035 APInt ShiftVal = Op.getConstantOperandAPInt(1);
39036 if (ShiftVal.uge(VTBits - 1))
39037 return VTBits; // Sign splat.
39038 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
39039 ShiftVal += Tmp;
39040 return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
39041 }
39042
39043 case X86ISD::FSETCC:
39044 // cmpss/cmpsd return zero/all-bits result values in the bottom element.
39045 if (VT == MVT::f32 || VT == MVT::f64 ||
39046 ((VT == MVT::v4f32 || VT == MVT::v2f64) && DemandedElts == 1))
39047 return VTBits;
39048 break;
39049
39050 case X86ISD::PCMPGT:
39051 case X86ISD::PCMPEQ:
39052 case X86ISD::CMPP:
39053 case X86ISD::VPCOM:
39054 case X86ISD::VPCOMU:
39055 // Vector compares return zero/all-bits result values.
39056 return VTBits;
39057
39058 case X86ISD::ANDNP: {
39059 unsigned Tmp0 =
39060 DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
39061 if (Tmp0 == 1) return 1; // Early out.
39062 unsigned Tmp1 =
39063 DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
39064 return std::min(Tmp0, Tmp1);
39065 }
39066
39067 case X86ISD::CMOV: {
39068 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
39069 if (Tmp0 == 1) return 1; // Early out.
39070 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
39071 return std::min(Tmp0, Tmp1);
39072 }
39073 }
39074
39075 // Handle target shuffles.
39076 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
39077 if (isTargetShuffle(Opcode)) {
39078 SmallVector<int, 64> Mask;
39079 SmallVector<SDValue, 2> Ops;
39080 if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask)) {
39081 unsigned NumOps = Ops.size();
39082 unsigned NumElts = VT.getVectorNumElements();
39083 if (Mask.size() == NumElts) {
39084 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
39085 for (unsigned i = 0; i != NumElts; ++i) {
39086 if (!DemandedElts[i])
39087 continue;
39088 int M = Mask[i];
39089 if (M == SM_SentinelUndef) {
39090 // For UNDEF elements, we don't know anything about the common state
39091 // of the shuffle result.
39092 return 1;
39093 } else if (M == SM_SentinelZero) {
39094 // Zero = all sign bits.
39095 continue;
39096 }
39097 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
39098 "Shuffle index out of range");
39099
39100 unsigned OpIdx = (unsigned)M / NumElts;
39101 unsigned EltIdx = (unsigned)M % NumElts;
39102 if (Ops[OpIdx].getValueType() != VT) {
39103 // TODO - handle target shuffle ops with different value types.
39104 return 1;
39105 }
39106 DemandedOps[OpIdx].setBit(EltIdx);
39107 }
39108 unsigned Tmp0 = VTBits;
39109 for (unsigned i = 0; i != NumOps && Tmp0 > 1; ++i) {
39110 if (!DemandedOps[i])
39111 continue;
39112 unsigned Tmp1 =
39113 DAG.ComputeNumSignBits(Ops[i], DemandedOps[i], Depth + 1);
39114 Tmp0 = std::min(Tmp0, Tmp1);
39115 }
39116 return Tmp0;
39117 }
39118 }
39119 }
39120
39121 // Fallback case.
39122 return 1;
39123}
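
The VSHLI/VSRAI sign-bit bookkeeping above reduces to simple arithmetic on the source's sign-bit count; a small illustration with made-up values follows (signBitsAfterShift is not an LLVM helper).

#include <algorithm>
#include <cstdio>

// Known sign bits of an element with VTBits bits after an immediate shift,
// given the sign-bit count of the source operand.
static unsigned signBitsAfterShift(unsigned SrcSignBits, unsigned ShAmt,
                                   unsigned VTBits, bool ArithmeticRight) {
  if (ArithmeticRight) {                            // VSRAI
    if (ShAmt >= VTBits - 1)
      return VTBits;                                // splat of the sign bit
    return std::min(VTBits, SrcSignBits + ShAmt);   // gains ShAmt copies
  }
  if (ShAmt >= VTBits)                              // VSHLI
    return VTBits;                                  // everything shifted out: zero
  if (ShAmt >= SrcSignBits)
    return 1;                                       // sign bits shifted out: unknown
  return SrcSignBits - ShAmt;
}

int main() {
  // e.g. a 16-bit element known to have 5 sign bits, shifted by 3:
  std::printf("%u\n", signBitsAfterShift(5, 3, 16, /*ArithmeticRight=*/false)); // 2
  std::printf("%u\n", signBitsAfterShift(5, 3, 16, /*ArithmeticRight=*/true));  // 8
}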
39124
39125SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
39126 if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
39127 return N->getOperand(0);
39128 return N;
39129}
39130
39131// Helper to look for a normal load that can be narrowed into a vzload with the
39132// specified VT and memory VT. Returns SDValue() on failure.
39133static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT,
39134 SelectionDAG &DAG) {
39135 // Can't if the load is volatile or atomic.
39136 if (!LN->isSimple())
39137 return SDValue();
39138
39139 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
39140 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
39141 return DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, SDLoc(LN), Tys, Ops, MemVT,
39142 LN->getPointerInfo(), LN->getOriginalAlign(),
39143 LN->getMemOperand()->getFlags());
39144}
39145
39146// Attempt to match a combined shuffle mask against supported unary shuffle
39147// instructions.
39148// TODO: Investigate sharing more of this with shuffle lowering.
39149static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
39150 bool AllowFloatDomain, bool AllowIntDomain,
39151 SDValue V1, const SelectionDAG &DAG,
39152 const X86Subtarget &Subtarget, unsigned &Shuffle,
39153 MVT &SrcVT, MVT &DstVT) {
39154 unsigned NumMaskElts = Mask.size();
39155 unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
39156
39157 // Match against a VZEXT_MOVL vXi32 and vXi16 zero-extending instruction.
39158 if (Mask[0] == 0 &&
39159 (MaskEltSize == 32 || (MaskEltSize == 16 && Subtarget.hasFP16()))) {
39160 if ((isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) ||
39161 (V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
39162 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1))) {
39163 Shuffle = X86ISD::VZEXT_MOVL;
39164 if (MaskEltSize == 16)
39165 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
39166 else
39167 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
39168 return true;
39169 }
39170 }
39171
39172 // Match against a ANY/ZERO_EXTEND_VECTOR_INREG instruction.
39173 // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
39174 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
39175 (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
39176 unsigned MaxScale = 64 / MaskEltSize;
39177 for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
39178 bool MatchAny = true;
39179 bool MatchZero = true;
39180 unsigned NumDstElts = NumMaskElts / Scale;
39181 for (unsigned i = 0; i != NumDstElts && (MatchAny || MatchZero); ++i) {
39182 if (!isUndefOrEqual(Mask[i * Scale], (int)i)) {
39183 MatchAny = MatchZero = false;
39184 break;
39185 }
39186 MatchAny &= isUndefInRange(Mask, (i * Scale) + 1, Scale - 1);
39187 MatchZero &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
39188 }
39189 if (MatchAny || MatchZero) {
39190 assert(MatchZero && "Failed to match zext but matched aext?");
39191 unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
39192 MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType() :
39193 MVT::getIntegerVT(MaskEltSize);
39194 SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);
39195
39196 Shuffle = unsigned(MatchAny ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND);
39197 if (SrcVT.getVectorNumElements() != NumDstElts)
39198 Shuffle = DAG.getOpcode_EXTEND_VECTOR_INREG(Shuffle);
39199
39200 DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
39201 DstVT = MVT::getVectorVT(DstVT, NumDstElts);
39202 return true;
39203 }
39204 }
39205 }
39206
39207 // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
39208 if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2()) ||
39209 (MaskEltSize == 16 && Subtarget.hasFP16())) &&
39210 isUndefOrEqual(Mask[0], 0) &&
39211 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
39212 Shuffle = X86ISD::VZEXT_MOVL;
39213 if (MaskEltSize == 16)
39214 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
39215 else
39216 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
39217 return true;
39218 }
39219
39220 // Check if we have SSE3 which will let us use MOVDDUP etc. The
39221 // instructions are no slower than UNPCKLPD but have the option to
39222 // fold the input operand into even an unaligned memory load.
39223 if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
39224 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG, V1)) {
39225 Shuffle = X86ISD::MOVDDUP;
39226 SrcVT = DstVT = MVT::v2f64;
39227 return true;
39228 }
39229 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
39230 Shuffle = X86ISD::MOVSLDUP;
39231 SrcVT = DstVT = MVT::v4f32;
39232 return true;
39233 }
39234 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3}, DAG, V1)) {
39235 Shuffle = X86ISD::MOVSHDUP;
39236 SrcVT = DstVT = MVT::v4f32;
39237 return true;
39238 }
39239 }
39240
39241 if (MaskVT.is256BitVector() && AllowFloatDomain) {
39242 assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
39243 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
39244 Shuffle = X86ISD::MOVDDUP;
39245 SrcVT = DstVT = MVT::v4f64;
39246 return true;
39247 }
39248 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
39249 V1)) {
39250 Shuffle = X86ISD::MOVSLDUP;
39251 SrcVT = DstVT = MVT::v8f32;
39252 return true;
39253 }
39254 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3, 5, 5, 7, 7}, DAG,
39255 V1)) {
39256 Shuffle = X86ISD::MOVSHDUP;
39257 SrcVT = DstVT = MVT::v8f32;
39258 return true;
39259 }
39260 }
39261
39262 if (MaskVT.is512BitVector() && AllowFloatDomain) {
39263 assert(Subtarget.hasAVX512() &&
39264 "AVX512 required for 512-bit vector shuffles");
39265 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
39266 V1)) {
39267 Shuffle = X86ISD::MOVDDUP;
39268 SrcVT = DstVT = MVT::v8f64;
39269 return true;
39270 }
39271 if (isTargetShuffleEquivalent(
39272 MaskVT, Mask,
39273 {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}, DAG, V1)) {
39274 Shuffle = X86ISD::MOVSLDUP;
39275 SrcVT = DstVT = MVT::v16f32;
39276 return true;
39277 }
39278 if (isTargetShuffleEquivalent(
39279 MaskVT, Mask,
39280 {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}, DAG, V1)) {
39281 Shuffle = X86ISD::MOVSHDUP;
39282 SrcVT = DstVT = MVT::v16f32;
39283 return true;
39284 }
39285 }
39286
39287 return false;
39288}
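
A standalone sketch of the ANY/ZERO_EXTEND_VECTOR_INREG matching loop above: a mask matches an extend by Scale if element i*Scale reads source element i and the Scale-1 gap elements are undef (any-extend) or undef/zero (zero-extend). Here -1 and -2 are stand-ins for SM_SentinelUndef and SM_SentinelZero, and the sample mask is made up.

#include <cstdio>
#include <vector>

constexpr int Undef = -1; // stand-in for SM_SentinelUndef
constexpr int Zero  = -2; // stand-in for SM_SentinelZero

static bool matchExtend(const std::vector<int> &Mask, unsigned Scale,
                        bool &MatchAny, bool &MatchZero) {
  MatchAny = MatchZero = true;
  unsigned NumDstElts = unsigned(Mask.size()) / Scale;
  for (unsigned i = 0; i != NumDstElts && (MatchAny || MatchZero); ++i) {
    int M = Mask[i * Scale];
    if (M != Undef && M != int(i)) {        // must read source element i
      MatchAny = MatchZero = false;
      break;
    }
    for (unsigned j = 1; j != Scale; ++j) { // check the gap elements
      int G = Mask[i * Scale + j];
      MatchAny  = MatchAny && (G == Undef);
      MatchZero = MatchZero && (G == Undef || G == Zero);
    }
  }
  return MatchAny || MatchZero;
}

int main() {
  bool Any, ZeroExt;
  // v8i16-style mask behaving like a zero-extend of the low 4 elements.
  std::vector<int> Mask = {0, Zero, 1, Zero, 2, Undef, 3, Zero};
  if (matchExtend(Mask, 2, Any, ZeroExt))
    std::printf("any=%d zero=%d\n", Any, ZeroExt); // prints any=0 zero=1
}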
39289
39290// Attempt to match a combined shuffle mask against supported unary immediate
39291// permute instructions.
39292// TODO: Investigate sharing more of this with shuffle lowering.
39293static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
39294 const APInt &Zeroable,
39295 bool AllowFloatDomain, bool AllowIntDomain,
39296 const SelectionDAG &DAG,
39297 const X86Subtarget &Subtarget,
39298 unsigned &Shuffle, MVT &ShuffleVT,
39299 unsigned &PermuteImm) {
39300 unsigned NumMaskElts = Mask.size();
39301 unsigned InputSizeInBits = MaskVT.getSizeInBits();
39302 unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
39303 MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
39304 bool ContainsZeros = isAnyZero(Mask);
39305
39306 // Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
39307 if (!ContainsZeros && MaskScalarSizeInBits == 64) {
39308 // Check for lane crossing permutes.
39309 if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
39310 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
39311 if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
39312 Shuffle = X86ISD::VPERMI;
39313 ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
39314 PermuteImm = getV4X86ShuffleImm(Mask);
39315 return true;
39316 }
39317 if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
39318 SmallVector<int, 4> RepeatedMask;
39319 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
39320 Shuffle = X86ISD::VPERMI;
39321 ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
39322 PermuteImm = getV4X86ShuffleImm(RepeatedMask);
39323 return true;
39324 }
39325 }
39326 } else if (AllowFloatDomain && Subtarget.hasAVX()) {
39327 // VPERMILPD can permute with a non-repeating shuffle.
39328 Shuffle = X86ISD::VPERMILPI;
39329 ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
39330 PermuteImm = 0;
39331 for (int i = 0, e = Mask.size(); i != e; ++i) {
39332 int M = Mask[i];
39333 if (M == SM_SentinelUndef)
39334 continue;
39335 assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
39336 PermuteImm |= (M & 1) << i;
39337 }
39338 return true;
39339 }
39340 }
39341
39342 // We are checking for a shuffle match or a shift match. Loop twice so we
39343 // can choose which one to try to match first, depending on target preference.
39344 for (unsigned Order = 0; Order < 2; ++Order) {
39345 if (Subtarget.preferLowerShuffleAsShift() ? (Order == 1) : (Order == 0)) {
39346 // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
39347 // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
39348 // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
39349 if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
39350 !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
39351 SmallVector<int, 4> RepeatedMask;
39352 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
39353 // Narrow the repeated mask to create 32-bit element permutes.
39354 SmallVector<int, 4> WordMask = RepeatedMask;
39355 if (MaskScalarSizeInBits == 64)
39356 narrowShuffleMaskElts(2, RepeatedMask, WordMask);
39357
39358 Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
39359 ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
39360 ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
39361 PermuteImm = getV4X86ShuffleImm(WordMask);
39362 return true;
39363 }
39364 }
39365
39366 // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
39367 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16 &&
39368 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39369 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39370 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
39371 SmallVector<int, 4> RepeatedMask;
39372 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
39373 ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4);
39374 ArrayRef<int> HiMask(RepeatedMask.data() + 4, 4);
39375
39376 // PSHUFLW: permute lower 4 elements only.
39377 if (isUndefOrInRange(LoMask, 0, 4) &&
39378 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
39379 Shuffle = X86ISD::PSHUFLW;
39380 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
39381 PermuteImm = getV4X86ShuffleImm(LoMask);
39382 return true;
39383 }
39384
39385 // PSHUFHW: permute upper 4 elements only.
39386 if (isUndefOrInRange(HiMask, 4, 8) &&
39387 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
39388 // Offset the HiMask so that we can create the shuffle immediate.
39389 int OffsetHiMask[4];
39390 for (int i = 0; i != 4; ++i)
39391 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
39392
39393 Shuffle = X86ISD::PSHUFHW;
39394 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
39395 PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
39396 return true;
39397 }
39398 }
39399 }
39400 } else {
39401 // Attempt to match against bit rotates.
39402 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits < 64 &&
39403 ((MaskVT.is128BitVector() && Subtarget.hasXOP()) ||
39404 Subtarget.hasAVX512())) {
39405 int RotateAmt = matchShuffleAsBitRotate(ShuffleVT, MaskScalarSizeInBits,
39406 Subtarget, Mask);
39407 if (0 < RotateAmt) {
39408 Shuffle = X86ISD::VROTLI;
39409 PermuteImm = (unsigned)RotateAmt;
39410 return true;
39411 }
39412 }
39413 }
39414 // Attempt to match against byte/bit shifts.
39415 if (AllowIntDomain &&
39416 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39417 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39418 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39419 int ShiftAmt =
39420 matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits, Mask, 0,
39421 Zeroable, Subtarget);
39422 if (0 < ShiftAmt && (!ShuffleVT.is512BitVector() || Subtarget.hasBWI() ||
39423 32 <= ShuffleVT.getScalarSizeInBits())) {
39424 // Byte shifts can be slower so only match them on second attempt.
39425 if (Order == 0 &&
39426 (Shuffle == X86ISD::VSHLDQ || Shuffle == X86ISD::VSRLDQ))
39427 continue;
39428
39429 PermuteImm = (unsigned)ShiftAmt;
39430 return true;
39431 }
39432
39433 }
39434 }
39435
39436 return false;
39437}
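
The PSHUFD/PSHUFLW/PSHUFHW paths above all finish by packing a 4-element repeated mask into a 2-bits-per-lane immediate; the sketch below re-implements that packing for illustration (it is not LLVM's getV4X86ShuffleImm, and the choice made for undef lanes here is arbitrary).

#include <cstdio>

// Pack a 4-element shuffle mask (values 0..3, or -1 for undef) into the
// immediate byte used by PSHUFD/PSHUFLW/PSHUFHW/VPERMILPS.
static unsigned packV4ShuffleImm(const int (&Mask)[4]) {
  unsigned Imm = 0;
  for (int i = 0; i != 4; ++i) {
    int M = Mask[i] < 0 ? 0 : Mask[i]; // an undef lane may pick any source lane
    Imm |= unsigned(M & 3) << (i * 2);
  }
  return Imm;
}

int main() {
  int Broadcast[4] = {0, 0, 0, 0}; // splat lane 0
  int Reverse[4]   = {3, 2, 1, 0}; // reverse the four lanes
  std::printf("0x%02X 0x%02X\n", packV4ShuffleImm(Broadcast),
              packV4ShuffleImm(Reverse)); // prints 0x00 0x1B
}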
39438
39439// Attempt to match a combined unary shuffle mask against supported binary
39440// shuffle instructions.
39441// TODO: Investigate sharing more of this with shuffle lowering.
39442static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
39443 bool AllowFloatDomain, bool AllowIntDomain,
39444 SDValue &V1, SDValue &V2, const SDLoc &DL,
39445 SelectionDAG &DAG, const X86Subtarget &Subtarget,
39446 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
39447 bool IsUnary) {
39448 unsigned NumMaskElts = Mask.size();
39449 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
39450 unsigned SizeInBits = MaskVT.getSizeInBits();
39451
39452 if (MaskVT.is128BitVector()) {
39453 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG) &&
39454 AllowFloatDomain) {
39455 V2 = V1;
39456 V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1);
39457 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS;
39458 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
39459 return true;
39460 }
39461 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1}, DAG) &&
39462 AllowFloatDomain) {
39463 V2 = V1;
39464 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS;
39465 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
39466 return true;
39467 }
39468 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 3}, DAG) &&
39469 Subtarget.hasSSE2() && (AllowFloatDomain || !Subtarget.hasSSE41())) {
39470 std::swap(V1, V2);
39471 Shuffle = X86ISD::MOVSD;
39472 SrcVT = DstVT = MVT::v2f64;
39473 return true;
39474 }
39475 if (isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG) &&
39476 (AllowFloatDomain || !Subtarget.hasSSE41())) {
39477 Shuffle = X86ISD::MOVSS;
39478 SrcVT = DstVT = MVT::v4f32;
39479 return true;
39480 }
39481 if (isTargetShuffleEquivalent(MaskVT, Mask, {8, 1, 2, 3, 4, 5, 6, 7},
39482 DAG) &&
39483 Subtarget.hasFP16()) {
39484 Shuffle = X86ISD::MOVSH;
39485 SrcVT = DstVT = MVT::v8f16;
39486 return true;
39487 }
39488 }
39489
39490 // Attempt to match against either a unary or binary PACKSS/PACKUS shuffle.
39491 if (((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) ||
39492 ((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) && Subtarget.hasInt256()) ||
39493 ((MaskVT == MVT::v32i16 || MaskVT == MVT::v64i8) && Subtarget.hasBWI())) {
39494 if (matchShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
39495 Subtarget)) {
39496 DstVT = MaskVT;
39497 return true;
39498 }
39499 }
39500
39501 // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
39502 if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
39503 (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39504 (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
39505 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39506 (MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
39507 if (matchShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL, DAG,
39508 Subtarget)) {
39509 SrcVT = DstVT = MaskVT;
39510 if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
39511 SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
39512 return true;
39513 }
39514 }
39515
39516 // Attempt to match against an OR if we're performing a blend shuffle and the
39517 // non-blended source element is zero in each case.
39518 // TODO: Handle cases where V1/V2 sizes don't match SizeInBits.
39519 if (SizeInBits == V1.getValueSizeInBits() &&
39520 SizeInBits == V2.getValueSizeInBits() &&
39521 (EltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
39522 (EltSizeInBits % V2.getScalarValueSizeInBits()) == 0) {
39523 bool IsBlend = true;
39524 unsigned NumV1Elts = V1.getValueType().getVectorNumElements();
39525 unsigned NumV2Elts = V2.getValueType().getVectorNumElements();
39526 unsigned Scale1 = NumV1Elts / NumMaskElts;
39527 unsigned Scale2 = NumV2Elts / NumMaskElts;
39528 APInt DemandedZeroV1 = APInt::getZero(NumV1Elts);
39529 APInt DemandedZeroV2 = APInt::getZero(NumV2Elts);
39530 for (unsigned i = 0; i != NumMaskElts; ++i) {
39531 int M = Mask[i];
39532 if (M == SM_SentinelUndef)
39533 continue;
39534 if (M == SM_SentinelZero) {
39535 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
39536 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
39537 continue;
39538 }
39539 if (M == (int)i) {
39540 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
39541 continue;
39542 }
39543 if (M == (int)(i + NumMaskElts)) {
39544 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
39545 continue;
39546 }
39547 IsBlend = false;
39548 break;
39549 }
39550 if (IsBlend) {
39551 if (DAG.MaskedVectorIsZero(V1, DemandedZeroV1) &&
39552 DAG.MaskedVectorIsZero(V2, DemandedZeroV2)) {
39553 Shuffle = ISD::OR;
39554 SrcVT = DstVT = MaskVT.changeTypeToInteger();
39555 return true;
39556 }
39557 if (NumV1Elts == NumV2Elts && NumV1Elts == NumMaskElts) {
39558 // FIXME: handle mismatched sizes?
39559 // TODO: investigate if `ISD::OR` handling in
39560 // `TargetLowering::SimplifyDemandedVectorElts` can be improved instead.
39561 auto computeKnownBitsElementWise = [&DAG](SDValue V) {
39562 unsigned NumElts = V.getValueType().getVectorNumElements();
39563 KnownBits Known(NumElts);
39564 for (unsigned EltIdx = 0; EltIdx != NumElts; ++EltIdx) {
39565 APInt Mask = APInt::getOneBitSet(NumElts, EltIdx);
39566 KnownBits PeepholeKnown = DAG.computeKnownBits(V, Mask);
39567 if (PeepholeKnown.isZero())
39568 Known.Zero.setBit(EltIdx);
39569 if (PeepholeKnown.isAllOnes())
39570 Known.One.setBit(EltIdx);
39571 }
39572 return Known;
39573 };
39574
39575 KnownBits V1Known = computeKnownBitsElementWise(V1);
39576 KnownBits V2Known = computeKnownBitsElementWise(V2);
39577
39578 for (unsigned i = 0; i != NumMaskElts && IsBlend; ++i) {
39579 int M = Mask[i];
39580 if (M == SM_SentinelUndef)
39581 continue;
39582 if (M == SM_SentinelZero) {
39583 IsBlend &= V1Known.Zero[i] && V2Known.Zero[i];
39584 continue;
39585 }
39586 if (M == (int)i) {
39587 IsBlend &= V2Known.Zero[i] || V1Known.One[i];
39588 continue;
39589 }
39590 if (M == (int)(i + NumMaskElts)) {
39591 IsBlend &= V1Known.Zero[i] || V2Known.One[i];
39592 continue;
39593 }
39594 llvm_unreachable("will not get here.");
39595 }
39596 if (IsBlend) {
39597 Shuffle = ISD::OR;
39598 SrcVT = DstVT = MaskVT.changeTypeToInteger();
39599 return true;
39600 }
39601 }
39602 }
39603 }
39604
39605 return false;
39606}
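
The blend-as-OR case above works because the unchosen source is known zero in every lane, so a plain bitwise OR reproduces the per-lane select; a scalar illustration with made-up lane values follows.

#include <cstdint>
#include <cstdio>

int main() {
  // Four 16-bit "lanes" packed into 64 bits. Each lane is zero in exactly one
  // of the two sources, as the MaskedVectorIsZero checks require.
  uint64_t V1 = 0x0000111100002222ull; // lanes 0 and 2 in use, lanes 1 and 3 zero
  uint64_t V2 = 0x3333000044440000ull; // lanes 1 and 3 in use, lanes 0 and 2 zero
  uint64_t Blend = V1 | V2;            // identical to selecting lane by lane
  std::printf("0x%016llX\n", (unsigned long long)Blend); // 0x3333111144442222
}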
39607
39608static bool matchBinaryPermuteShuffle(
39609 MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable,
39610 bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2,
39611 const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget,
39612 unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) {
39613 unsigned NumMaskElts = Mask.size();
39614 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
39615
39616 // Attempt to match against VALIGND/VALIGNQ rotate.
39617 if (AllowIntDomain && (EltSizeInBits == 64 || EltSizeInBits == 32) &&
39618 ((MaskVT.is128BitVector() && Subtarget.hasVLX()) ||
39619 (MaskVT.is256BitVector() && Subtarget.hasVLX()) ||
39620 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39621 if (!isAnyZero(Mask)) {
39622 int Rotation = matchShuffleAsElementRotate(V1, V2, Mask);
39623 if (0 < Rotation) {
39624 Shuffle = X86ISD::VALIGN;
39625 if (EltSizeInBits == 64)
39626 ShuffleVT = MVT::getVectorVT(MVT::i64, MaskVT.getSizeInBits() / 64);
39627 else
39628 ShuffleVT = MVT::getVectorVT(MVT::i32, MaskVT.getSizeInBits() / 32);
39629 PermuteImm = Rotation;
39630 return true;
39631 }
39632 }
39633 }
39634
39635 // Attempt to match against PALIGNR byte rotate.
39636 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
39637 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39638 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
39639 int ByteRotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask);
39640 if (0 < ByteRotation) {
39641 Shuffle = X86ISD::PALIGNR;
39642 ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
39643 PermuteImm = ByteRotation;
39644 return true;
39645 }
39646 }
39647
39648 // Attempt to combine to X86ISD::BLENDI.
39649 if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
39650 (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
39651 (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
39652 uint64_t BlendMask = 0;
39653 bool ForceV1Zero = false, ForceV2Zero = false;
39654 SmallVector<int, 8> TargetMask(Mask);
39655 if (matchShuffleAsBlend(MaskVT, V1, V2, TargetMask, Zeroable, ForceV1Zero,
39656 ForceV2Zero, BlendMask)) {
39657 if (MaskVT == MVT::v16i16) {
39658 // We can only use v16i16 PBLENDW if the lanes are repeated.
39659 SmallVector<int, 8> RepeatedMask;
39660 if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
39661 RepeatedMask)) {
39662 assert(RepeatedMask.size() == 8 &&
39663 "Repeated mask size doesn't match!");
39664 PermuteImm = 0;
39665 for (int i = 0; i < 8; ++i)
39666 if (RepeatedMask[i] >= 8)
39667 PermuteImm |= 1 << i;
39668 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
39669 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
39670 Shuffle = X86ISD::BLENDI;
39671 ShuffleVT = MaskVT;
39672 return true;
39673 }
39674 } else {
39675 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
39676 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
39677 PermuteImm = (unsigned)BlendMask;
39678 Shuffle = X86ISD::BLENDI;
39679 ShuffleVT = MaskVT;
39680 return true;
39681 }
39682 }
39683 }
39684
39685 // Attempt to combine to INSERTPS, but only if it has elements that need to
39686 // be set to zero.
39687 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
39688 MaskVT.is128BitVector() && isAnyZero(Mask) &&
39689 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
39690 Shuffle = X86ISD::INSERTPS;
39691 ShuffleVT = MVT::v4f32;
39692 return true;
39693 }
39694
39695 // Attempt to combine to SHUFPD.
39696 if (AllowFloatDomain && EltSizeInBits == 64 &&
39697 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39698 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
39699 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39700 bool ForceV1Zero = false, ForceV2Zero = false;
39701 if (matchShuffleWithSHUFPD(MaskVT, V1, V2, ForceV1Zero, ForceV2Zero,
39702 PermuteImm, Mask, Zeroable)) {
39703 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
39704 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
39705 Shuffle = X86ISD::SHUFP;
39706 ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
39707 return true;
39708 }
39709 }
39710
39711 // Attempt to combine to SHUFPS.
39712 if (AllowFloatDomain && EltSizeInBits == 32 &&
39713 ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
39714 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
39715 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39716 SmallVector<int, 4> RepeatedMask;
39717 if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
39718 // Match each half of the repeated mask to determine if it's just
39719 // referencing one of the vectors, is zeroable, or is entirely undef.
39720 auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
39721 int M0 = RepeatedMask[Offset];
39722 int M1 = RepeatedMask[Offset + 1];
39723
39724 if (isUndefInRange(RepeatedMask, Offset, 2)) {
39725 return DAG.getUNDEF(MaskVT);
39726 } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
39727 S0 = (SM_SentinelUndef == M0 ? -1 : 0);
39728 S1 = (SM_SentinelUndef == M1 ? -1 : 1);
39729 return getZeroVector(MaskVT, Subtarget, DAG, DL);
39730 } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
39731 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
39732 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
39733 return V1;
39734 } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
39735 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
39736 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
39737 return V2;
39738 }
39739
39740 return SDValue();
39741 };
39742
39743 int ShufMask[4] = {-1, -1, -1, -1};
39744 SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
39745 SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
39746
39747 if (Lo && Hi) {
39748 V1 = Lo;
39749 V2 = Hi;
39750 Shuffle = X86ISD::SHUFP;
39751 ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
39752 PermuteImm = getV4X86ShuffleImm(ShufMask);
39753 return true;
39754 }
39755 }
39756 }
39757
39758 // Attempt to combine to INSERTPS more generally if X86ISD::SHUFP failed.
39759 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
39760 MaskVT.is128BitVector() &&
39761 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
39762 Shuffle = X86ISD::INSERTPS;
39763 ShuffleVT = MVT::v4f32;
39764 return true;
39765 }
39766
39767 return false;
39768}
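
The BLENDI matching above boils down to setting bit i of the immediate whenever lane i is taken from the second source; a minimal sketch follows (matchBlendImm and the sample mask are illustrative, -1 marks an undef lane, and the zeroable-lane handling of the real matchShuffleAsBlend is ignored).

#include <cstdio>
#include <vector>

// Build a blend immediate for an N-lane two-input mask: lane i reads element i
// of V1 (bit clear) or element i + N of V2 (bit set). Returns ~0u if some
// element moves across lanes, i.e. the mask is not a per-lane blend.
static unsigned matchBlendImm(const std::vector<int> &Mask) {
  unsigned N = unsigned(Mask.size()), Imm = 0;
  for (unsigned i = 0; i != N; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;                // undef lane: either source works
    if (M == int(i))
      continue;                // lane taken from V1
    if (M == int(i + N))
      Imm |= 1u << i;          // lane taken from V2
    else
      return ~0u;              // not a blend
  }
  return Imm;
}

int main() {
  // v8i16-style mask taking lanes 1, 4 and 5 from the second vector.
  std::vector<int> Mask = {0, 9, 2, 3, 12, 13, -1, 7};
  std::printf("imm=0x%02X\n", matchBlendImm(Mask)); // prints imm=0x32
}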
39769
39770static SDValue combineX86ShuffleChainWithExtract(
39771 ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
39772 bool HasVariableMask, bool AllowVariableCrossLaneMask,
39773 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
39774 const X86Subtarget &Subtarget);
39775
39776/// Combine an arbitrary chain of shuffles into a single instruction if
39777/// possible.
39778///
39779/// This is the leaf of the recursive combine below. When we have found some
39780/// chain of single-use x86 shuffle instructions and accumulated the combined
39781/// shuffle mask represented by them, this will try to pattern match that mask
39782/// into either a single instruction if there is a special purpose instruction
39783/// for this operation, or into a PSHUFB instruction which is a fully general
39784/// instruction but should only be used to replace chains over a certain depth.
39785static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
39786 ArrayRef<int> BaseMask, int Depth,
39787 bool HasVariableMask,
39788 bool AllowVariableCrossLaneMask,
39789 bool AllowVariablePerLaneMask,
39790 SelectionDAG &DAG,
39791 const X86Subtarget &Subtarget) {
39792 assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
39793 assert((Inputs.size() == 1 || Inputs.size() == 2) &&
39794 "Unexpected number of shuffle inputs!");
39795
39796 SDLoc DL(Root);
39797 MVT RootVT = Root.getSimpleValueType();
39798 unsigned RootSizeInBits = RootVT.getSizeInBits();
39799 unsigned NumRootElts = RootVT.getVectorNumElements();
39800
39801 // Canonicalize shuffle input op to the requested type.
39802 auto CanonicalizeShuffleInput = [&](MVT VT, SDValue Op) {
39803 if (VT.getSizeInBits() > Op.getValueSizeInBits())
39804 Op = widenSubVector(Op, false, Subtarget, DAG, DL, VT.getSizeInBits());
39805 else if (VT.getSizeInBits() < Op.getValueSizeInBits())
39806 Op = extractSubVector(Op, 0, DAG, DL, VT.getSizeInBits());
39807 return DAG.getBitcast(VT, Op);
39808 };
39809
39810 // Find the inputs that enter the chain. Note that multiple uses are OK
39811 // here, we're not going to remove the operands we find.
39812 bool UnaryShuffle = (Inputs.size() == 1);
39813 SDValue V1 = peekThroughBitcasts(Inputs[0]);
39814 SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
39815 : peekThroughBitcasts(Inputs[1]));
39816
39817 MVT VT1 = V1.getSimpleValueType();
39818 MVT VT2 = V2.getSimpleValueType();
39819 assert((RootSizeInBits % VT1.getSizeInBits()) == 0 &&
39820 (RootSizeInBits % VT2.getSizeInBits()) == 0 && "Vector size mismatch");
39821
39822 SDValue Res;
39823
39824 unsigned NumBaseMaskElts = BaseMask.size();
39825 if (NumBaseMaskElts == 1) {
39826 assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
39827 return CanonicalizeShuffleInput(RootVT, V1);
39828 }
39829
39830 bool OptForSize = DAG.shouldOptForSize();
39831 unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
39832 bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
39833 (RootVT.isFloatingPoint() && Depth >= 1) ||
39834 (RootVT.is256BitVector() && !Subtarget.hasAVX2());
39835
39836 // Don't combine if we are an AVX512/EVEX target and the mask element size
39837 // is different from the root element size - this would prevent writemasks
39838 // from being reused.
39839 bool IsMaskedShuffle = false;
39840 if (RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128)) {
39841 if (Root.hasOneUse() && Root->use_begin()->getOpcode() == ISD::VSELECT &&
39842 Root->use_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {
39843 IsMaskedShuffle = true;
39844 }
39845 }
39846
39847 // If we are shuffling a splat (and not introducing zeros) then we can just
39848 // use it directly. This works for smaller elements as well as they already
39849 // repeat across each mask element.
39850 if (UnaryShuffle && !isAnyZero(BaseMask) &&
39851 V1.getValueSizeInBits() >= RootSizeInBits &&
39852 (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
39853 DAG.isSplatValue(V1, /*AllowUndefs*/ false)) {
39854 return CanonicalizeShuffleInput(RootVT, V1);
39855 }
39856
39857 SmallVector<int, 64> Mask(BaseMask);
39858
39859 // See if the shuffle is a hidden identity shuffle - repeated args in HOPs
39860 // etc. can be simplified.
39861 if (VT1 == VT2 && VT1.getSizeInBits() == RootSizeInBits && VT1.isVector()) {
39862 SmallVector<int> ScaledMask, IdentityMask;
39863 unsigned NumElts = VT1.getVectorNumElements();
39864 if (Mask.size() <= NumElts &&
39865 scaleShuffleElements(Mask, NumElts, ScaledMask)) {
39866 for (unsigned i = 0; i != NumElts; ++i)
39867 IdentityMask.push_back(i);
39868 if (isTargetShuffleEquivalent(RootVT, ScaledMask, IdentityMask, DAG, V1,
39869 V2))
39870 return CanonicalizeShuffleInput(RootVT, V1);
39871 }
39872 }
39873
39874 // Handle 128/256-bit lane shuffles of 512-bit vectors.
39875 if (RootVT.is512BitVector() &&
39876 (NumBaseMaskElts == 2 || NumBaseMaskElts == 4)) {
39877 // If the upper subvectors are zeroable, then an extract+insert is more
39878 // optimal than using X86ISD::SHUF128. The insertion is free, even if it has
39879 // to zero the upper subvectors.
39880 if (isUndefOrZeroInRange(Mask, 1, NumBaseMaskElts - 1)) {
39881 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
39882 return SDValue(); // Nothing to do!
39883 assert(isInRange(Mask[0], 0, NumBaseMaskElts) &&
39884 "Unexpected lane shuffle");
39885 Res = CanonicalizeShuffleInput(RootVT, V1);
39886 unsigned SubIdx = Mask[0] * (NumRootElts / NumBaseMaskElts);
39887 bool UseZero = isAnyZero(Mask);
39888 Res = extractSubVector(Res, SubIdx, DAG, DL, BaseMaskEltSizeInBits);
39889 return widenSubVector(Res, UseZero, Subtarget, DAG, DL, RootSizeInBits);
39890 }
39891
39892 // Narrow shuffle mask to v4x128.
39893 SmallVector<int, 4> ScaledMask;
39894 assert((BaseMaskEltSizeInBits % 128) == 0 && "Illegal mask size");
39895 narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, Mask, ScaledMask);
39896
39897 // Try to lower to vshuf64x2/vshuf32x4.
39898 auto MatchSHUF128 = [&](MVT ShuffleVT, const SDLoc &DL,
39899 ArrayRef<int> ScaledMask, SDValue V1, SDValue V2,
39900 SelectionDAG &DAG) {
39901 unsigned PermMask = 0;
39902 // Ensure elements came from the same Op.
39903 SDValue Ops[2] = {DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT)};
39904 for (int i = 0; i < 4; ++i) {
39905 assert(ScaledMask[i] >= -1 && "Illegal shuffle sentinel value");
39906 if (ScaledMask[i] < 0)
39907 continue;
39908
39909 SDValue Op = ScaledMask[i] >= 4 ? V2 : V1;
39910 unsigned OpIndex = i / 2;
39911 if (Ops[OpIndex].isUndef())
39912 Ops[OpIndex] = Op;
39913 else if (Ops[OpIndex] != Op)
39914 return SDValue();
39915
39916 // Convert the 128-bit shuffle mask selection values into 128-bit
39917 // selection bits defined by a vshuf64x2 instruction's immediate control
39918 // byte.
39919 PermMask |= (ScaledMask[i] % 4) << (i * 2);
39920 }
39921
39922 return DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
39923 CanonicalizeShuffleInput(ShuffleVT, Ops[0]),
39924 CanonicalizeShuffleInput(ShuffleVT, Ops[1]),
39925 DAG.getTargetConstant(PermMask, DL, MVT::i8));
39926 };
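// [Editor's note] A minimal standalone sketch (not LLVM code) of the immediate
// packing done by the MatchSHUF128 lambda above: each of the four destination
// 128-bit lanes gets a 2-bit selector, and lanes 0-1 must draw from one source
// op while lanes 2-3 draw from the other. The helper name and the use of a
// plain int array instead of SDValues are illustrative assumptions.
#include <cstdint>
#include <optional>

std::optional<uint8_t> sketchShuf128Imm(const int ScaledMask[4]) {
  // Track which logical source (0 = V1, 1 = V2) feeds each half of the result;
  // -1 means "not yet chosen" (mirrors the UNDEF placeholder ops above).
  int HalfSrc[2] = {-1, -1};
  uint8_t PermMask = 0;
  for (int i = 0; i < 4; ++i) {
    if (ScaledMask[i] < 0)
      continue; // undef lane - any selector bits are acceptable.
    int Src = ScaledMask[i] >= 4 ? 1 : 0;
    int Half = i / 2;
    if (HalfSrc[Half] < 0)
      HalfSrc[Half] = Src;
    else if (HalfSrc[Half] != Src)
      return std::nullopt; // mixed sources within a half - no single SHUF128.
    PermMask |= uint8_t(ScaledMask[i] % 4) << (i * 2); // 2 bits per dest lane.
  }
  return PermMask;
}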
39927
39928 // FIXME: Is there a better way to do this? is256BitLaneRepeatedShuffleMask
39929 // doesn't work because our mask is for 128 bits and we don't have an MVT
39930 // to match that.
39931 bool PreferPERMQ = UnaryShuffle && isUndefOrInRange(ScaledMask[0], 0, 2) &&
39932 isUndefOrInRange(ScaledMask[1], 0, 2) &&
39933 isUndefOrInRange(ScaledMask[2], 2, 4) &&
39934 isUndefOrInRange(ScaledMask[3], 2, 4) &&
39935 (ScaledMask[0] < 0 || ScaledMask[2] < 0 ||
39936 ScaledMask[0] == (ScaledMask[2] % 2)) &&
39937 (ScaledMask[1] < 0 || ScaledMask[3] < 0 ||
39938 ScaledMask[1] == (ScaledMask[3] % 2));
39939
39940 if (!isAnyZero(ScaledMask) && !PreferPERMQ) {
39941 if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
39942 return SDValue(); // Nothing to do!
39943 MVT ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
39944 if (SDValue V = MatchSHUF128(ShuffleVT, DL, ScaledMask, V1, V2, DAG))
39945 return DAG.getBitcast(RootVT, V);
39946 }
39947 }
39948
39949 // Handle 128-bit lane shuffles of 256-bit vectors.
39950 if (RootVT.is256BitVector() && NumBaseMaskElts == 2) {
39951 // If the upper half is zeroable, then an extract+insert is more optimal
39952 // than using X86ISD::VPERM2X128. The insertion is free, even if it has to
39953 // zero the upper half.
39954 if (isUndefOrZero(Mask[1])) {
39955 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
39956 return SDValue(); // Nothing to do!
39957 assert(isInRange(Mask[0], 0, 2) && "Unexpected lane shuffle");
39958 Res = CanonicalizeShuffleInput(RootVT, V1);
39959 Res = extract128BitVector(Res, Mask[0] * (NumRootElts / 2), DAG, DL);
39960 return widenSubVector(Res, Mask[1] == SM_SentinelZero, Subtarget, DAG, DL,
39961 256);
39962 }
39963
39964 // If we're inserting the low subvector, an insert-subvector 'concat'
39965 // pattern is quicker than VPERM2X128.
39966 // TODO: Add AVX2 support instead of VPERMQ/VPERMPD.
39967 if (BaseMask[0] == 0 && (BaseMask[1] == 0 || BaseMask[1] == 2) &&
39968 !Subtarget.hasAVX2()) {
39969 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
39970 return SDValue(); // Nothing to do!
39971 SDValue Lo = CanonicalizeShuffleInput(RootVT, V1);
39972 SDValue Hi = CanonicalizeShuffleInput(RootVT, BaseMask[1] == 0 ? V1 : V2);
39973 Hi = extractSubVector(Hi, 0, DAG, DL, 128);
39974 return insertSubVector(Lo, Hi, NumRootElts / 2, DAG, DL, 128);
39975 }
39976
39977 if (Depth == 0 && Root.getOpcode() == X86ISD::VPERM2X128)
39978 return SDValue(); // Nothing to do!
39979
39980 // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
39981 // we need to use the zeroing feature.
39982 // Prefer blends for sequential shuffles unless we are optimizing for size.
39983 if (UnaryShuffle &&
39984 !(Subtarget.hasAVX2() && isUndefOrInRange(Mask, 0, 2)) &&
39985 (OptForSize || !isSequentialOrUndefOrZeroInRange(Mask, 0, 2, 0))) {
39986 unsigned PermMask = 0;
39987 PermMask |= ((Mask[0] < 0 ? 0x8 : (Mask[0] & 1)) << 0);
39988 PermMask |= ((Mask[1] < 0 ? 0x8 : (Mask[1] & 1)) << 4);
39989 return DAG.getNode(
39990 X86ISD::VPERM2X128, DL, RootVT, CanonicalizeShuffleInput(RootVT, V1),
39991 DAG.getUNDEF(RootVT), DAG.getTargetConstant(PermMask, DL, MVT::i8));
39992 }
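// [Editor's note] A small illustrative sketch (not LLVM code) of the
// VPERM2X128 immediate built just above for the unary case: the low nibble
// selects the source 128-bit lane for the low half of the result, the high
// nibble selects for the upper half, and bit 3 of a nibble (0x8) zeroes that
// half instead. Sentinel (< 0) mask entries are mapped to the zeroing encoding.
#include <cstdint>

uint8_t sketchVPerm2x128UnaryImm(int Mask0, int Mask1) {
  uint8_t Imm = 0;
  Imm |= uint8_t(Mask0 < 0 ? 0x8 : (Mask0 & 1)) << 0; // low result half
  Imm |= uint8_t(Mask1 < 0 ? 0x8 : (Mask1 & 1)) << 4; // high result half
  return Imm;
}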
39993
39994 if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
39995 return SDValue(); // Nothing to do!
39996
39997 // TODO - handle AVX512VL cases with X86ISD::SHUF128.
39998 if (!UnaryShuffle && !IsMaskedShuffle) {
39999 assert(llvm::all_of(Mask, [](int M) { return 0 <= M && M < 4; }) &&
40000 "Unexpected shuffle sentinel value");
40001 // Prefer blends to X86ISD::VPERM2X128.
40002 if (!((Mask[0] == 0 && Mask[1] == 3) || (Mask[0] == 2 && Mask[1] == 1))) {
40003 unsigned PermMask = 0;
40004 PermMask |= ((Mask[0] & 3) << 0);
40005 PermMask |= ((Mask[1] & 3) << 4);
40006 SDValue LHS = isInRange(Mask[0], 0, 2) ? V1 : V2;
40007 SDValue RHS = isInRange(Mask[1], 0, 2) ? V1 : V2;
40008 return DAG.getNode(X86ISD::VPERM2X128, DL, RootVT,
40009 CanonicalizeShuffleInput(RootVT, LHS),
40010 CanonicalizeShuffleInput(RootVT, RHS),
40011 DAG.getTargetConstant(PermMask, DL, MVT::i8));
40012 }
40013 }
40014 }
40015
40016 // For masks that have been widened to 128-bit elements or more,
40017 // narrow back down to 64-bit elements.
40018 if (BaseMaskEltSizeInBits > 64) {
40019 assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
40020 int MaskScale = BaseMaskEltSizeInBits / 64;
40021 SmallVector<int, 64> ScaledMask;
40022 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
40023 Mask = std::move(ScaledMask);
40024 }
40025
40026 // For masked shuffles, we're trying to match the root width for better
40027 // writemask folding, attempt to scale the mask.
40028 // TODO - variable shuffles might need this to be widened again.
40029 if (IsMaskedShuffle && NumRootElts > Mask.size()) {
40030 assert((NumRootElts % Mask.size()) == 0 && "Illegal mask size");
40031 int MaskScale = NumRootElts / Mask.size();
40032 SmallVector<int, 64> ScaledMask;
40033 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
40034 Mask = std::move(ScaledMask);
40035 }
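// [Editor's note] The two rescaling steps above both rely on a
// narrowShuffleMaskElts-style widening of the mask: every element of the
// coarse mask expands into `Scale` consecutive fine-grained elements. The
// sketch below (plain C++, not the LLVM helper itself) shows the arithmetic;
// negative sentinel values simply propagate.
#include <vector>

std::vector<int> sketchNarrowMaskElts(int Scale, const std::vector<int> &Mask) {
  std::vector<int> Scaled;
  Scaled.reserve(Mask.size() * Scale);
  for (int M : Mask)
    for (int i = 0; i != Scale; ++i)
      Scaled.push_back(M < 0 ? M : M * Scale + i); // sentinels stay unchanged
  return Scaled;
}
// e.g. Scale = 2, Mask = {1, -1}  ->  {2, 3, -1, -1}.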
40036
40037 unsigned NumMaskElts = Mask.size();
40038 unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
40039
40040 // Determine the effective mask value type.
40041 FloatDomain &= (32 <= MaskEltSizeInBits);
40042 MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
40043 : MVT::getIntegerVT(MaskEltSizeInBits);
40044 MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
40045
40046 // Only allow legal mask types.
40047 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
40048 return SDValue();
40049
40050 // Attempt to match the mask against known shuffle patterns.
40051 MVT ShuffleSrcVT, ShuffleVT;
40052 unsigned Shuffle, PermuteImm;
40053
40054 // Which shuffle domains are permitted?
40055 // Permit domain crossing at higher combine depths.
40056 // TODO: Should we indicate which domain is preferred if both are allowed?
40057 bool AllowFloatDomain = FloatDomain || (Depth >= 3);
40058 bool AllowIntDomain = (!FloatDomain || (Depth >= 3)) && Subtarget.hasSSE2() &&
40059 (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
40060
40061 // Determine zeroable mask elements.
40062 APInt KnownUndef, KnownZero;
40063 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
40064 APInt Zeroable = KnownUndef | KnownZero;
40065
40066 if (UnaryShuffle) {
40067 // Attempt to match against broadcast-from-vector.
40068 // Limit AVX1 to cases where we're loading+broadcasting a scalar element.
40069 if ((Subtarget.hasAVX2() ||
40070 (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits)) &&
40071 (!IsMaskedShuffle || NumRootElts == NumMaskElts)) {
40072 if (isUndefOrEqual(Mask, 0)) {
40073 if (V1.getValueType() == MaskVT &&
40074 V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
40075 X86::mayFoldLoad(V1.getOperand(0), Subtarget)) {
40076 if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
40077 return SDValue(); // Nothing to do!
40078 Res = V1.getOperand(0);
40079 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
40080 return DAG.getBitcast(RootVT, Res);
40081 }
40082 if (Subtarget.hasAVX2()) {
40083 if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
40084 return SDValue(); // Nothing to do!
40085 Res = CanonicalizeShuffleInput(MaskVT, V1);
40086 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
40087 return DAG.getBitcast(RootVT, Res);
40088 }
40089 }
40090 }
40091
40092 if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, V1,
40093 DAG, Subtarget, Shuffle, ShuffleSrcVT, ShuffleVT) &&
40094 (!IsMaskedShuffle ||
40095 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
40096 if (Depth == 0 && Root.getOpcode() == Shuffle)
40097 return SDValue(); // Nothing to do!
40098 Res = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
40099 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
40100 return DAG.getBitcast(RootVT, Res);
40101 }
40102
40103 if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
40104 AllowIntDomain, DAG, Subtarget, Shuffle, ShuffleVT,
40105 PermuteImm) &&
40106 (!IsMaskedShuffle ||
40107 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
40108 if (Depth == 0 && Root.getOpcode() == Shuffle)
40109 return SDValue(); // Nothing to do!
40110 Res = CanonicalizeShuffleInput(ShuffleVT, V1);
40111 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
40112 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
40113 return DAG.getBitcast(RootVT, Res);
40114 }
40115 }
40116
40117 // Attempt to combine to INSERTPS, but only if the inserted element has come
40118 // from a scalar.
40119 // TODO: Handle other insertions here as well?
40120 if (!UnaryShuffle && AllowFloatDomain && RootSizeInBits == 128 &&
40121 Subtarget.hasSSE41() &&
40122 !isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG)) {
40123 if (MaskEltSizeInBits == 32) {
40124 SDValue SrcV1 = V1, SrcV2 = V2;
40125 if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask,
40126 DAG) &&
40127 SrcV2.getOpcode() == ISD::SCALAR_TO_VECTOR) {
40128 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
40129 return SDValue(); // Nothing to do!
40130 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
40131 CanonicalizeShuffleInput(MVT::v4f32, SrcV1),
40132 CanonicalizeShuffleInput(MVT::v4f32, SrcV2),
40133 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
40134 return DAG.getBitcast(RootVT, Res);
40135 }
40136 }
40137 if (MaskEltSizeInBits == 64 &&
40138 isTargetShuffleEquivalent(MaskVT, Mask, {0, 2}, DAG) &&
40139 V2.getOpcode() == ISD::SCALAR_TO_VECTOR &&
40140 V2.getScalarValueSizeInBits() <= 32) {
40141 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
40142 return SDValue(); // Nothing to do!
40143 PermuteImm = (/*DstIdx*/ 2 << 4) | (/*SrcIdx*/ 0 << 0);
40144 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
40145 CanonicalizeShuffleInput(MVT::v4f32, V1),
40146 CanonicalizeShuffleInput(MVT::v4f32, V2),
40147 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
40148 return DAG.getBitcast(RootVT, Res);
40149 }
40150 }
40151
40152 SDValue NewV1 = V1; // Save operands in case early exit happens.
40153 SDValue NewV2 = V2;
40154 if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
40155 NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
40156 ShuffleVT, UnaryShuffle) &&
40157 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
40158 if (Depth == 0 && Root.getOpcode() == Shuffle)
40159 return SDValue(); // Nothing to do!
40160 NewV1 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV1);
40161 NewV2 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV2);
40162 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
40163 return DAG.getBitcast(RootVT, Res);
40164 }
40165
40166 NewV1 = V1; // Save operands in case early exit happens.
40167 NewV2 = V2;
40168 if (matchBinaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
40169 AllowIntDomain, NewV1, NewV2, DL, DAG,
40170 Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
40171 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
40172 if (Depth == 0 && Root.getOpcode() == Shuffle)
40173 return SDValue(); // Nothing to do!
40174 NewV1 = CanonicalizeShuffleInput(ShuffleVT, NewV1);
40175 NewV2 = CanonicalizeShuffleInput(ShuffleVT, NewV2);
40176 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
40177 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
40178 return DAG.getBitcast(RootVT, Res);
40179 }
40180
40181 // Typically from here on, we need an integer version of MaskVT.
40182 MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
40183 IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
40184
40185 // Annoyingly, SSE4A instructions don't map into the above match helpers.
40186 if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
40187 uint64_t BitLen, BitIdx;
40188 if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
40189 Zeroable)) {
40190 if (Depth == 0 && Root.getOpcode() == X86ISD::EXTRQI)
40191 return SDValue(); // Nothing to do!
40192 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
40193 Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
40194 DAG.getTargetConstant(BitLen, DL, MVT::i8),
40195 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
40196 return DAG.getBitcast(RootVT, Res);
40197 }
40198
40199 if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
40200 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTQI)
40201 return SDValue(); // Nothing to do!
40202 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
40203 V2 = CanonicalizeShuffleInput(IntMaskVT, V2);
40204 Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
40205 DAG.getTargetConstant(BitLen, DL, MVT::i8),
40206 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
40207 return DAG.getBitcast(RootVT, Res);
40208 }
40209 }
40210
40211 // Match shuffle against TRUNCATE patterns.
40212 if (AllowIntDomain && MaskEltSizeInBits < 64 && Subtarget.hasAVX512()) {
40213 // Match against a VTRUNC instruction, accounting for src/dst sizes.
40214 if (matchShuffleAsVTRUNC(ShuffleSrcVT, ShuffleVT, IntMaskVT, Mask, Zeroable,
40215 Subtarget)) {
40216 bool IsTRUNCATE = ShuffleVT.getVectorNumElements() ==
40217 ShuffleSrcVT.getVectorNumElements();
40218 unsigned Opc =
40219 IsTRUNCATE ? (unsigned)ISD::TRUNCATE : (unsigned)X86ISD::VTRUNC;
40220 if (Depth == 0 && Root.getOpcode() == Opc)
40221 return SDValue(); // Nothing to do!
40222 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
40223 Res = DAG.getNode(Opc, DL, ShuffleVT, V1);
40224 if (ShuffleVT.getSizeInBits() < RootSizeInBits)
40225 Res = widenSubVector(Res, true, Subtarget, DAG, DL, RootSizeInBits);
40226 return DAG.getBitcast(RootVT, Res);
40227 }
40228
40229 // Do we need a more general binary truncation pattern?
40230 if (RootSizeInBits < 512 &&
40231 ((RootVT.is256BitVector() && Subtarget.useAVX512Regs()) ||
40232 (RootVT.is128BitVector() && Subtarget.hasVLX())) &&
40233 (MaskEltSizeInBits > 8 || Subtarget.hasBWI()) &&
40234 isSequentialOrUndefInRange(Mask, 0, NumMaskElts, 0, 2)) {
40235 // Bail if this was already a truncation or PACK node.
40236 // We sometimes fail to match PACK if we demand known undef elements.
40237 if (Depth == 0 && (Root.getOpcode() == ISD::TRUNCATE ||
40238 Root.getOpcode() == X86ISD::PACKSS ||
40239 Root.getOpcode() == X86ISD::PACKUS))
40240 return SDValue(); // Nothing to do!
40241 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
40242 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts / 2);
40243 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
40244 V2 = CanonicalizeShuffleInput(ShuffleSrcVT, V2);
40245 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
40246 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts);
40247 Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShuffleSrcVT, V1, V2);
40248 Res = DAG.getNode(ISD::TRUNCATE, DL, IntMaskVT, Res);
40249 return DAG.getBitcast(RootVT, Res);
40250 }
40251 }
40252
40253 // Don't try to re-form single instruction chains under any circumstances now
40254 // that we've done encoding canonicalization for them.
40255 if (Depth < 1)
40256 return SDValue();
40257
40258 // Depth threshold above which we can efficiently use variable mask shuffles.
40259 int VariableCrossLaneShuffleDepth =
40260 Subtarget.hasFastVariableCrossLaneShuffle() ? 1 : 2;
40261 int VariablePerLaneShuffleDepth =
40262 Subtarget.hasFastVariablePerLaneShuffle() ? 1 : 2;
40263 AllowVariableCrossLaneMask &=
40264 (Depth >= VariableCrossLaneShuffleDepth) || HasVariableMask;
40265 AllowVariablePerLaneMask &=
40266 (Depth >= VariablePerLaneShuffleDepth) || HasVariableMask;
40267 // VPERMI2W/VPERMI2B are 3 uops on Skylake and Icelake so we require a
40268 // higher depth before combining them.
40269 bool AllowBWIVPERMV3 =
40270 (Depth >= (VariableCrossLaneShuffleDepth + 2) || HasVariableMask);
40271
40272 bool MaskContainsZeros = isAnyZero(Mask);
40273
40274 if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
40275 // If we have a single input lane-crossing shuffle then lower to VPERMV.
40276 if (UnaryShuffle && AllowVariableCrossLaneMask && !MaskContainsZeros) {
40277 if (Subtarget.hasAVX2() &&
40278 (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) {
40279 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
40280 Res = CanonicalizeShuffleInput(MaskVT, V1);
40281 Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
40282 return DAG.getBitcast(RootVT, Res);
40283 }
40284 // AVX512 variants (non-VLX will pad to 512-bit shuffles).
40285 if ((Subtarget.hasAVX512() &&
40286 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
40287 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
40288 (Subtarget.hasBWI() &&
40289 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
40290 (Subtarget.hasVBMI() &&
40291 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8))) {
40292 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40293 V2 = DAG.getUNDEF(MaskVT);
40294 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40295 return DAG.getBitcast(RootVT, Res);
40296 }
40297 }
40298
40299 // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
40300 // vector as the second source (non-VLX will pad to 512-bit shuffles).
40301 if (UnaryShuffle && AllowVariableCrossLaneMask &&
40302 ((Subtarget.hasAVX512() &&
40303 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
40304 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
40305 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32 ||
40306 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
40307 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
40308 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
40309 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
40310 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
40311 // Adjust shuffle mask - replace SM_SentinelZero with second source index.
40312 for (unsigned i = 0; i != NumMaskElts; ++i)
40313 if (Mask[i] == SM_SentinelZero)
40314 Mask[i] = NumMaskElts + i;
40315 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40316 V2 = getZeroVector(MaskVT, Subtarget, DAG, DL);
40317 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40318 return DAG.getBitcast(RootVT, Res);
40319 }
40320
40321 // If that failed and either input is extracted then try to combine as a
40322 // shuffle with the larger type.
40323 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
40324 Inputs, Root, BaseMask, Depth, HasVariableMask,
40325 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG,
40326 Subtarget))
40327 return WideShuffle;
40328
40329 // If we have a dual input lane-crossing shuffle then lower to VPERMV3,
40330 // (non-VLX will pad to 512-bit shuffles).
40331 if (AllowVariableCrossLaneMask && !MaskContainsZeros &&
40332 ((Subtarget.hasAVX512() &&
40333 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
40334 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
40335 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32 ||
40336 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
40337 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
40338 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
40339 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
40340 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
40341 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40342 V2 = CanonicalizeShuffleInput(MaskVT, V2);
40343 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40344 return DAG.getBitcast(RootVT, Res);
40345 }
40346 return SDValue();
40347 }
40348
40349 // See if we can combine a single input shuffle with zeros to a bit-mask,
40350 // which is much simpler than any shuffle.
40351 if (UnaryShuffle && MaskContainsZeros && AllowVariablePerLaneMask &&
40352 isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
40353 DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
40354 APInt Zero = APInt::getZero(MaskEltSizeInBits);
40355 APInt AllOnes = APInt::getAllOnes(MaskEltSizeInBits);
40356 APInt UndefElts(NumMaskElts, 0);
40357 SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
40358 for (unsigned i = 0; i != NumMaskElts; ++i) {
40359 int M = Mask[i];
40360 if (M == SM_SentinelUndef) {
40361 UndefElts.setBit(i);
40362 continue;
40363 }
40364 if (M == SM_SentinelZero)
40365 continue;
40366 EltBits[i] = AllOnes;
40367 }
40368 SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
40369 Res = CanonicalizeShuffleInput(MaskVT, V1);
40370 unsigned AndOpcode =
40371 MaskVT.isFloatingPoint() ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
40372 Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
40373 return DAG.getBitcast(RootVT, Res);
40374 }
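// [Editor's note] Illustrative sketch (not LLVM code) of the bit-mask fold
// above: a unary shuffle that only keeps elements in place or zeroes them is
// equivalent to an AND with a constant vector - all-ones for kept lanes, zero
// for zeroed lanes, and "don't care" for undef lanes. The -1/-2 sentinels
// mirror SM_SentinelUndef and SM_SentinelZero; the struct is an assumption.
#include <cstdint>
#include <vector>

struct AndMaskLane {
  uint64_t Bits;  // per-lane AND constant
  bool IsUndef;   // lane value is a don't-care
};

std::vector<AndMaskLane> sketchShuffleToAndMask(const std::vector<int> &Mask) {
  std::vector<AndMaskLane> Lanes;
  for (int M : Mask) {
    if (M == -1)
      Lanes.push_back({0, true});        // undef lane
    else if (M == -2)
      Lanes.push_back({0, false});       // zeroed lane
    else
      Lanes.push_back({~0ULL, false});   // element kept in place
  }
  return Lanes;
}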
40375
40376 // If we have a single input shuffle with different shuffle patterns in the
40377 // 128-bit lanes, use the variable mask to VPERMILPS.
40378 // TODO Combine other mask types at higher depths.
40379 if (UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
40380 ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
40381 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
40382 SmallVector<SDValue, 16> VPermIdx;
40383 for (int M : Mask) {
40384 SDValue Idx =
40385 M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
40386 VPermIdx.push_back(Idx);
40387 }
40388 SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
40389 Res = CanonicalizeShuffleInput(MaskVT, V1);
40390 Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
40391 return DAG.getBitcast(RootVT, Res);
40392 }
40393
40394 // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
40395 // to VPERMIL2PD/VPERMIL2PS.
40396 if (AllowVariablePerLaneMask && Subtarget.hasXOP() &&
40397 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
40398 MaskVT == MVT::v8f32)) {
40399 // VPERMIL2 Operation.
40400 // Bits[3] - Match Bit.
40401 // Bits[2:1] - (Per Lane) PD Shuffle Mask.
40402 // Bits[2:0] - (Per Lane) PS Shuffle Mask.
40403 unsigned NumLanes = MaskVT.getSizeInBits() / 128;
40404 unsigned NumEltsPerLane = NumMaskElts / NumLanes;
40405 SmallVector<int, 8> VPerm2Idx;
40406 unsigned M2ZImm = 0;
40407 for (int M : Mask) {
40408 if (M == SM_SentinelUndef) {
40409 VPerm2Idx.push_back(-1);
40410 continue;
40411 }
40412 if (M == SM_SentinelZero) {
40413 M2ZImm = 2;
40414 VPerm2Idx.push_back(8);
40415 continue;
40416 }
40417 int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
40418 Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
40419 VPerm2Idx.push_back(Index);
40420 }
40421 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40422 V2 = CanonicalizeShuffleInput(MaskVT, V2);
40423 SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
40424 Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
40425 DAG.getTargetConstant(M2ZImm, DL, MVT::i8));
40426 return DAG.getBitcast(RootVT, Res);
40427 }
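// [Editor's note] Sketch (not LLVM code) of the VPERMIL2 index encoding built
// above: each result element stores a per-lane source index selecting from the
// corresponding lanes of both inputs, with 8 meaning zero and indices doubled
// for 64-bit elements (the PD form uses bits [2:1]). Names are illustrative.
#include <vector>

std::vector<int> sketchVPermil2Idx(const std::vector<int> &Mask,
                                   int NumEltsPerLane, bool Is64Bit) {
  const int NumMaskElts = int(Mask.size());
  std::vector<int> Idx;
  for (int M : Mask) {
    if (M == -1) { Idx.push_back(-1); continue; } // SM_SentinelUndef
    if (M == -2) { Idx.push_back(8);  continue; } // zero selector
    int I = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
    Idx.push_back(Is64Bit ? I << 1 : I);          // PD indices are shifted left
  }
  return Idx;
}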
40428
40429 // If we have 3 or more shuffle instructions or a chain involving a variable
40430 // mask, we can replace them with a single PSHUFB instruction profitably.
40431 // Intel's manuals suggest only using PSHUFB if doing so replaces 5
40432 // instructions, but in practice PSHUFB tends to be *very* fast so we're
40433 // more aggressive.
40434 if (UnaryShuffle && AllowVariablePerLaneMask &&
40435 ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
40436 (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
40437 (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
40438 SmallVector<SDValue, 16> PSHUFBMask;
40439 int NumBytes = RootVT.getSizeInBits() / 8;
40440 int Ratio = NumBytes / NumMaskElts;
40441 for (int i = 0; i < NumBytes; ++i) {
40442 int M = Mask[i / Ratio];
40443 if (M == SM_SentinelUndef) {
40444 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
40445 continue;
40446 }
40447 if (M == SM_SentinelZero) {
40448 PSHUFBMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
40449 continue;
40450 }
40451 M = Ratio * M + i % Ratio;
40452 assert((M / 16) == (i / 16) && "Lane crossing detected");
40453 PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
40454 }
40455 MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
40456 Res = CanonicalizeShuffleInput(ByteVT, V1);
40457 SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
40458 Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
40459 return DAG.getBitcast(RootVT, Res);
40460 }
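// [Editor's note] Standalone sketch (not LLVM code) of the PSHUFB byte-mask
// construction above: a mask over wider elements is expanded to one control
// byte per result byte, with 0x80 selecting zero; -1/-2 are the undef/zero
// sentinels. The lane-crossing check is omitted here for brevity.
#include <vector>

std::vector<int> sketchPshufbBytes(const std::vector<int> &Mask, int NumBytes) {
  const int Ratio = NumBytes / int(Mask.size()); // bytes per mask element
  std::vector<int> Bytes(NumBytes);
  for (int i = 0; i < NumBytes; ++i) {
    int M = Mask[i / Ratio];
    if (M == -1)
      Bytes[i] = -1;                    // undef control byte
    else if (M == -2)
      Bytes[i] = 0x80;                  // bit 7 set => result byte is zero
    else
      Bytes[i] = Ratio * M + i % Ratio; // source byte index
  }
  return Bytes;
}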
40461
40462 // With XOP, if we have a 128-bit binary input shuffle we can always combine
40463 // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
40464 // slower than PSHUFB on targets that support both.
40465 if (AllowVariablePerLaneMask && RootVT.is128BitVector() &&
40466 Subtarget.hasXOP()) {
40467 // VPPERM Mask Operation
40468 // Bits[4:0] - Byte Index (0 - 31)
40469 // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
40470 SmallVector<SDValue, 16> VPPERMMask;
40471 int NumBytes = 16;
40472 int Ratio = NumBytes / NumMaskElts;
40473 for (int i = 0; i < NumBytes; ++i) {
40474 int M = Mask[i / Ratio];
40475 if (M == SM_SentinelUndef) {
40476 VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
40477 continue;
40478 }
40479 if (M == SM_SentinelZero) {
40480 VPPERMMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
40481 continue;
40482 }
40483 M = Ratio * M + i % Ratio;
40484 VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
40485 }
40486 MVT ByteVT = MVT::v16i8;
40487 V1 = CanonicalizeShuffleInput(ByteVT, V1);
40488 V2 = CanonicalizeShuffleInput(ByteVT, V2);
40489 SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
40490 Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
40491 return DAG.getBitcast(RootVT, Res);
40492 }
40493
40494 // If that failed and either input is extracted then try to combine as a
40495 // shuffle with the larger type.
40496 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
40497 Inputs, Root, BaseMask, Depth, HasVariableMask,
40498 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG, Subtarget))
40499 return WideShuffle;
40500
40501 // If we have a dual input shuffle then lower to VPERMV3,
40502 // (non-VLX will pad to 512-bit shuffles)
40503 if (!UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
40504 ((Subtarget.hasAVX512() &&
40505 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v8f64 ||
40506 MaskVT == MVT::v2i64 || MaskVT == MVT::v4i64 || MaskVT == MVT::v8i64 ||
40507 MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 || MaskVT == MVT::v8f32 ||
40508 MaskVT == MVT::v8i32 || MaskVT == MVT::v16f32 ||
40509 MaskVT == MVT::v16i32)) ||
40510 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
40511 (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16 ||
40512 MaskVT == MVT::v32i16)) ||
40513 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
40514 (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8 ||
40515 MaskVT == MVT::v64i8)))) {
40516 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40517 V2 = CanonicalizeShuffleInput(MaskVT, V2);
40518 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40519 return DAG.getBitcast(RootVT, Res);
40520 }
40521
40522 // Failed to find any combines.
40523 return SDValue();
40524}
40525
40526// Combine an arbitrary chain of shuffles + extract_subvectors into a single
40527// instruction if possible.
40528//
40529// Wrapper for combineX86ShuffleChain that extends the shuffle mask to a larger
40530// type size to attempt to combine:
40531// shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1)
40532// -->
40533// extract_subvector(shuffle(x,y,m2),0)
40534static SDValue combineX86ShuffleChainWithExtract(
40535 ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
40536 bool HasVariableMask, bool AllowVariableCrossLaneMask,
40537 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
40538 const X86Subtarget &Subtarget) {
40539 unsigned NumMaskElts = BaseMask.size();
40540 unsigned NumInputs = Inputs.size();
40541 if (NumInputs == 0)
40542 return SDValue();
40543
40544 EVT RootVT = Root.getValueType();
40545 unsigned RootSizeInBits = RootVT.getSizeInBits();
40546 unsigned RootEltSizeInBits = RootSizeInBits / NumMaskElts;
40547 assert((RootSizeInBits % NumMaskElts) == 0 && "Unexpected root shuffle mask");
40548
40549 // Peek through extract_subvector to find widest legal vector.
40550 // TODO: Handle ISD::TRUNCATE
40551 unsigned WideSizeInBits = RootSizeInBits;
40552 for (unsigned I = 0; I != NumInputs; ++I) {
40553 SDValue Input = peekThroughBitcasts(Inputs[I]);
40554 while (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR)
40555 Input = peekThroughBitcasts(Input.getOperand(0));
40556 if (DAG.getTargetLoweringInfo().isTypeLegal(Input.getValueType()) &&
40557 WideSizeInBits < Input.getValueSizeInBits())
40558 WideSizeInBits = Input.getValueSizeInBits();
40559 }
40560
40561 // Bail if we fail to find a source larger than the existing root.
40562 unsigned Scale = WideSizeInBits / RootSizeInBits;
40563 if (WideSizeInBits <= RootSizeInBits ||
40564 (WideSizeInBits % RootSizeInBits) != 0)
40565 return SDValue();
40566
40567 // Create new mask for larger type.
40568 SmallVector<int, 64> WideMask(BaseMask);
40569 for (int &M : WideMask) {
40570 if (M < 0)
40571 continue;
40572 M = (M % NumMaskElts) + ((M / NumMaskElts) * Scale * NumMaskElts);
40573 }
40574 WideMask.append((Scale - 1) * NumMaskElts, SM_SentinelUndef);
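// [Editor's note] Sketch (not LLVM code) of the mask widening just above:
// indices are remapped so each original input's elements keep their offset but
// inputs are now spaced Scale*NumMaskElts apart, and the tail is padded with
// undef sentinels since only the low RootSizeInBits of the result are used.
#include <vector>

std::vector<int> sketchWidenMask(std::vector<int> Mask, unsigned Scale) {
  const int NumMaskElts = int(Mask.size());
  for (int &M : Mask)
    if (M >= 0)
      M = (M % NumMaskElts) + (M / NumMaskElts) * int(Scale) * NumMaskElts;
  Mask.insert(Mask.end(), (Scale - 1) * NumMaskElts, -1); // SM_SentinelUndef
  return Mask;
}
// e.g. Scale = 2, Mask = {0, 5, -1, 2}  ->  {0, 9, -1, 2, -1, -1, -1, -1}.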
40575
40576 // Attempt to peek through inputs and adjust mask when we extract from an
40577 // upper subvector.
40578 int AdjustedMasks = 0;
40579 SmallVector<SDValue, 4> WideInputs(Inputs.begin(), Inputs.end());
40580 for (unsigned I = 0; I != NumInputs; ++I) {
40581 SDValue &Input = WideInputs[I];
40582 Input = peekThroughBitcasts(Input);
40583 while (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
40584 Input.getOperand(0).getValueSizeInBits() <= WideSizeInBits) {
40585 uint64_t Idx = Input.getConstantOperandVal(1);
40586 if (Idx != 0) {
40587 ++AdjustedMasks;
40588 unsigned InputEltSizeInBits = Input.getScalarValueSizeInBits();
40589 Idx = (Idx * InputEltSizeInBits) / RootEltSizeInBits;
40590
40591 int lo = I * WideMask.size();
40592 int hi = (I + 1) * WideMask.size();
40593 for (int &M : WideMask)
40594 if (lo <= M && M < hi)
40595 M += Idx;
40596 }
40597 Input = peekThroughBitcasts(Input.getOperand(0));
40598 }
40599 }
40600
40601 // Remove unused/repeated shuffle source ops.
40602 resolveTargetShuffleInputsAndMask(WideInputs, WideMask);
40603 assert(!WideInputs.empty() && "Shuffle with no inputs detected");
40604
40605 // Bail if we're always extracting from the lowest subvectors
40606 // (combineX86ShuffleChain should match this for the current width), or if
40607 // the shuffle still references too many inputs.
40608 if (AdjustedMasks == 0 || WideInputs.size() > 2)
40609 return SDValue();
40610
40611 // Minor canonicalization of the accumulated shuffle mask to make it easier
40612 // to match below. All this does is detect masks with sequential pairs of
40613 // elements, and shrink them to the half-width mask. It does this in a loop
40614 // so it will reduce the size of the mask to the minimal width mask which
40615 // performs an equivalent shuffle.
40616 while (WideMask.size() > 1) {
40617 SmallVector<int, 64> WidenedMask;
40618 if (!canWidenShuffleElements(WideMask, WidenedMask))
40619 break;
40620 WideMask = std::move(WidenedMask);
40621 }
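// [Editor's note] Sketch (not LLVM code) of the pair-merging loop above: a
// mask can be widened when every pair of adjacent elements either selects the
// two halves of one wider element (an even index followed by that index + 1)
// or is entirely undef. Zero sentinels are ignored here to keep the sketch
// short, although the real canWidenShuffleElements also handles them, so this
// version is deliberately conservative.
#include <optional>
#include <vector>

std::optional<std::vector<int>> sketchWidenPairs(const std::vector<int> &Mask) {
  std::vector<int> Wide;
  for (size_t i = 0; i + 1 < Mask.size(); i += 2) {
    int M0 = Mask[i], M1 = Mask[i + 1];
    if (M0 == -1 && M1 == -1)
      Wide.push_back(-1);                           // fully undef pair
    else if (M0 >= 0 && (M0 % 2) == 0 && M1 == M0 + 1)
      Wide.push_back(M0 / 2);                       // both halves of M0 / 2
    else
      return std::nullopt;                          // cannot widen
  }
  return Wide;
}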
40622
40623 // Canonicalization of binary shuffle masks to improve pattern matching by
40624 // commuting the inputs.
40625 if (WideInputs.size() == 2 && canonicalizeShuffleMaskWithCommute(WideMask)) {
40626 ShuffleVectorSDNode::commuteMask(WideMask);
40627 std::swap(WideInputs[0], WideInputs[1]);
40628 }
40629
40630 // Increase depth for every upper subvector we've peeked through.
40631 Depth += AdjustedMasks;
40632
40633 // Attempt to combine wider chain.
40634 // TODO: Can we use a better Root?
40635 SDValue WideRoot = WideInputs.front().getValueSizeInBits() >
40636 WideInputs.back().getValueSizeInBits()
40637 ? WideInputs.front()
40638 : WideInputs.back();
40639 assert(WideRoot.getValueSizeInBits() == WideSizeInBits &&
40640 "WideRootSize mismatch");
40641
40642 if (SDValue WideShuffle =
40643 combineX86ShuffleChain(WideInputs, WideRoot, WideMask, Depth,
40644 HasVariableMask, AllowVariableCrossLaneMask,
40645 AllowVariablePerLaneMask, DAG, Subtarget)) {
40646 WideShuffle =
40647 extractSubVector(WideShuffle, 0, DAG, SDLoc(Root), RootSizeInBits);
40648 return DAG.getBitcast(RootVT, WideShuffle);
40649 }
40650
40651 return SDValue();
40652}
40653
40654// Canonicalize the combined shuffle mask chain with horizontal ops.
40655// NOTE: This may update the Ops and Mask.
40656static SDValue canonicalizeShuffleMaskWithHorizOp(
40657 MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
40658 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
40659 const X86Subtarget &Subtarget) {
40660 if (Mask.empty() || Ops.empty())
40661 return SDValue();
40662
40663 SmallVector<SDValue> BC;
40664 for (SDValue Op : Ops)
40665 BC.push_back(peekThroughBitcasts(Op));
40666
40667 // All ops must be the same horizop + type.
40668 SDValue BC0 = BC[0];
40669 EVT VT0 = BC0.getValueType();
40670 unsigned Opcode0 = BC0.getOpcode();
40671 if (VT0.getSizeInBits() != RootSizeInBits || llvm::any_of(BC, [&](SDValue V) {
40672 return V.getOpcode() != Opcode0 || V.getValueType() != VT0;
40673 }))
40674 return SDValue();
40675
40676 bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
40677 Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB);
40678 bool isPack = (Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS);
40679 if (!isHoriz && !isPack)
40680 return SDValue();
40681
40682 // Do all ops have a single use?
40683 bool OneUseOps = llvm::all_of(Ops, [](SDValue Op) {
40684 return Op.hasOneUse() &&
40685 peekThroughBitcasts(Op) == peekThroughOneUseBitcasts(Op);
40686 });
40687
40688 int NumElts = VT0.getVectorNumElements();
40689 int NumLanes = VT0.getSizeInBits() / 128;
40690 int NumEltsPerLane = NumElts / NumLanes;
40691 int NumHalfEltsPerLane = NumEltsPerLane / 2;
40692 MVT SrcVT = BC0.getOperand(0).getSimpleValueType();
40693 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
40694
40695 if (NumEltsPerLane >= 4 &&
40696 (isPack || shouldUseHorizontalOp(Ops.size() == 1, DAG, Subtarget))) {
40697 SmallVector<int> LaneMask, ScaledMask;
40698 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, LaneMask) &&
40699 scaleShuffleElements(LaneMask, 4, ScaledMask)) {
40700 // See if we can remove the shuffle by resorting the HOP chain so that
40701 // the HOP args are pre-shuffled.
40702 // TODO: Generalize to any sized/depth chain.
40703 // TODO: Add support for PACKSS/PACKUS.
40704 if (isHoriz) {
40705 // Attempt to find a HOP(HOP(X,Y),HOP(Z,W)) source operand.
40706 auto GetHOpSrc = [&](int M) {
40707 if (M == SM_SentinelUndef)
40708 return DAG.getUNDEF(VT0);
40709 if (M == SM_SentinelZero)
40710 return getZeroVector(VT0.getSimpleVT(), Subtarget, DAG, DL);
40711 SDValue Src0 = BC[M / 4];
40712 SDValue Src1 = Src0.getOperand((M % 4) >= 2);
40713 if (Src1.getOpcode() == Opcode0 && Src0->isOnlyUserOf(Src1.getNode()))
40714 return Src1.getOperand(M % 2);
40715 return SDValue();
40716 };
40717 SDValue M0 = GetHOpSrc(ScaledMask[0]);
40718 SDValue M1 = GetHOpSrc(ScaledMask[1]);
40719 SDValue M2 = GetHOpSrc(ScaledMask[2]);
40720 SDValue M3 = GetHOpSrc(ScaledMask[3]);
40721 if (M0 && M1 && M2 && M3) {
40722 SDValue LHS = DAG.getNode(Opcode0, DL, SrcVT, M0, M1);
40723 SDValue RHS = DAG.getNode(Opcode0, DL, SrcVT, M2, M3);
40724 return DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
40725 }
40726 }
40727 // shuffle(hop(x,y),hop(z,w)) -> permute(hop(x,z)) etc.
40728 if (Ops.size() >= 2) {
40729 SDValue LHS, RHS;
40730 auto GetHOpSrc = [&](int M, int &OutM) {
40731 // TODO: Support SM_SentinelZero
40732 if (M < 0)
40733 return M == SM_SentinelUndef;
40734 SDValue Src = BC[M / 4].getOperand((M % 4) >= 2);
40735 if (!LHS || LHS == Src) {
40736 LHS = Src;
40737 OutM = (M % 2);
40738 return true;
40739 }
40740 if (!RHS || RHS == Src) {
40741 RHS = Src;
40742 OutM = (M % 2) + 2;
40743 return true;
40744 }
40745 return false;
40746 };
40747 int PostMask[4] = {-1, -1, -1, -1};
40748 if (GetHOpSrc(ScaledMask[0], PostMask[0]) &&
40749 GetHOpSrc(ScaledMask[1], PostMask[1]) &&
40750 GetHOpSrc(ScaledMask[2], PostMask[2]) &&
40751 GetHOpSrc(ScaledMask[3], PostMask[3])) {
40752 LHS = DAG.getBitcast(SrcVT, LHS);
40753 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
40754 SDValue Res = DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
40755 // Use SHUFPS for the permute so this will work on SSE3 targets,
40756 // shuffle combining and domain handling will simplify this later on.
40757 MVT ShuffleVT = MVT::getVectorVT(MVT::f32, RootSizeInBits / 32);
40758 Res = DAG.getBitcast(ShuffleVT, Res);
40759 return DAG.getNode(X86ISD::SHUFP, DL, ShuffleVT, Res, Res,
40760 getV4X86ShuffleImm8ForMask(PostMask, DL, DAG));
40761 }
40762 }
40763 }
40764 }
40765
40766 if (2 < Ops.size())
40767 return SDValue();
40768
40769 SDValue BC1 = BC[BC.size() - 1];
40770 if (Mask.size() == VT0.getVectorNumElements()) {
40771 // Canonicalize binary shuffles of horizontal ops that use the
40772 // same sources to a unary shuffle.
40773 // TODO: Try to perform this fold even if the shuffle remains.
40774 if (Ops.size() == 2) {
40775 auto ContainsOps = [](SDValue HOp, SDValue Op) {
40776 return Op == HOp.getOperand(0) || Op == HOp.getOperand(1);
40777 };
40778 // Commute if all BC0's ops are contained in BC1.
40779 if (ContainsOps(BC1, BC0.getOperand(0)) &&
40780 ContainsOps(BC1, BC0.getOperand(1))) {
40781 ShuffleVectorSDNode::commuteMask(Mask);
40782 std::swap(Ops[0], Ops[1]);
40783 std::swap(BC0, BC1);
40784 }
40785
40786 // If BC1 can be represented by BC0, then convert to unary shuffle.
40787 if (ContainsOps(BC0, BC1.getOperand(0)) &&
40788 ContainsOps(BC0, BC1.getOperand(1))) {
40789 for (int &M : Mask) {
40790 if (M < NumElts) // BC0 element or UNDEF/Zero sentinel.
40791 continue;
40792 int SubLane = ((M % NumEltsPerLane) >= NumHalfEltsPerLane) ? 1 : 0;
40793 M -= NumElts + (SubLane * NumHalfEltsPerLane);
40794 if (BC1.getOperand(SubLane) != BC0.getOperand(0))
40795 M += NumHalfEltsPerLane;
40796 }
40797 }
40798 }
40799
40800 // Canonicalize unary horizontal ops to only refer to lower halves.
40801 for (int i = 0; i != NumElts; ++i) {
40802 int &M = Mask[i];
40803 if (isUndefOrZero(M))
40804 continue;
40805 if (M < NumElts && BC0.getOperand(0) == BC0.getOperand(1) &&
40806 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
40807 M -= NumHalfEltsPerLane;
40808 if (NumElts <= M && BC1.getOperand(0) == BC1.getOperand(1) &&
40809 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
40810 M -= NumHalfEltsPerLane;
40811 }
40812 }
40813
40814 // Combine binary shuffle of 2 similar 'Horizontal' instructions into a
40815 // single instruction. Attempt to match a v2X64 repeating shuffle pattern that
40816 // represents the LHS/RHS inputs for the lower/upper halves.
40817 SmallVector<int, 16> TargetMask128, WideMask128;
40818 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, TargetMask128) &&
40819 scaleShuffleElements(TargetMask128, 2, WideMask128)) {
40820 assert(isUndefOrZeroOrInRange(WideMask128, 0, 4) && "Illegal shuffle");
40821 bool SingleOp = (Ops.size() == 1);
40822 if (isPack || OneUseOps ||
40823 shouldUseHorizontalOp(SingleOp, DAG, Subtarget)) {
40824 SDValue Lo = isInRange(WideMask128[0], 0, 2) ? BC0 : BC1;
40825 SDValue Hi = isInRange(WideMask128[1], 0, 2) ? BC0 : BC1;
40826 Lo = Lo.getOperand(WideMask128[0] & 1);
40827 Hi = Hi.getOperand(WideMask128[1] & 1);
40828 if (SingleOp) {
40829 SDValue Undef = DAG.getUNDEF(SrcVT);
40830 SDValue Zero = getZeroVector(SrcVT, Subtarget, DAG, DL);
40831 Lo = (WideMask128[0] == SM_SentinelZero ? Zero : Lo);
40832 Hi = (WideMask128[1] == SM_SentinelZero ? Zero : Hi);
40833 Lo = (WideMask128[0] == SM_SentinelUndef ? Undef : Lo);
40834 Hi = (WideMask128[1] == SM_SentinelUndef ? Undef : Hi);
40835 }
40836 return DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
40837 }
40838 }
40839
40840 return SDValue();
40841}
40842
40843// Attempt to constant fold all of the constant source ops.
40844// Returns true if the entire shuffle is folded to a constant.
40845// TODO: Extend this to merge multiple constant Ops and update the mask.
40846static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops,
40847 ArrayRef<int> Mask, SDValue Root,
40848 bool HasVariableMask,
40849 SelectionDAG &DAG,
40850 const X86Subtarget &Subtarget) {
40851 MVT VT = Root.getSimpleValueType();
40852
40853 unsigned SizeInBits = VT.getSizeInBits();
40854 unsigned NumMaskElts = Mask.size();
40855 unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
40856 unsigned NumOps = Ops.size();
40857
40858 // Extract constant bits from each source op.
40859 SmallVector<APInt, 16> UndefEltsOps(NumOps);
40860 SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
40861 for (unsigned I = 0; I != NumOps; ++I)
40862 if (!getTargetConstantBitsFromNode(Ops[I], MaskSizeInBits, UndefEltsOps[I],
40863 RawBitsOps[I]))
40864 return SDValue();
40865
40866 // If we're optimizing for size, only fold if at least one of the constants is
40867 // only used once or the combined shuffle has included a variable mask
40868 // shuffle; this is to avoid constant pool bloat.
40869 bool IsOptimizingSize = DAG.shouldOptForSize();
40870 if (IsOptimizingSize && !HasVariableMask &&
40871 llvm::none_of(Ops, [](SDValue SrcOp) { return SrcOp->hasOneUse(); }))
40872 return SDValue();
40873
40874 // Shuffle the constant bits according to the mask.
40875 SDLoc DL(Root);
40876 APInt UndefElts(NumMaskElts, 0);
40877 APInt ZeroElts(NumMaskElts, 0);
40878 APInt ConstantElts(NumMaskElts, 0);
40879 SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
40880 APInt::getZero(MaskSizeInBits));
40881 for (unsigned i = 0; i != NumMaskElts; ++i) {
40882 int M = Mask[i];
40883 if (M == SM_SentinelUndef) {
40884 UndefElts.setBit(i);
40885 continue;
40886 } else if (M == SM_SentinelZero) {
40887 ZeroElts.setBit(i);
40888 continue;
40889 }
40890 assert(0 <= M && M < (int)(NumMaskElts * NumOps));
40891
40892 unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
40893 unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
40894
40895 auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
40896 if (SrcUndefElts[SrcMaskIdx]) {
40897 UndefElts.setBit(i);
40898 continue;
40899 }
40900
40901 auto &SrcEltBits = RawBitsOps[SrcOpIdx];
40902 APInt &Bits = SrcEltBits[SrcMaskIdx];
40903 if (!Bits) {
40904 ZeroElts.setBit(i);
40905 continue;
40906 }
40907
40908 ConstantElts.setBit(i);
40909 ConstantBitData[i] = Bits;
40910 }
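// [Editor's note] Sketch (not LLVM code) of the index arithmetic in the loop
// above: a mask element M addresses source op M / NumMaskElts, element
// M % NumMaskElts, so constant folding is just a gather over the per-op
// constant bits. APInt is replaced by uint64_t for brevity; names are
// illustrative.
#include <cstdint>
#include <vector>

std::vector<uint64_t>
sketchFoldConstantShuffle(const std::vector<std::vector<uint64_t>> &OpBits,
                          const std::vector<int> &Mask) {
  const size_t NumMaskElts = Mask.size();
  std::vector<uint64_t> Out(NumMaskElts, 0);
  for (size_t i = 0; i != NumMaskElts; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue; // undef/zero sentinel: leave the element as zero here.
    size_t SrcOpIdx = size_t(M) / NumMaskElts;
    size_t SrcMaskIdx = size_t(M) % NumMaskElts;
    Out[i] = OpBits[SrcOpIdx][SrcMaskIdx];
  }
  return Out;
}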
40911 assert((UndefElts | ZeroElts | ConstantElts).isAllOnes());
40912
40913 // Attempt to create a zero vector.
40914 if ((UndefElts | ZeroElts).isAllOnes())
40915 return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG, DL);
40916
40917 // Create the constant data.
40918 MVT MaskSVT;
40919 if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
40920 MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
40921 else
40922 MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
40923
40924 MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
40925 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
40926 return SDValue();
40927
40928 SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
40929 return DAG.getBitcast(VT, CstOp);
40930}
40931
40932namespace llvm {
40933 namespace X86 {
40934 enum {
40935 MaxShuffleCombineDepth = 8
40936 };
40937 }
40938} // namespace llvm
40939
40940/// Fully generic combining of x86 shuffle instructions.
40941///
40942/// This should be the last combine run over the x86 shuffle instructions. Once
40943/// they have been fully optimized, this will recursively consider all chains
40944/// of single-use shuffle instructions, build a generic model of the cumulative
40945/// shuffle operation, and check for simpler instructions which implement this
40946/// operation. We use this primarily for two purposes:
40947///
40948/// 1) Collapse generic shuffles to specialized single instructions when
40949/// equivalent. In most cases, this is just an encoding size win, but
40950/// sometimes we will collapse multiple generic shuffles into a single
40951/// special-purpose shuffle.
40952/// 2) Look for sequences of shuffle instructions with 3 or more total
40953/// instructions, and replace them with the slightly more expensive SSSE3
40954/// PSHUFB instruction if available. We do this as the last combining step
40955/// to ensure we avoid using PSHUFB if we can implement the shuffle with
40956/// a suitable short sequence of other instructions. The PSHUFB will either
40957/// use a register or have to read from memory and so is slightly (but only
40958/// slightly) more expensive than the other shuffle instructions.
40959///
40960/// Because this is inherently a quadratic operation (for each shuffle in
40961/// a chain, we recurse up the chain), the depth is limited to 8 instructions.
40962/// This should never be an issue in practice as the shuffle lowering doesn't
40963/// produce sequences of more than 8 instructions.
40964///
40965/// FIXME: We will currently miss some cases where the redundant shuffling
40966/// would simplify under the threshold for PSHUFB formation because of
40967/// combine-ordering. To fix this, we should do the redundant instruction
40968/// combining in this recursive walk.
40969static SDValue combineX86ShufflesRecursively(
40970 ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,
40971 ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
40972 unsigned MaxDepth, bool HasVariableMask, bool AllowVariableCrossLaneMask,
40973 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
40974 const X86Subtarget &Subtarget) {
40975 assert(!RootMask.empty() &&
40976 (RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) &&
40977 "Illegal shuffle root mask");
40978 MVT RootVT = Root.getSimpleValueType();
40979 assert(RootVT.isVector() && "Shuffles operate on vector types!");
40980 unsigned RootSizeInBits = RootVT.getSizeInBits();
40981
40982 // Bound the depth of our recursive combine because this is ultimately
40983 // quadratic in nature.
40984 if (Depth >= MaxDepth)
40985 return SDValue();
40986
40987 // Directly rip through bitcasts to find the underlying operand.
40988 SDValue Op = SrcOps[SrcOpIndex];
40989 Op = peekThroughOneUseBitcasts(Op);
40990
40991 EVT VT = Op.getValueType();
40992 if (!VT.isVector() || !VT.isSimple())
40993 return SDValue(); // Bail if we hit a non-simple non-vector.
40994
40995 // FIXME: Just bail on f16 for now.
40996 if (VT.getVectorElementType() == MVT::f16)
40997 return SDValue();
40998
40999 assert((RootSizeInBits % VT.getSizeInBits()) == 0 &&
41000 "Can only combine shuffles upto size of the root op.");
41001
41002 // Create a demanded elts mask from the referenced elements of Op.
41003 APInt OpDemandedElts = APInt::getZero(RootMask.size());
41004 for (int M : RootMask) {
41005 int BaseIdx = RootMask.size() * SrcOpIndex;
41006 if (isInRange(M, BaseIdx, BaseIdx + RootMask.size()))
41007 OpDemandedElts.setBit(M - BaseIdx);
41008 }
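// Reviewer annotation (illustrative, not part of the original source): with
// RootMask.size() == 4 and SrcOpIndex == 1, BaseIdx is 4, so a root mask of
// {0, 5, 7, 2} demands only elements 1 and 3 of this operand
// (OpDemandedElts == 0b1010); indices outside [4, 8) refer to other SrcOps.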
41009 if (RootSizeInBits != VT.getSizeInBits()) {
41010 // Op is smaller than Root - extract the demanded elts for the subvector.
41011 unsigned Scale = RootSizeInBits / VT.getSizeInBits();
41012 unsigned NumOpMaskElts = RootMask.size() / Scale;
41013 assert((RootMask.size() % Scale) == 0 && "Root mask size mismatch");
41014 assert(OpDemandedElts
41015 .extractBits(RootMask.size() - NumOpMaskElts, NumOpMaskElts)
41016 .isZero() &&
41017 "Out of range elements referenced in root mask");
41018 OpDemandedElts = OpDemandedElts.extractBits(NumOpMaskElts, 0);
41019 }
41020 OpDemandedElts =
41021 APIntOps::ScaleBitMask(OpDemandedElts, VT.getVectorNumElements());
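// Reviewer annotation (illustrative, not part of the original source):
// ScaleBitMask rescales the demanded-elts mask to Op's element count, e.g. a
// 2-element mask 0b01 scaled up to 8 elements becomes 0b00001111; scaling down
// sets a coarse bit whenever any of its finer bits was demanded.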
41022
41023 // Extract target shuffle mask and resolve sentinels and inputs.
41024 SmallVector<int, 64> OpMask;
41025 SmallVector<SDValue, 2> OpInputs;
41026 APInt OpUndef, OpZero;
41027 bool IsOpVariableMask = isTargetShuffleVariableMask(Op.getOpcode());
41028 if (getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef,
41029 OpZero, DAG, Depth, false)) {
41030 // Shuffle inputs must not be larger than the shuffle result.
41031 // TODO: Relax this for single input faux shuffles (e.g. trunc).
41032 if (llvm::any_of(OpInputs, [VT](SDValue OpInput) {
41033 return OpInput.getValueSizeInBits() > VT.getSizeInBits();
41034 }))
41035 return SDValue();
41036 } else if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
41037 (RootSizeInBits % Op.getOperand(0).getValueSizeInBits()) == 0 &&
41038 !isNullConstant(Op.getOperand(1))) {
41039 SDValue SrcVec = Op.getOperand(0);
41040 int ExtractIdx = Op.getConstantOperandVal(1);
41041 unsigned NumElts = VT.getVectorNumElements();
41042 OpInputs.assign({SrcVec});
41043 OpMask.assign(NumElts, SM_SentinelUndef);
41044 std::iota(OpMask.begin(), OpMask.end(), ExtractIdx);
41045 OpZero = OpUndef = APInt::getZero(NumElts);
41046 } else {
41047 return SDValue();
41048 }
41049
41050 // If the shuffle result was smaller than the root, we need to adjust the
41051 // mask indices and pad the mask with undefs.
41052 if (RootSizeInBits > VT.getSizeInBits()) {
41053 unsigned NumSubVecs = RootSizeInBits / VT.getSizeInBits();
41054 unsigned OpMaskSize = OpMask.size();
41055 if (OpInputs.size() > 1) {
41056 unsigned PaddedMaskSize = NumSubVecs * OpMaskSize;
41057 for (int &M : OpMask) {
41058 if (M < 0)
41059 continue;
41060 int EltIdx = M % OpMaskSize;
41061 int OpIdx = M / OpMaskSize;
41062 M = (PaddedMaskSize * OpIdx) + EltIdx;
41063 }
41064 }
41065 OpZero = OpZero.zext(NumSubVecs * OpMaskSize);
41066 OpUndef = OpUndef.zext(NumSubVecs * OpMaskSize);
41067 OpMask.append((NumSubVecs - 1) * OpMaskSize, SM_SentinelUndef);
41068 }
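// Reviewer annotation (illustrative, not part of the original source): for a
// 2-input v4 op under a v8 root (NumSubVecs == 2, OpMaskSize == 4), an op mask
// of {0, 4, 1, 5} is remapped to {0, 8, 1, 9} so second-input indices land in
// the padded index space, and then 4 undef sentinels are appended.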
41069
41070 SmallVector<int, 64> Mask;
41071 SmallVector<SDValue, 16> Ops;
41072
41073 // We don't need to merge masks if the root is empty.
41074 bool EmptyRoot = (Depth == 0) && (RootMask.size() == 1);
41075 if (EmptyRoot) {
41076 // Only resolve zeros if it will remove an input, otherwise we might end
41077 // up in an infinite loop.
41078 bool ResolveKnownZeros = true;
41079 if (!OpZero.isZero()) {
41080 APInt UsedInputs = APInt::getZero(OpInputs.size());
41081 for (int i = 0, e = OpMask.size(); i != e; ++i) {
41082 int M = OpMask[i];
41083 if (OpUndef[i] || OpZero[i] || isUndefOrZero(M))
41084 continue;
41085 UsedInputs.setBit(M / OpMask.size());
41086 if (UsedInputs.isAllOnes()) {
41087 ResolveKnownZeros = false;
41088 break;
41089 }
41090 }
41091 }
41092 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero,
41093 ResolveKnownZeros);
41094
41095 Mask = OpMask;
41096 Ops.append(OpInputs.begin(), OpInputs.end());
41097 } else {
41098 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);
41099
41100 // Add the inputs to the Ops list, avoiding duplicates.
41101 Ops.append(SrcOps.begin(), SrcOps.end());
41102
41103 auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
41104 // Attempt to find an existing match.
41105 SDValue InputBC = peekThroughBitcasts(Input);
41106 for (int i = 0, e = Ops.size(); i < e; ++i)
41107 if (InputBC == peekThroughBitcasts(Ops[i]))
41108 return i;
41109 // Match failed - should we replace an existing Op?
41110 if (InsertionPoint >= 0) {
41111 Ops[InsertionPoint] = Input;
41112 return InsertionPoint;
41113 }
41114 // Add to the end of the Ops list.
41115 Ops.push_back(Input);
41116 return Ops.size() - 1;
41117 };
41118
41119 SmallVector<int, 2> OpInputIdx;
41120 for (SDValue OpInput : OpInputs)
41121 OpInputIdx.push_back(
41122 AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));
41123
41124 assert(((RootMask.size() > OpMask.size() &&
41125 RootMask.size() % OpMask.size() == 0) ||
41126 (OpMask.size() > RootMask.size() &&
41127 OpMask.size() % RootMask.size() == 0) ||
41128 OpMask.size() == RootMask.size()) &&
41129 "The smaller number of elements must divide the larger.");
41130
41131 // This function can be performance-critical, so we rely on the power-of-2
41132 // knowledge that we have about the mask sizes to replace div/rem ops with
41133 // bit-masks and shifts.
41134 assert(llvm::has_single_bit<uint32_t>(RootMask.size()) &&
41135 "Non-power-of-2 shuffle mask sizes");
41136 assert(llvm::has_single_bit<uint32_t>(OpMask.size()) &&
41137 "Non-power-of-2 shuffle mask sizes");
41138 unsigned RootMaskSizeLog2 = llvm::countr_zero(RootMask.size());
41139 unsigned OpMaskSizeLog2 = llvm::countr_zero(OpMask.size());
41140
41141 unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
41142 unsigned RootRatio =
41143 std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
41144 unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
41145 assert((RootRatio == 1 || OpRatio == 1) &&
41146 "Must not have a ratio for both incoming and op masks!");
41147
41148 assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
41149 assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
41150 assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
41151 unsigned RootRatioLog2 = llvm::countr_zero(RootRatio);
41152 unsigned OpRatioLog2 = llvm::countr_zero(OpRatio);
41153
41154 Mask.resize(MaskWidth, SM_SentinelUndef);
41155
41156 // Merge this shuffle operation's mask into our accumulated mask. Note that
41157 // this shuffle's mask will be the first applied to the input, followed by
41158 // the root mask to get us all the way to the root value arrangement. The
41159 // reason for this order is that we are recursing up the operation chain.
41160 for (unsigned i = 0; i < MaskWidth; ++i) {
41161 unsigned RootIdx = i >> RootRatioLog2;
41162 if (RootMask[RootIdx] < 0) {
41163 // This is a zero or undef lane, we're done.
41164 Mask[i] = RootMask[RootIdx];
41165 continue;
41166 }
41167
41168 unsigned RootMaskedIdx =
41169 RootRatio == 1
41170 ? RootMask[RootIdx]
41171 : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
41172
41173 // Just insert the scaled root mask value if it references an input other
41174 // than the SrcOp we're currently inserting.
41175 if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
41176 (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
41177 Mask[i] = RootMaskedIdx;
41178 continue;
41179 }
41180
41181 RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
41182 unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
41183 if (OpMask[OpIdx] < 0) {
41184 // The incoming lanes are zero or undef, it doesn't matter which ones we
41185 // are using.
41186 Mask[i] = OpMask[OpIdx];
41187 continue;
41188 }
41189
41190 // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
41191 unsigned OpMaskedIdx = OpRatio == 1 ? OpMask[OpIdx]
41192 : (OpMask[OpIdx] << OpRatioLog2) +
41193 (RootMaskedIdx & (OpRatio - 1));
41194
41195 OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
41196 int InputIdx = OpMask[OpIdx] / (int)OpMask.size();
41197 assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input");
41198 OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth;
41199
41200 Mask[i] = OpMaskedIdx;
41201 }
41202 }
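// Reviewer annotation (illustrative, not part of the original source): as an
// example of the merge above, with RootMask.size() == 4, OpMask.size() == 8
// (RootRatio == 2, OpRatio == 1, MaskWidth == 8) and SrcOpIndex == 0, element
// i == 5 uses RootIdx == 2; if RootMask[2] == 1 then RootMaskedIdx ==
// (1 << 1) + (5 & 1) == 3, so Mask[5] is taken from OpMask[3] (then remapped
// into the Ops index space via OpInputIdx).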
41203
41204 // Peek through vector widenings and set out of bounds mask indices to undef.
41205 // TODO: Can resolveTargetShuffleInputsAndMask do some of this?
41206 for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
41207 SDValue &Op = Ops[I];
41208 if (Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op.getOperand(0).isUndef() &&
41209 isNullConstant(Op.getOperand(2))) {
41210 Op = Op.getOperand(1);
41211 unsigned Scale = RootSizeInBits / Op.getValueSizeInBits();
41212 int Lo = I * Mask.size();
41213 int Hi = (I + 1) * Mask.size();
41214 int NewHi = Lo + (Mask.size() / Scale);
41215 for (int &M : Mask) {
41216 if (Lo <= M && NewHi <= M && M < Hi)
41217 M = SM_SentinelUndef;
41218 }
41219 }
41220 }
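// Reviewer annotation (illustrative, not part of the original source): e.g.
// for Op index I == 1 with Mask.size() == 8 and Scale == 2, Lo == 8,
// NewHi == 12 and Hi == 16, so mask indices in [12, 16) refer to the undef
// widening padding that was just peeled away and are reset to SM_SentinelUndef.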
41221
41222 // Peek through any free extract_subvector nodes back to root size.
41223 for (SDValue &Op : Ops)
41224 while (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
41225 (RootSizeInBits % Op.getOperand(0).getValueSizeInBits()) == 0 &&
41226 isNullConstant(Op.getOperand(1)))
41227 Op = Op.getOperand(0);
41228
41229 // Remove unused/repeated shuffle source ops.
41230 resolveTargetShuffleInputsAndMask(Ops, Mask);
41231
41232 // Handle the all undef/zero/ones cases early.
41233 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
41234 return DAG.getUNDEF(RootVT);
41235 if (all_of(Mask, [](int Idx) { return Idx < 0; }))
41236 return getZeroVector(RootVT, Subtarget, DAG, SDLoc(Root));
41237 if (Ops.size() == 1 && ISD::isBuildVectorAllOnes(Ops[0].getNode()) &&
41238 !llvm::is_contained(Mask, SM_SentinelZero))
41239 return getOnesVector(RootVT, DAG, SDLoc(Root));
41240
41241 assert(!Ops.empty() && "Shuffle with no inputs detected");
41242 HasVariableMask |= IsOpVariableMask;
41243
41244 // Update the list of shuffle nodes that have been combined so far.
41245 SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
41246 SrcNodes.end());
41247 CombinedNodes.push_back(Op.getNode());
41248
41249 // See if we can recurse into each shuffle source op (if it's a target
41250 // shuffle). The source op should only be generally combined if it either has
41251 // a single use (i.e. current Op) or all its users have already been combined;
41252 // if not, then we can still combine but should prevent generation of variable
41253 // shuffles to avoid constant pool bloat.
41254 // Don't recurse if we already have more source ops than we can combine in
41255 // the remaining recursion depth.
41256 if (Ops.size() < (MaxDepth - Depth)) {
41257 for (int i = 0, e = Ops.size(); i < e; ++i) {
41258 // For empty roots, we need to resolve zeroable elements before combining
41259 // them with other shuffles.
41260 SmallVector<int, 64> ResolvedMask = Mask;
41261 if (EmptyRoot)
41262 resolveTargetShuffleFromZeroables(ResolvedMask, OpUndef, OpZero);
41263 bool AllowCrossLaneVar = false;
41264 bool AllowPerLaneVar = false;
41265 if (Ops[i].getNode()->hasOneUse() ||
41266 SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode())) {
41267 AllowCrossLaneVar = AllowVariableCrossLaneMask;
41268 AllowPerLaneVar = AllowVariablePerLaneMask;
41269 }
41270 if (SDValue Res = combineX86ShufflesRecursively(
41271 Ops, i, Root, ResolvedMask, CombinedNodes, Depth + 1, MaxDepth,
41272 HasVariableMask, AllowCrossLaneVar, AllowPerLaneVar, DAG,
41273 Subtarget))
41274 return Res;
41275 }
41276 }
41277
41278 // Attempt to constant fold all of the constant source ops.
41279 if (SDValue Cst = combineX86ShufflesConstants(
41280 Ops, Mask, Root, HasVariableMask, DAG, Subtarget))
41281 return Cst;
41282
41283 // If constant fold failed and we only have constants - then we have
41284 // multiple uses by a single non-variable shuffle - just bail.
41285 if (Depth == 0 && llvm::all_of(Ops, [&](SDValue Op) {
41286 APInt UndefElts;
41287 SmallVector<APInt> RawBits;
41288 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
41289 return getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
41290 RawBits);
41291 })) {
41292 return SDValue();
41293 }
41294
41295 // Canonicalize the combined shuffle mask chain with horizontal ops.
41296 // NOTE: This will update the Ops and Mask.
41297 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
41298 Ops, Mask, RootSizeInBits, SDLoc(Root), DAG, Subtarget))
41299 return DAG.getBitcast(RootVT, HOp);
41300
41301 // Try to refine our inputs given our knowledge of target shuffle mask.
41302 for (auto I : enumerate(Ops)) {
41303 int OpIdx = I.index();
41304 SDValue &Op = I.value();
41305
41306 // What range of shuffle mask element values results in picking from Op?
41307 int Lo = OpIdx * Mask.size();
41308 int Hi = Lo + Mask.size();
41309
41310 // Which elements of Op do we demand, given the mask's granularity?
41311 APInt OpDemandedElts(Mask.size(), 0);
41312 for (int MaskElt : Mask) {
41313 if (isInRange(MaskElt, Lo, Hi)) { // Picks from Op?
41314 int OpEltIdx = MaskElt - Lo;
41315 OpDemandedElts.setBit(OpEltIdx);
41316 }
41317 }
41318
41319 // Is the shuffle result smaller than the root?
41320 if (Op.getValueSizeInBits() < RootSizeInBits) {
41321 // We padded the mask with undefs. But we now need to undo that.
41322 unsigned NumExpectedVectorElts = Mask.size();
41323 unsigned EltSizeInBits = RootSizeInBits / NumExpectedVectorElts;
41324 unsigned NumOpVectorElts = Op.getValueSizeInBits() / EltSizeInBits;
41325 assert(!OpDemandedElts.extractBits(
41326 NumExpectedVectorElts - NumOpVectorElts, NumOpVectorElts) &&
41327 "Demanding the virtual undef widening padding?");
41328 OpDemandedElts = OpDemandedElts.trunc(NumOpVectorElts); // NUW
41329 }
41330
41331 // The Op itself may be of different VT, so we need to scale the mask.
41332 unsigned NumOpElts = Op.getValueType().getVectorNumElements();
41333 APInt OpScaledDemandedElts = APIntOps::ScaleBitMask(OpDemandedElts, NumOpElts);
41334
41335 // Can this operand be simplified any further, given its demanded elements?
41336 if (SDValue NewOp =
41337 DAG.getTargetLoweringInfo().SimplifyMultipleUseDemandedVectorElts(
41338 Op, OpScaledDemandedElts, DAG))
41339 Op = NewOp;
41340 }
41341 // FIXME: should we rerun resolveTargetShuffleInputsAndMask() now?
41342
41343 // Widen any subvector shuffle inputs we've collected.
41344 // TODO: Remove this to avoid generating temporary nodes, we should only
41345 // widen once combineX86ShuffleChain has found a match.
41346 if (any_of(Ops, [RootSizeInBits](SDValue Op) {
41347 return Op.getValueSizeInBits() < RootSizeInBits;
41348 })) {
41349 for (SDValue &Op : Ops)
41350 if (Op.getValueSizeInBits() < RootSizeInBits)
41351 Op = widenSubVector(Op, false, Subtarget, DAG, SDLoc(Op),
41352 RootSizeInBits);
41353 // Reresolve - we might have repeated subvector sources.
41354 resolveTargetShuffleInputsAndMask(Ops, Mask);
41355 }
41356
41357 // We can only combine unary and binary shuffle mask cases.
41358 if (Ops.size() <= 2) {
41359 // Minor canonicalization of the accumulated shuffle mask to make it easier
41360 // to match below. All this does is detect masks with sequential pairs of
41361 // elements, and shrink them to the half-width mask. It does this in a loop
41362 // so it will reduce the size of the mask to the minimal width mask which
41363 // performs an equivalent shuffle.
41364 while (Mask.size() > 1) {
41365 SmallVector<int, 64> WidenedMask;
41366 if (!canWidenShuffleElements(Mask, WidenedMask))
41367 break;
41368 Mask = std::move(WidenedMask);
41369 }
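// Reviewer annotation (illustrative, not part of the original source): e.g. a
// mask of {0, 1, 4, 5} widens to {0, 2} on the first iteration; {0, 2} has no
// sequential pairs, so the loop stops with the minimal equivalent mask.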
41370
41371 // Canonicalization of binary shuffle masks to improve pattern matching by
41372 // commuting the inputs.
41373 if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
41374 ShuffleVectorSDNode::commuteMask(Mask);
41375 std::swap(Ops[0], Ops[1]);
41376 }
41377
41378 // Try to combine into a single shuffle instruction.
41379 if (SDValue Shuffle = combineX86ShuffleChain(
41380 Ops, Root, Mask, Depth, HasVariableMask, AllowVariableCrossLaneMask,
41381 AllowVariablePerLaneMask, DAG, Subtarget))
41382 return Shuffle;
41383
41384 // If all the operands come from the same larger vector, fallthrough and try
41385 // to use combineX86ShuffleChainWithExtract.
41386 SDValue LHS = peekThroughBitcasts(Ops.front());
41387 SDValue RHS = peekThroughBitcasts(Ops.back());
41388 if (Ops.size() != 2 || !Subtarget.hasAVX2() || RootSizeInBits != 128 ||
41389 (RootSizeInBits / Mask.size()) != 64 ||
41390 LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
41391 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
41392 LHS.getOperand(0) != RHS.getOperand(0))
41393 return SDValue();
41394 }
41395
41396 // If that failed and any input is extracted then try to combine as a
41397 // shuffle with the larger type.
41398 return combineX86ShuffleChainWithExtract(
41399 Ops, Root, Mask, Depth, HasVariableMask, AllowVariableCrossLaneMask,
41400 AllowVariablePerLaneMask, DAG, Subtarget);
41401}
41402
41403/// Helper entry wrapper to combineX86ShufflesRecursively.
41404static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG,
41405 const X86Subtarget &Subtarget) {
41406 return combineX86ShufflesRecursively(
41407 {Op}, 0, Op, {0}, {}, /*Depth*/ 0, X86::MaxShuffleCombineDepth,
41408 /*HasVarMask*/ false,
41409 /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, DAG,
41410 Subtarget);
41411}
41412
41413/// Get the PSHUF-style mask from PSHUF node.
41414///
41415 /// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
41416/// PSHUF-style masks that can be reused with such instructions.
41417static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
41418 MVT VT = N.getSimpleValueType();
41419 SmallVector<int, 4> Mask;
41420 SmallVector<SDValue, 2> Ops;
41421 bool HaveMask =
41422 getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask);
41423 (void)HaveMask;
41424 assert(HaveMask);
41425
41426 // If we have more than 128-bits, only the low 128-bits of shuffle mask
41427 // matter. Check that the upper masks are repeats and remove them.
41428 if (VT.getSizeInBits() > 128) {
41429 int LaneElts = 128 / VT.getScalarSizeInBits();
41430#ifndef NDEBUG
41431 for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
41432 for (int j = 0; j < LaneElts; ++j)
41433 assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
41434 "Mask doesn't repeat in high 128-bit lanes!");
41435#endif
41436 Mask.resize(LaneElts);
41437 }
41438
41439 switch (N.getOpcode()) {
41440 case X86ISD::PSHUFD:
41441 return Mask;
41442 case X86ISD::PSHUFLW:
41443 Mask.resize(4);
41444 return Mask;
41445 case X86ISD::PSHUFHW:
41446 Mask.erase(Mask.begin(), Mask.begin() + 4);
41447 for (int &M : Mask)
41448 M -= 4;
41449 return Mask;
41450 default:
41451 llvm_unreachable("No valid shuffle instruction found!");
41452 }
41453}
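// Reviewer annotation (illustrative, not part of the original source): for
// PSHUFHW the low-word entries are dropped and the high-word entries rebased,
// so a v8i16 mask of {0, 1, 2, 3, 7, 6, 5, 4} yields the 4-element PSHUF-style
// mask {3, 2, 1, 0}.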
41454
41455/// Search for a combinable shuffle across a chain ending in pshufd.
41456///
41457/// We walk up the chain and look for a combinable shuffle, skipping over
41458/// shuffles that we could hoist this shuffle's transformation past without
41459/// altering anything.
41460static SDValue
41461combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
41462 SelectionDAG &DAG) {
41463 assert(N.getOpcode() == X86ISD::PSHUFD &&
41464 "Called with something other than an x86 128-bit half shuffle!");
41465 SDLoc DL(N);
41466
41467 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
41468 // of the shuffles in the chain so that we can form a fresh chain to replace
41469 // this one.
41470 SmallVector<SDValue, 8> Chain;
41471 SDValue V = N.getOperand(0);
41472 for (; V.hasOneUse(); V = V.getOperand(0)) {
41473 switch (V.getOpcode()) {
41474 default:
41475 return SDValue(); // Nothing combined!
41476
41477 case ISD::BITCAST:
41478 // Skip bitcasts as we always know the type for the target specific
41479 // instructions.
41480 continue;
41481
41482 case X86ISD::PSHUFD:
41483 // Found another dword shuffle.
41484 break;
41485
41486 case X86ISD::PSHUFLW:
41487 // Check that the low words (being shuffled) are the identity in the
41488 // dword shuffle, and the high words are self-contained.
41489 if (Mask[0] != 0 || Mask[1] != 1 ||
41490 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
41491 return SDValue();
41492
41493 Chain.push_back(V);
41494 continue;
41495
41496 case X86ISD::PSHUFHW:
41497 // Check that the high words (being shuffled) are the identity in the
41498 // dword shuffle, and the low words are self-contained.
41499 if (Mask[2] != 2 || Mask[3] != 3 ||
41500 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
41501 return SDValue();
41502
41503 Chain.push_back(V);
41504 continue;
41505
41506 case X86ISD::UNPCKL:
41507 case X86ISD::UNPCKH:
41508 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
41509 // shuffle into a preceding word shuffle.
41510 if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
41511 V.getSimpleValueType().getVectorElementType() != MVT::i16)
41512 return SDValue();
41513
41514 // Search for a half-shuffle which we can combine with.
41515 unsigned CombineOp =
41516 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
41517 if (V.getOperand(0) != V.getOperand(1) ||
41518 !V->isOnlyUserOf(V.getOperand(0).getNode()))
41519 return SDValue();
41520 Chain.push_back(V);
41521 V = V.getOperand(0);
41522 do {
41523 switch (V.getOpcode()) {
41524 default:
41525 return SDValue(); // Nothing to combine.
41526
41527 case X86ISD::PSHUFLW:
41528 case X86ISD::PSHUFHW:
41529 if (V.getOpcode() == CombineOp)
41530 break;
41531
41532 Chain.push_back(V);
41533
41534 [[fallthrough]];
41535 case ISD::BITCAST:
41536 V = V.getOperand(0);
41537 continue;
41538 }
41539 break;
41540 } while (V.hasOneUse());
41541 break;
41542 }
41543 // Break out of the loop if we break out of the switch.
41544 break;
41545 }
41546
41547 if (!V.hasOneUse())
41548 // We fell out of the loop without finding a viable combining instruction.
41549 return SDValue();
41550
41551 // Merge this node's mask and our incoming mask.
41552 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
41553 for (int &M : Mask)
41554 M = VMask[M];
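// Reviewer annotation (illustrative, not part of the original source): this
// composes the masks as Mask[i] = VMask[Mask[i]]; e.g. Mask == {2, 3, 0, 1}
// composed with VMask == {1, 0, 3, 2} gives {3, 2, 1, 0}.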
41555 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
41556 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
41557
41558 // Rebuild the chain around this new shuffle.
41559 while (!Chain.empty()) {
41560 SDValue W = Chain.pop_back_val();
41561
41562 if (V.getValueType() != W.getOperand(0).getValueType())
41563 V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
41564
41565 switch (W.getOpcode()) {
41566 default:
41567 llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
41568
41569 case X86ISD::UNPCKL:
41570 case X86ISD::UNPCKH:
41571 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
41572 break;
41573
41574 case X86ISD::PSHUFD:
41575 case X86ISD::PSHUFLW:
41576 case X86ISD::PSHUFHW:
41577 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
41578 break;
41579 }
41580 }
41581 if (V.getValueType() != N.getValueType())
41582 V = DAG.getBitcast(N.getValueType(), V);
41583
41584 // Return the new chain to replace N.
41585 return V;
41586}
41587
41588// Attempt to commute shufps LHS loads:
41589// permilps(shufps(load(),x)) --> permilps(shufps(x,load()))
41590static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
41591 SelectionDAG &DAG) {
41592 // TODO: Add vXf64 support.
41593 if (VT != MVT::v4f32 && VT != MVT::v8f32 && VT != MVT::v16f32)
41594 return SDValue();
41595
41596 // SHUFP(LHS, RHS) -> SHUFP(RHS, LHS) iff LHS is foldable + RHS is not.
41597 auto commuteSHUFP = [&VT, &DL, &DAG](SDValue Parent, SDValue V) {
41598 if (V.getOpcode() != X86ISD::SHUFP || !Parent->isOnlyUserOf(V.getNode()))
41599 return SDValue();
41600 SDValue N0 = V.getOperand(0);
41601 SDValue N1 = V.getOperand(1);
41602 unsigned Imm = V.getConstantOperandVal(2);
41603 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
41604 if (!X86::mayFoldLoad(peekThroughOneUseBitcasts(N0), Subtarget) ||
41605 X86::mayFoldLoad(peekThroughOneUseBitcasts(N1), Subtarget))
41606 return SDValue();
41607 Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
41608 return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
41609 DAG.getTargetConstant(Imm, DL, MVT::i8));
41610 };
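// Reviewer annotation (illustrative, not part of the original source): the
// nibble swap (e.g. Imm 0xB4 -> 0x4B) makes the commuted SHUFP produce the
// original elements with its two 64-bit halves exchanged; the callers below
// compensate by XOR-adjusting their own immediates.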
41611
41612 switch (N.getOpcode()) {
41613 case X86ISD::VPERMILPI:
41614 if (SDValue NewSHUFP = commuteSHUFP(N, N.getOperand(0))) {
41615 unsigned Imm = N.getConstantOperandVal(1);
41616 return DAG.getNode(X86ISD::VPERMILPI, DL, VT, NewSHUFP,
41617 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
41618 }
41619 break;
41620 case X86ISD::SHUFP: {
41621 SDValue N0 = N.getOperand(0);
41622 SDValue N1 = N.getOperand(1);
41623 unsigned Imm = N.getConstantOperandVal(2);
41624 if (N0 == N1) {
41625 if (SDValue NewSHUFP = commuteSHUFP(N, N0))
41626 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, NewSHUFP,
41627 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
41628 } else if (SDValue NewSHUFP = commuteSHUFP(N, N0)) {
41629 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, N1,
41630 DAG.getTargetConstant(Imm ^ 0x0A, DL, MVT::i8));
41631 } else if (SDValue NewSHUFP = commuteSHUFP(N, N1)) {
41632 return DAG.getNode(X86ISD::SHUFP, DL, VT, N0, NewSHUFP,
41633 DAG.getTargetConstant(Imm ^ 0xA0, DL, MVT::i8));
41634 }
41635 break;
41636 }
41637 }
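// Reviewer annotation (illustrative, not part of the original source): since
// the commuted SHUFP returns the original value with its halves swapped, each
// user XORs the affected 2-bit selectors with 2 (0xAA for all four selectors,
// 0x0A / 0xA0 for only those reading the commuted operand), which remaps
// 0<->2 and 1<->3 and restores the original element selection.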
41638
41639 return SDValue();
41640}
41641
41642// Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
41643static SDValue canonicalizeShuffleWithBinOps(SDValue N, SelectionDAG &DAG,
41644 const SDLoc &DL) {
41645 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41646 EVT ShuffleVT = N.getValueType();
41647
41648 auto IsMergeableWithShuffle = [&DAG](SDValue Op, bool FoldLoad = false) {
41649 // AllZeros/AllOnes constants are freely shuffled and will peek through
41650 // bitcasts. Other constant build vectors do not peek through bitcasts. Only
41651 // merge with target shuffles if it has one use so shuffle combining is
41652 // likely to kick in. Shuffles of splats are expected to be removed.
41653 return ISD::isBuildVectorAllOnes(Op.getNode()) ||
41654 ISD::isBuildVectorAllZeros(Op.getNode()) ||
41655 ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) ||
41656 ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode()) ||
41657 (Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op->hasOneUse()) ||
41658 (isTargetShuffle(Op.getOpcode()) && Op->hasOneUse()) ||
41659 (FoldLoad && isShuffleFoldableLoad(Op)) ||
41660 DAG.isSplatValue(Op, /*AllowUndefs*/ false);
41661 };
41662 auto IsSafeToMoveShuffle = [ShuffleVT](SDValue Op, unsigned BinOp) {
41663 // Ensure we only shuffle whole vector src elements, unless it's a logical
41664 // binop where we can more aggressively move shuffles from dst to src.
41665 return BinOp == ISD::AND || BinOp == ISD::OR || BinOp == ISD::XOR ||
41666 BinOp == X86ISD::ANDNP ||
41667 (Op.getScalarValueSizeInBits() <= ShuffleVT.getScalarSizeInBits());
41668 };
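// Reviewer annotation (illustrative, not part of the original source): the
// canonicalization below rewrites e.g. PSHUFD(AND(X, SplatC)) as
// AND(PSHUFD(X), PSHUFD(SplatC)); the shuffle of the constant/splat operand is
// expected to fold away, so the overall shuffle count should not increase.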
41669
41670 unsigned Opc = N.getOpcode();
41671 switch (Opc) {
41672 // Unary and Unary+Permute Shuffles.
41673 case X86ISD::PSHUFB: {
41674 // Don't merge PSHUFB if it contains zero'd elements.
41675 SmallVector<int> Mask;
41676 SmallVector<SDValue> Ops;
41677 if (!getTargetShuffleMask(N.getNode(), ShuffleVT.getSimpleVT(), false, Ops,
41678 Mask))
41679 break;
41680 [[fallthrough]];
41681 }
41682 case X86ISD::VBROADCAST:
41683 case X86ISD::MOVDDUP:
41684 case X86ISD::PSHUFD:
41685 case X86ISD::PSHUFHW:
41686 case X86ISD::PSHUFLW:
41687 case X86ISD::VPERMI:
41688 case X86ISD::VPERMILPI: {
41689 if (N.getOperand(0).getValueType() == ShuffleVT &&
41690 N->isOnlyUserOf(N.getOperand(0).getNode())) {
41691 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
41692 unsigned SrcOpcode = N0.getOpcode();
41693 if (TLI.isBinOp(SrcOpcode) && IsSafeToMoveShuffle(N0, SrcOpcode)) {
41694 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
41695 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
41696 if (IsMergeableWithShuffle(Op00, Opc != X86ISD::PSHUFB) ||
41697 IsMergeableWithShuffle(Op01, Opc != X86ISD::PSHUFB)) {
41698 SDValue LHS, RHS;
41699 Op00 = DAG.getBitcast(ShuffleVT, Op00);
41700 Op01 = DAG.getBitcast(ShuffleVT, Op01);
41701 if (N.getNumOperands() == 2) {
41702 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, N.getOperand(1));
41703 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, N.getOperand(1));
41704 } else {
41705 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00);
41706 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01);
41707 }
41708 EVT OpVT = N0.getValueType();
41709 return DAG.getBitcast(ShuffleVT,
41710 DAG.getNode(SrcOpcode, DL, OpVT,
41711 DAG.getBitcast(OpVT, LHS),
41712 DAG.getBitcast(OpVT, RHS)));
41713 }
41714 }
41715 }
41716 break;
41717 }
41718 // Binary and Binary+Permute Shuffles.
41719 case X86ISD::INSERTPS: {
41720 // Don't merge INSERTPS if it contains zero'd elements.
41721 unsigned InsertPSMask = N.getConstantOperandVal(2);
41722 unsigned ZeroMask = InsertPSMask & 0xF;
41723 if (ZeroMask != 0)
41724 break;
41725 [[fallthrough]];
41726 }
41727 case X86ISD::MOVSD:
41728 case X86ISD::MOVSS:
41729 case X86ISD::BLENDI:
41730 case X86ISD::SHUFP:
41731 case X86ISD::UNPCKH:
41732 case X86ISD::UNPCKL: {
41733 if (N->isOnlyUserOf(N.getOperand(0).getNode()) &&
41734 N->isOnlyUserOf(N.getOperand(1).getNode())) {
41735 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
41736 SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1));
41737 unsigned SrcOpcode = N0.getOpcode();
41738 if (TLI.isBinOp(SrcOpcode) && N1.getOpcode() == SrcOpcode &&
41739 IsSafeToMoveShuffle(N0, SrcOpcode) &&
41740 IsSafeToMoveShuffle(N1, SrcOpcode)) {
41741 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
41742 SDValue Op10 = peekThroughOneUseBitcasts(N1.getOperand(0));
41743 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
41744 SDValue Op11 = peekThroughOneUseBitcasts(N1.getOperand(1));
41745 // Ensure the total number of shuffles doesn't increase by folding this
41746 // shuffle through to the source ops.
41747 if (((IsMergeableWithShuffle(Op00) && IsMergeableWithShuffle(Op10)) ||
41748 (IsMergeableWithShuffle(Op01) && IsMergeableWithShuffle(Op11))) ||
41749 ((IsMergeableWithShuffle(Op00) || IsMergeableWithShuffle(Op10)) &&
41750 (IsMergeableWithShuffle(Op01) || IsMergeableWithShuffle(Op11)))) {
41751 SDValue LHS, RHS;
41752 Op00 = DAG.getBitcast(ShuffleVT, Op00);
41753 Op10 = DAG.getBitcast(ShuffleVT, Op10);
41754 Op01 = DAG.getBitcast(ShuffleVT, Op01);
41755 Op11 = DAG.getBitcast(ShuffleVT, Op11);
41756 if (N.getNumOperands() == 3) {
41757 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2));
41758 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11, N.getOperand(2));
41759 } else {
41760 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10);
41761 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11);
41762 }
41763 EVT OpVT = N0.getValueType();
41764 return DAG.getBitcast(ShuffleVT,
41765 DAG.getNode(SrcOpcode, DL, OpVT,
41766 DAG.getBitcast(OpVT, LHS),
41767 DAG.getBitcast(OpVT, RHS)));
41768 }
41769 }
41770 }
41771 break;
41772 }
41773 }
41774 return SDValue();
41775}
41776
41777/// Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
41778static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V,
41779 SelectionDAG &DAG,
41780 const SDLoc &DL) {
41781 assert(V.getOpcode() == X86ISD::VPERM2X128 && "Unknown lane shuffle");
41782
41783 MVT VT = V.getSimpleValueType();
41784 SDValue Src0 = peekThroughBitcasts(V.getOperand(0));
41785 SDValue Src1 = peekThroughBitcasts(V.getOperand(1));
41786 unsigned SrcOpc0 = Src0.getOpcode();
41787 unsigned SrcOpc1 = Src1.getOpcode();
41788 EVT SrcVT0 = Src0.getValueType();
41789 EVT SrcVT1 = Src1.getValueType();
41790
41791 if (!Src1.isUndef() && (SrcVT0 != SrcVT1 || SrcOpc0 != SrcOpc1))
41792 return SDValue();
41793
41794 switch (SrcOpc0) {
41795 case X86ISD::MOVDDUP: {
41796 SDValue LHS = Src0.getOperand(0);
41797 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
41798 SDValue Res =
41799 DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS, V.getOperand(2));
41800 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res);
41801 return DAG.getBitcast(VT, Res);
41802 }
41803 case X86ISD::VPERMILPI:
41804 // TODO: Handle v4f64 permutes with different low/high lane masks.
41805 if (SrcVT0 == MVT::v4f64) {
41806 uint64_t Mask = Src0.getConstantOperandVal(1);
41807 if ((Mask & 0x3) != ((Mask >> 2) & 0x3))
41808 break;
41809 }
41810 [[fallthrough]];
41811 case X86ISD::VSHLI:
41812 case X86ISD::VSRLI:
41813 case X86ISD::VSRAI:
41814 case X86ISD::PSHUFD:
41815 if (Src1.isUndef() || Src0.getOperand(1) == Src1.getOperand(1)) {
41816 SDValue LHS = Src0.getOperand(0);
41817 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
41818 SDValue Res = DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS,
41819 V.getOperand(2));
41820 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res, Src0.getOperand(1));
41821 return DAG.getBitcast(VT, Res);
41822 }
41823 break;
41824 }
41825
41826 return SDValue();
41827}
41828
41829/// Try to combine x86 target specific shuffles.
41830static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
41831 TargetLowering::DAGCombinerInfo &DCI,
41832 const X86Subtarget &Subtarget) {
41833 SDLoc DL(N);
41834 MVT VT = N.getSimpleValueType();
41835 SmallVector<int, 4> Mask;
41836 unsigned Opcode = N.getOpcode();
41837
41838 if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG))
41839 return R;
41840
41841 // Handle specific target shuffles.
41842 switch (Opcode) {
41843 case X86ISD::MOVDDUP: {
41844 SDValue Src = N.getOperand(0);
41845 // Turn a 128-bit MOVDDUP of a full vector load into movddup+vzload.
41846 if (VT == MVT::v2f64 && Src.hasOneUse() &&
41847 ISD::isNormalLoad(Src.getNode())) {
41848 LoadSDNode *LN = cast<LoadSDNode>(Src);
41849 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::f64, MVT::v2f64, DAG)) {
41850 SDValue Movddup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, VZLoad);
41851 DCI.CombineTo(N.getNode(), Movddup);
41852 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
41853 DCI.recursivelyDeleteUnusedNodes(LN);
41854 return N; // Return N so it doesn't get rechecked!
41855 }
41856 }
41857
41858 return SDValue();
41859 }
41860 case X86ISD::VBROADCAST: {
41861 SDValue Src = N.getOperand(0);
41862 SDValue BC = peekThroughBitcasts(Src);
41863 EVT SrcVT = Src.getValueType();
41864 EVT BCVT = BC.getValueType();
41865
41866 // If broadcasting from another shuffle, attempt to simplify it.
41867 // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
41868 if (isTargetShuffle(BC.getOpcode()) &&
41869 VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
41870 unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
41871 SmallVector<int, 16> DemandedMask(BCVT.getVectorNumElements(),
41872 SM_SentinelUndef);
41873 for (unsigned i = 0; i != Scale; ++i)
41874 DemandedMask[i] = i;
41875 if (SDValue Res = combineX86ShufflesRecursively(
41876 {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 0,
41877 X86::MaxShuffleCombineDepth,
41878 /*HasVarMask*/ false, /*AllowCrossLaneVarMask*/ true,
41879 /*AllowPerLaneVarMask*/ true, DAG, Subtarget))
41880 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
41881 DAG.getBitcast(SrcVT, Res));
41882 }
41883
41884 // broadcast(bitcast(src)) -> bitcast(broadcast(src))
41885 // 32-bit targets have to bitcast i64 to f64, so better to bitcast upward.
41886 if (Src.getOpcode() == ISD::BITCAST &&
41887 SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits() &&
41888 DAG.getTargetLoweringInfo().isTypeLegal(BCVT) &&
41889 FixedVectorType::isValidElementType(
41890 BCVT.getScalarType().getTypeForEVT(*DAG.getContext()))) {
41891 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(),
41892 VT.getVectorNumElements());
41893 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
41894 }
41895
41896 // vbroadcast(bitcast(vbroadcast(src))) -> bitcast(vbroadcast(src))
41897 // If we're re-broadcasting a smaller type then broadcast with that type and
41898 // bitcast.
41899 // TODO: Do this for any splat?
41900 if (Src.getOpcode() == ISD::BITCAST &&
41901 (BC.getOpcode() == X86ISD::VBROADCAST ||
41902 BC.getOpcode() == X86ISD::VBROADCAST_LOAD) &&
41903 (VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits()) == 0 &&
41904 (VT.getSizeInBits() % BCVT.getSizeInBits()) == 0) {
41905 MVT NewVT =
41906 MVT::getVectorVT(BCVT.getSimpleVT().getScalarType(),
41907 VT.getSizeInBits() / BCVT.getScalarSizeInBits());
41908 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
41909 }
41910
41911 // Reduce broadcast source vector to lowest 128-bits.
41912 if (SrcVT.getSizeInBits() > 128)
41913 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
41914 extract128BitVector(Src, 0, DAG, DL));
41915
41916 // broadcast(scalar_to_vector(x)) -> broadcast(x).
41917 if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR)
41918 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
41919
41920 // broadcast(extract_vector_elt(x, 0)) -> broadcast(x).
41921 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
41922 isNullConstant(Src.getOperand(1)) &&
41923 DAG.getTargetLoweringInfo().isTypeLegal(
41924 Src.getOperand(0).getValueType()))
41925 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
41926
41927 // Share broadcast with the longest vector and extract low subvector (free).
41928 // Ensure the same SDValue from the SDNode use is being used.
41929 for (SDNode *User : Src->uses())
41930 if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST &&
41931 Src == User->getOperand(0) &&
41932 User->getValueSizeInBits(0).getFixedValue() >
41933 VT.getFixedSizeInBits()) {
41934 return extractSubVector(SDValue(User, 0), 0, DAG, DL,
41935 VT.getSizeInBits());
41936 }
41937
41938 // vbroadcast(scalarload X) -> vbroadcast_load X
41939 // For float loads, extract other uses of the scalar from the broadcast.
41940 if (!SrcVT.isVector() && (Src.hasOneUse() || VT.isFloatingPoint()) &&
41941 ISD::isNormalLoad(Src.getNode())) {
41942 LoadSDNode *LN = cast<LoadSDNode>(Src);
41943 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
41944 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
41945 SDValue BcastLd =
41946 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
41947 LN->getMemoryVT(), LN->getMemOperand());
41948 // If the load value is used only by N, replace it via CombineTo N.
41949 bool NoReplaceExtract = Src.hasOneUse();
41950 DCI.CombineTo(N.getNode(), BcastLd);
41951 if (NoReplaceExtract) {
41952 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
41953 DCI.recursivelyDeleteUnusedNodes(LN);
41954 } else {
41955 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcVT, BcastLd,
41956 DAG.getIntPtrConstant(0, DL));
41957 DCI.CombineTo(LN, Scl, BcastLd.getValue(1));
41958 }
41959 return N; // Return N so it doesn't get rechecked!
41960 }
41961
41962 // Due to isTypeDesirableForOp, we won't always shrink a load truncated to
41963 // i16. So shrink it ourselves if we can make a broadcast_load.
41964 if (SrcVT == MVT::i16 && Src.getOpcode() == ISD::TRUNCATE &&
41965 Src.hasOneUse() && Src.getOperand(0).hasOneUse()) {
41966 assert(Subtarget.hasAVX2() && "Expected AVX2");
41967 SDValue TruncIn = Src.getOperand(0);
41968
41969 // If this is a truncate of a non-extending load, we can just narrow it to
41970 // use a broadcast_load.
41971 if (ISD::isNormalLoad(TruncIn.getNode())) {
41972 LoadSDNode *LN = cast<LoadSDNode>(TruncIn);
41974 // Unless it's volatile or atomic.
41974 if (LN->isSimple()) {
41975 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
41976 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
41977 SDValue BcastLd = DAG.getMemIntrinsicNode(
41978 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
41979 LN->getPointerInfo(), LN->getOriginalAlign(),
41980 LN->getMemOperand()->getFlags());
41981 DCI.CombineTo(N.getNode(), BcastLd);
41982 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
41983 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
41984 return N; // Return N so it doesn't get rechecked!
41985 }
41986 }
41987
41988 // If this is a truncate of an i16 extload, we can directly replace it.
41989 if (ISD::isUNINDEXEDLoad(Src.getOperand(0).getNode()) &&
41990 ISD::isEXTLoad(Src.getOperand(0).getNode())) {
41991 LoadSDNode *LN = cast<LoadSDNode>(Src.getOperand(0));
41992 if (LN->getMemoryVT().getSizeInBits() == 16) {
41993 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
41994 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
41995 SDValue BcastLd =
41996 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
41997 LN->getMemoryVT(), LN->getMemOperand());
41998 DCI.CombineTo(N.getNode(), BcastLd);
41999 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42000 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
42001 return N; // Return N so it doesn't get rechecked!
42002 }
42003 }
42004
42005 // If this is a truncate of a load that has been shifted right, we can
42006 // offset the pointer and use a narrower load.
42007 if (TruncIn.getOpcode() == ISD::SRL &&
42008 TruncIn.getOperand(0).hasOneUse() &&
42009 isa<ConstantSDNode>(TruncIn.getOperand(1)) &&
42010 ISD::isNormalLoad(TruncIn.getOperand(0).getNode())) {
42011 LoadSDNode *LN = cast<LoadSDNode>(TruncIn.getOperand(0));
42012 unsigned ShiftAmt = TruncIn.getConstantOperandVal(1);
42013 // Make sure the shift amount and the load size are divisible by 16.
42014 // Don't do this if the load is volatile or atomic.
42015 if (ShiftAmt % 16 == 0 && TruncIn.getValueSizeInBits() % 16 == 0 &&
42016 LN->isSimple()) {
42017 unsigned Offset = ShiftAmt / 8;
42018 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42019 SDValue Ptr = DAG.getMemBasePlusOffset(LN->getBasePtr(),
42020 TypeSize::Fixed(Offset), DL);
42021 SDValue Ops[] = { LN->getChain(), Ptr };
42022 SDValue BcastLd = DAG.getMemIntrinsicNode(
42023 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
42024 LN->getPointerInfo().getWithOffset(Offset),
42025 LN->getOriginalAlign(),
42026 LN->getMemOperand()->getFlags());
42027 DCI.CombineTo(N.getNode(), BcastLd);
42028 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42029 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
42030 return N; // Return N so it doesn't get rechecked!
42031 }
42032 }
42033 }
42034
42035 // vbroadcast(vzload X) -> vbroadcast_load X
42036 if (Src.getOpcode() == X86ISD::VZEXT_LOAD && Src.hasOneUse()) {
42037 MemSDNode *LN = cast<MemIntrinsicSDNode>(Src);
42038 if (LN->getMemoryVT().getSizeInBits() == VT.getScalarSizeInBits()) {
42039 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42040 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
42041 SDValue BcastLd =
42042 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
42043 LN->getMemoryVT(), LN->getMemOperand());
42044 DCI.CombineTo(N.getNode(), BcastLd);
42045 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42046 DCI.recursivelyDeleteUnusedNodes(LN);
42047 return N; // Return N so it doesn't get rechecked!
42048 }
42049 }
42050
42051 // vbroadcast(vector load X) -> vbroadcast_load
42052 if ((SrcVT == MVT::v2f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v2i64 ||
42053 SrcVT == MVT::v4i32) &&
42054 Src.hasOneUse() && ISD::isNormalLoad(Src.getNode())) {
42055 LoadSDNode *LN = cast<LoadSDNode>(Src);
42056 // Unless the load is volatile or atomic.
42057 if (LN->isSimple()) {
42058 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42059 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
42060 SDValue BcastLd = DAG.getMemIntrinsicNode(
42061 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SrcVT.getScalarType(),
42062 LN->getPointerInfo(), LN->getOriginalAlign(),
42063 LN->getMemOperand()->getFlags());
42064 DCI.CombineTo(N.getNode(), BcastLd);
42065 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42066 DCI.recursivelyDeleteUnusedNodes(LN);
42067 return N; // Return N so it doesn't get rechecked!
42068 }
42069 }
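// A minimal sketch of the fold above, assuming a hypothetical simple v4f32
// load from a pointer %p (values invented for illustration):
//   t1: v4f32,ch = load<(load 16 from %p)> ch, %p
//   t2: v8f32    = X86ISD::VBROADCAST t1
// becomes a single broadcast-from-memory node that only touches 4 bytes:
//   t2: v8f32,ch = X86ISD::VBROADCAST_LOAD<(load 4 from %p)> ch, %p
// with the original load's chain users rewired to the new node's chain.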
42070
42071 return SDValue();
42072 }
42073 case X86ISD::VZEXT_MOVL: {
42074 SDValue N0 = N.getOperand(0);
42075
42076 // If this is a vzmovl of a full vector load, replace it with a vzload, unless
42077 // the load is volatile.
42078 if (N0.hasOneUse() && ISD::isNormalLoad(N0.getNode())) {
42079 auto *LN = cast<LoadSDNode>(N0);
42080 if (SDValue VZLoad =
42081 narrowLoadToVZLoad(LN, VT.getVectorElementType(), VT, DAG)) {
42082 DCI.CombineTo(N.getNode(), VZLoad);
42083 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
42084 DCI.recursivelyDeleteUnusedNodes(LN);
42085 return N;
42086 }
42087 }
42088
42089 // If this is a VZEXT_MOVL of a VBROADCAST_LOAD, we don't need the broadcast
42090 // and can just use a VZEXT_LOAD.
42091 // FIXME: Is there some way to do this with SimplifyDemandedVectorElts?
42092 if (N0.hasOneUse() && N0.getOpcode() == X86ISD::VBROADCAST_LOAD) {
42093 auto *LN = cast<MemSDNode>(N0);
42094 if (VT.getScalarSizeInBits() == LN->getMemoryVT().getSizeInBits()) {
42095 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42096 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
42097 SDValue VZLoad =
42098 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops,
42099 LN->getMemoryVT(), LN->getMemOperand());
42100 DCI.CombineTo(N.getNode(), VZLoad);
42101 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
42102 DCI.recursivelyDeleteUnusedNodes(LN);
42103 return N;
42104 }
42105 }
42106
42107 // Turn (v2i64 (vzext_movl (scalar_to_vector (i64 X)))) into
42108 // (v2i64 (bitcast (v4i32 (vzext_movl (scalar_to_vector (i32 (trunc X)))))))
42109 // if the upper bits of the i64 are zero.
42110 if (N0.hasOneUse() && N0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
42111 N0.getOperand(0).hasOneUse() &&
42112 N0.getOperand(0).getValueType() == MVT::i64) {
42113 SDValue In = N0.getOperand(0);
42114 APInt Mask = APInt::getHighBitsSet(64, 32);
42115 if (DAG.MaskedValueIsZero(In, Mask)) {
42116 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, In);
42117 MVT VecVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
42118 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Trunc);
42119 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, SclVec);
42120 return DAG.getBitcast(VT, Movl);
42121 }
42122 }
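// For illustration, with an invented constant: if X = 0x0000000012345678
// (upper 32 bits known zero), the original v2i64 pattern produces <X, 0>,
// which viewed as v4i32 is <0x12345678, 0, 0, 0> - exactly what the narrower
// v4i32 vzext_movl of (trunc X) yields before the final bitcast back to v2i64.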
42123
42124 // Load a scalar integer constant directly to XMM instead of transferring an
42125 // immediate value from GPR.
42126 // vzext_movl (scalar_to_vector C) --> load [C,0...]
42127 if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR) {
42128 if (auto *C = dyn_cast<ConstantSDNode>(N0.getOperand(0))) {
42129 // Create a vector constant - scalar constant followed by zeros.
42130 EVT ScalarVT = N0.getOperand(0).getValueType();
42131 Type *ScalarTy = ScalarVT.getTypeForEVT(*DAG.getContext());
42132 unsigned NumElts = VT.getVectorNumElements();
42133 Constant *Zero = ConstantInt::getNullValue(ScalarTy);
42134 SmallVector<Constant *, 32> ConstantVec(NumElts, Zero);
42135 ConstantVec[0] = const_cast<ConstantInt *>(C->getConstantIntValue());
42136
42137 // Load the vector constant from constant pool.
42138 MVT PVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
42139 SDValue CP = DAG.getConstantPool(ConstantVector::get(ConstantVec), PVT);
42140 MachinePointerInfo MPI =
42141 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
42142 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
42143 return DAG.getLoad(VT, DL, DAG.getEntryNode(), CP, MPI, Alignment,
42144 MachineMemOperand::MOLoad);
42145 }
42146 }
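// A small sketch with an invented immediate: for
//   (v4i32 (vzext_movl (scalar_to_vector (i32 42))))
// the code above materializes the constant-pool vector {42, 0, 0, 0} and
// replaces the whole pattern with one vector load of that entry, avoiding a
// GPR-to-XMM transfer of the immediate.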
42147
42148 // Pull subvector inserts into undef through VZEXT_MOVL by making it an
42149 // insert into a zero vector. This helps get VZEXT_MOVL closer to
42150 // scalar_to_vectors where 256/512 are canonicalized to an insert and a
42151 // 128-bit scalar_to_vector. This reduces the number of isel patterns.
42152 if (!DCI.isBeforeLegalizeOps() && N0.hasOneUse()) {
42153 SDValue V = peekThroughOneUseBitcasts(N0);
42154
42155 if (V.getOpcode() == ISD::INSERT_SUBVECTOR && V.getOperand(0).isUndef() &&
42156 isNullConstant(V.getOperand(2))) {
42157 SDValue In = V.getOperand(1);
42158 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
42159 In.getValueSizeInBits() /
42160 VT.getScalarSizeInBits());
42161 In = DAG.getBitcast(SubVT, In);
42162 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, SubVT, In);
42163 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
42164 getZeroVector(VT, Subtarget, DAG, DL), Movl,
42165 V.getOperand(2));
42166 }
42167 }
42168
42169 return SDValue();
42170 }
42171 case X86ISD::BLENDI: {
42172 SDValue N0 = N.getOperand(0);
42173 SDValue N1 = N.getOperand(1);
42174
42175 // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
42176 // TODO: Handle MVT::v16i16 repeated blend mask.
42177 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
42178 N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
42179 MVT SrcVT = N0.getOperand(0).getSimpleValueType();
42180 if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0 &&
42181 SrcVT.getScalarSizeInBits() >= 32) {
42182 unsigned BlendMask = N.getConstantOperandVal(2);
42183 unsigned Size = VT.getVectorNumElements();
42184 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
42185 BlendMask = scaleVectorShuffleBlendMask(BlendMask, Size, Scale);
42186 return DAG.getBitcast(
42187 VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
42188 N1.getOperand(0),
42189 DAG.getTargetConstant(BlendMask, DL, MVT::i8)));
42190 }
42191 }
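// For illustration, with invented types and mask: blending two v8f32 bitcasts
// into a v4f64 with BlendMask = 0b0101 gives Scale = 64/32 = 2, so each mask
// bit is widened to two bits (0b0101 -> 0b00110011) and the blend is done
// directly on the v8f32 operands, then bitcast back to v4f64.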
42192 return SDValue();
42193 }
42194 case X86ISD::SHUFP: {
42195 // Fold shufps(shuffle(x),shuffle(y)) -> shufps(x,y).
42196 // This is a more relaxed shuffle combiner that can ignore oneuse limits.
42197 // TODO: Support types other than v4f32.
42198 if (VT == MVT::v4f32) {
42199 bool Updated = false;
42200 SmallVector<int> Mask;
42201 SmallVector<SDValue> Ops;
42202 if (getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask) &&
42203 Ops.size() == 2) {
42204 for (int i = 0; i != 2; ++i) {
42205 SmallVector<SDValue> SubOps;
42206 SmallVector<int> SubMask, SubScaledMask;
42207 SDValue Sub = peekThroughBitcasts(Ops[i]);
42208 // TODO: Scaling might be easier if we specify the demanded elts.
42209 if (getTargetShuffleInputs(Sub, SubOps, SubMask, DAG, 0, false) &&
42210 scaleShuffleElements(SubMask, 4, SubScaledMask) &&
42211 SubOps.size() == 1 && isUndefOrInRange(SubScaledMask, 0, 4)) {
42212 int Ofs = i * 2;
42213 Mask[Ofs + 0] = SubScaledMask[Mask[Ofs + 0] % 4] + (i * 4);
42214 Mask[Ofs + 1] = SubScaledMask[Mask[Ofs + 1] % 4] + (i * 4);
42215 Ops[i] = DAG.getBitcast(VT, SubOps[0]);
42216 Updated = true;
42217 }
42218 }
42219 }
42220 if (Updated) {
42221 for (int &M : Mask)
42222 M %= 4;
42223 Ops.push_back(getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
42224 return DAG.getNode(X86ISD::SHUFP, DL, VT, Ops);
42225 }
42226 }
42227 return SDValue();
42228 }
42229 case X86ISD::VPERMI: {
42230 // vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
42231 // TODO: Remove when we have preferred domains in combineX86ShuffleChain.
42232 SDValue N0 = N.getOperand(0);
42233 SDValue N1 = N.getOperand(1);
42234 unsigned EltSizeInBits = VT.getScalarSizeInBits();
42235 if (N0.getOpcode() == ISD::BITCAST &&
42236 N0.getOperand(0).getScalarValueSizeInBits() == EltSizeInBits) {
42237 SDValue Src = N0.getOperand(0);
42238 EVT SrcVT = Src.getValueType();
42239 SDValue Res = DAG.getNode(X86ISD::VPERMI, DL, SrcVT, Src, N1);
42240 return DAG.getBitcast(VT, Res);
42241 }
42242 return SDValue();
42243 }
42244 case X86ISD::VPERM2X128: {
42245 // Fold vperm2x128(bitcast(x),bitcast(y),c) -> bitcast(vperm2x128(x,y,c)).
42246 SDValue LHS = N->getOperand(0);
42247 SDValue RHS = N->getOperand(1);
42248 if (LHS.getOpcode() == ISD::BITCAST &&
42249 (RHS.getOpcode() == ISD::BITCAST || RHS.isUndef())) {
42250 EVT SrcVT = LHS.getOperand(0).getValueType();
42251 if (RHS.isUndef() || SrcVT == RHS.getOperand(0).getValueType()) {
42252 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT,
42253 DAG.getBitcast(SrcVT, LHS),
42254 DAG.getBitcast(SrcVT, RHS),
42255 N->getOperand(2)));
42256 }
42257 }
42258
42259 // Fold vperm2x128(op(),op()) -> op(vperm2x128(),vperm2x128()).
42260 if (SDValue Res = canonicalizeLaneShuffleWithRepeatedOps(N, DAG, DL))
42261 return Res;
42262
42263 // Fold vperm2x128 subvector shuffle with an inner concat pattern.
42264 // vperm2x128(concat(X,Y),concat(Z,W)) --> concat X,Y etc.
42265 auto FindSubVector128 = [&](unsigned Idx) {
42266 if (Idx > 3)
42267 return SDValue();
42268 SDValue Src = peekThroughBitcasts(N.getOperand(Idx < 2 ? 0 : 1));
42269 SmallVector<SDValue> SubOps;
42270 if (collectConcatOps(Src.getNode(), SubOps, DAG) && SubOps.size() == 2)
42271 return SubOps[Idx & 1];
42272 unsigned NumElts = Src.getValueType().getVectorNumElements();
42273 if ((Idx & 1) == 1 && Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
42274 Src.getOperand(1).getValueSizeInBits() == 128 &&
42275 Src.getConstantOperandAPInt(2) == (NumElts / 2)) {
42276 return Src.getOperand(1);
42277 }
42278 return SDValue();
42279 };
42280 unsigned Imm = N.getConstantOperandVal(2);
42281 if (SDValue SubLo = FindSubVector128(Imm & 0x0F)) {
42282 if (SDValue SubHi = FindSubVector128((Imm & 0xF0) >> 4)) {
42283 MVT SubVT = VT.getHalfNumVectorElementsVT();
42284 SubLo = DAG.getBitcast(SubVT, SubLo);
42285 SubHi = DAG.getBitcast(SubVT, SubHi);
42286 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, SubLo, SubHi);
42287 }
42288 }
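// For illustration, with invented operands: if the first source is
// concat(X,Y), the second is concat(Z,W) and Imm = 0x31, the low nibble (1)
// selects Y and the high nibble (3) selects W, so the vperm2x128 collapses to
// concat(Y,W). Any nibble above 3 (e.g. one of the zeroing bits 3/7 set)
// makes FindSubVector128 return SDValue() and the fold is skipped.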
42289 return SDValue();
42290 }
42291 case X86ISD::PSHUFD:
42292 case X86ISD::PSHUFLW:
42293 case X86ISD::PSHUFHW: {
42294 SDValue N0 = N.getOperand(0);
42295 SDValue N1 = N.getOperand(1);
42296 if (N0->hasOneUse()) {
42297 SDValue V = peekThroughOneUseBitcasts(N0);
42298 switch (V.getOpcode()) {
42299 case X86ISD::VSHL:
42300 case X86ISD::VSRL:
42301 case X86ISD::VSRA:
42302 case X86ISD::VSHLI:
42303 case X86ISD::VSRLI:
42304 case X86ISD::VSRAI:
42305 case X86ISD::VROTLI:
42306 case X86ISD::VROTRI: {
42307 MVT InnerVT = V.getSimpleValueType();
42308 if (InnerVT.getScalarSizeInBits() <= VT.getScalarSizeInBits()) {
42309 SDValue Res = DAG.getNode(Opcode, DL, VT,
42310 DAG.getBitcast(VT, V.getOperand(0)), N1);
42311 Res = DAG.getBitcast(InnerVT, Res);
42312 Res = DAG.getNode(V.getOpcode(), DL, InnerVT, Res, V.getOperand(1));
42313 return DAG.getBitcast(VT, Res);
42314 }
42315 break;
42316 }
42317 }
42318 }
42319
42320 Mask = getPSHUFShuffleMask(N);
42321 assert(Mask.size() == 4);
42322 break;
42323 }
42324 case X86ISD::MOVSD:
42325 case X86ISD::MOVSH:
42326 case X86ISD::MOVSS: {
42327 SDValue N0 = N.getOperand(0);
42328 SDValue N1 = N.getOperand(1);
42329
42330 // Canonicalize scalar FPOps:
42331 // MOVS*(N0, OP(N0, N1)) --> MOVS*(N0, SCALAR_TO_VECTOR(OP(N0[0], N1[0])))
42332 // If commutable, allow OP(N1[0], N0[0]).
42333 unsigned Opcode1 = N1.getOpcode();
42334 if (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL || Opcode1 == ISD::FSUB ||
42335 Opcode1 == ISD::FDIV) {
42336 SDValue N10 = N1.getOperand(0);
42337 SDValue N11 = N1.getOperand(1);
42338 if (N10 == N0 ||
42339 (N11 == N0 && (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL))) {
42340 if (N10 != N0)
42341 std::swap(N10, N11);
42342 MVT SVT = VT.getVectorElementType();
42343 SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
42344 N10 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N10, ZeroIdx);
42345 N11 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N11, ZeroIdx);
42346 SDValue Scl = DAG.getNode(Opcode1, DL, SVT, N10, N11);
42347 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
42348 return DAG.getNode(Opcode, DL, VT, N0, SclVec);
42349 }
42350 }
42351
42352 return SDValue();
42353 }
42354 case X86ISD::INSERTPS: {
42355 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
42356 SDValue Op0 = N.getOperand(0);
42357 SDValue Op1 = N.getOperand(1);
42358 unsigned InsertPSMask = N.getConstantOperandVal(2);
42359 unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
42360 unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
42361 unsigned ZeroMask = InsertPSMask & 0xF;
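// For illustration, with a hypothetical InsertPSMask = 0xB2 (0b10'11'0010):
// SrcIdx = 2 (element taken from Op1), DstIdx = 3 (written into lane 3 of
// the result) and ZeroMask = 0x2 (lane 1 of the result is zeroed).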
42362
42363 // If we zero out all elements from Op0 then we don't need to reference it.
42364 if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
42365 return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
42366 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42367
42368 // If we zero out the element from Op1 then we don't need to reference it.
42369 if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
42370 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
42371 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42372
42373 // Attempt to merge insertps Op1 with an inner target shuffle node.
42374 SmallVector<int, 8> TargetMask1;
42375 SmallVector<SDValue, 2> Ops1;
42376 APInt KnownUndef1, KnownZero1;
42377 if (getTargetShuffleAndZeroables(Op1, TargetMask1, Ops1, KnownUndef1,
42378 KnownZero1)) {
42379 if (KnownUndef1[SrcIdx] || KnownZero1[SrcIdx]) {
42380 // Zero/UNDEF insertion - zero out element and remove dependency.
42381 InsertPSMask |= (1u << DstIdx);
42382 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
42383 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42384 }
42385 // Update insertps mask srcidx and reference the source input directly.
42386 int M = TargetMask1[SrcIdx];
42387 assert(0 <= M && M < 8 && "Shuffle index out of range");
42388 InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
42389 Op1 = Ops1[M < 4 ? 0 : 1];
42390 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
42391 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42392 }
42393
42394 // Attempt to merge insertps Op0 with an inner target shuffle node.
42395 SmallVector<int, 8> TargetMask0;
42396 SmallVector<SDValue, 2> Ops0;
42397 APInt KnownUndef0, KnownZero0;
42398 if (getTargetShuffleAndZeroables(Op0, TargetMask0, Ops0, KnownUndef0,
42399 KnownZero0)) {
42400 bool Updated = false;
42401 bool UseInput00 = false;
42402 bool UseInput01 = false;
42403 for (int i = 0; i != 4; ++i) {
42404 if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
42405 // No change if element is already zero or the inserted element.
42406 continue;
42407 }
42408
42409 if (KnownUndef0[i] || KnownZero0[i]) {
42410 // If the target mask is undef/zero then we must zero the element.
42411 InsertPSMask |= (1u << i);
42412 Updated = true;
42413 continue;
42414 }
42415
42416 // The input vector element must be inline.
42417 int M = TargetMask0[i];
42418 if (M != i && M != (i + 4))
42419 return SDValue();
42420
42421 // Determine which inputs of the target shuffle we're using.
42422 UseInput00 |= (0 <= M && M < 4);
42423 UseInput01 |= (4 <= M);
42424 }
42425
42426 // If we're not using both inputs of the target shuffle then use the
42427 // referenced input directly.
42428 if (UseInput00 && !UseInput01) {
42429 Updated = true;
42430 Op0 = Ops0[0];
42431 } else if (!UseInput00 && UseInput01) {
42432 Updated = true;
42433 Op0 = Ops0[1];
42434 }
42435
42436 if (Updated)
42437 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
42438 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42439 }
42440
42441 // If we're inserting an element from a vbroadcast load, fold the
42442 // load into the X86insertps instruction. We need to convert the scalar
42443 // load to a vector and clear the source lane of the INSERTPS control.
42444 if (Op1.getOpcode() == X86ISD::VBROADCAST_LOAD && Op1.hasOneUse()) {
42445 auto *MemIntr = cast<MemIntrinsicSDNode>(Op1);
42446 if (MemIntr->getMemoryVT().getScalarSizeInBits() == 32) {
42447 SDValue Load = DAG.getLoad(MVT::f32, DL, MemIntr->getChain(),
42448 MemIntr->getBasePtr(),
42449 MemIntr->getMemOperand());
42450 SDValue Insert = DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0,
42451 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT,
42452 Load),
42453 DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8));
42454 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
42455 return Insert;
42456 }
42457 }
42458
42459 return SDValue();
42460 }
42461 default:
42462 return SDValue();
42463 }
42464
42465 // Nuke no-op shuffles that show up after combining.
42466 if (isNoopShuffleMask(Mask))
42467 return N.getOperand(0);
42468
42469 // Look for simplifications involving one or two shuffle instructions.
42470 SDValue V = N.getOperand(0);
42471 switch (N.getOpcode()) {
42472 default:
42473 break;
42474 case X86ISD::PSHUFLW:
42475 case X86ISD::PSHUFHW:
42476 assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
42477
42478 // See if this reduces to a PSHUFD which is no more expensive and can
42479 // combine with more operations. Note that it has to at least flip the
42480 // dwords as otherwise it would have been removed as a no-op.
42481 if (ArrayRef(Mask).equals({2, 3, 0, 1})) {
42482 int DMask[] = {0, 1, 2, 3};
42483 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
42484 DMask[DOffset + 0] = DOffset + 1;
42485 DMask[DOffset + 1] = DOffset + 0;
42486 MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
42487 V = DAG.getBitcast(DVT, V);
42488 V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
42489 getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
42490 return DAG.getBitcast(VT, V);
42491 }
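// Worked example: a PSHUFLW with word mask {2,3,0,1} swaps the two 32-bit
// halves of the low 64 bits, which is exactly a PSHUFD with dword mask
// {1,0,2,3} on the bitcast v4i32 value; for PSHUFHW the dword mask becomes
// {0,1,3,2} instead.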
42492
42493 // Look for shuffle patterns which can be implemented as a single unpack.
42494 // FIXME: This doesn't handle the location of the PSHUFD generically, and
42495 // only works when we have a PSHUFD followed by two half-shuffles.
42496 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
42497 (V.getOpcode() == X86ISD::PSHUFLW ||
42498 V.getOpcode() == X86ISD::PSHUFHW) &&
42499 V.getOpcode() != N.getOpcode() &&
42500 V.hasOneUse() && V.getOperand(0).hasOneUse()) {
42501 SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
42502 if (D.getOpcode() == X86ISD::PSHUFD) {
42503 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
42504 SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
42505 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
42506 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
42507 int WordMask[8];
42508 for (int i = 0; i < 4; ++i) {
42509 WordMask[i + NOffset] = Mask[i] + NOffset;
42510 WordMask[i + VOffset] = VMask[i] + VOffset;
42511 }
42512 // Map the word mask through the DWord mask.
42513 int MappedMask[8];
42514 for (int i = 0; i < 8; ++i)
42515 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
42516 if (ArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
42517 ArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
42518 // We can replace all three shuffles with an unpack.
42519 V = DAG.getBitcast(VT, D.getOperand(0));
42520 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
42521 : X86ISD::UNPCKH,
42522 DL, VT, V, V);
42523 }
42524 }
42525 }
42526
42527 break;
42528
42529 case X86ISD::PSHUFD:
42530 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))
42531 return NewN;
42532
42533 break;
42534 }
42535
42536 return SDValue();
42537}
42538
42539/// Checks if the shuffle mask takes subsequent elements
42540/// alternately from two vectors.
42541/// For example <0, 5, 2, 7> or <8, 1, 10, 3, 12, 5, 14, 7> are both correct.
42542static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) {
42543
42544 int ParitySrc[2] = {-1, -1};
42545 unsigned Size = Mask.size();
42546 for (unsigned i = 0; i != Size; ++i) {
42547 int M = Mask[i];
42548 if (M < 0)
42549 continue;
42550
42551 // Make sure we are using the matching element from the input.
42552 if ((M % Size) != i)
42553 return false;
42554
42555 // Make sure we use the same input for all elements of the same parity.
42556 int Src = M / Size;
42557 if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
42558 return false;
42559 ParitySrc[i % 2] = Src;
42560 }
42561
42562 // Make sure each input is used.
42563 if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
42564 return false;
42565
42566 Op0Even = ParitySrc[0] == 0;
42567 return true;
42568}
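// Worked example (masks invented for illustration): for Mask = <0, 5, 2, 7>
// with Size = 4, every element satisfies (M % Size) == i, both even positions
// read source 0 and both odd positions read source 1, so this returns true
// with Op0Even = true. A mask such as <4, 1, 2, 7> fails because positions 0
// and 2 would come from different sources.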
42569
42570 /// Returns true iff the shuffle node \p N can be replaced with an ADDSUB(SUBADD)
42571 /// operation. If true is returned then the operands of the ADDSUB(SUBADD) operation
42572 /// are written to the parameters \p Opnd0 and \p Opnd1.
42573 ///
42574 /// We combine shuffles to ADDSUB(SUBADD) directly on the abstract vector shuffle nodes
42575 /// so they are easier to match generically. We also insert dummy vector shuffle
42576 /// nodes for the operands which explicitly discard the lanes that are unused
42577 /// by this operation, so that the fact that they're unused can flow through
42578 /// the rest of the combiner.
42579static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
42580 SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1,
42581 bool &IsSubAdd) {
42582
42583 EVT VT = N->getValueType(0);
42584 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42585 if (!Subtarget.hasSSE3() || !TLI.isTypeLegal(VT) ||
42586 !VT.getSimpleVT().isFloatingPoint())
42587 return false;
42588
42589 // We only handle target-independent shuffles.
42590 // FIXME: It would be easy and harmless to use the target shuffle mask
42591 // extraction tool to support more.
42592 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
42593 return false;
42594
42595 SDValue V1 = N->getOperand(0);
42596 SDValue V2 = N->getOperand(1);
42597
42598 // Make sure we have an FADD and an FSUB.
42599 if ((V1.getOpcode() != ISD::FADD && V1.getOpcode() != ISD::FSUB) ||
42600 (V2.getOpcode() != ISD::FADD && V2.getOpcode() != ISD::FSUB) ||
42601 V1.getOpcode() == V2.getOpcode())
42602 return false;
42603
42604 // If there are other uses of these operations we can't fold them.
42605 if (!V1->hasOneUse() || !V2->hasOneUse())
42606 return false;
42607
42608 // Ensure that both operations have the same operands. Note that we can
42609 // commute the FADD operands.
42610 SDValue LHS, RHS;
42611 if (V1.getOpcode() == ISD::FSUB) {
42612 LHS = V1->getOperand(0); RHS = V1->getOperand(1);
42613 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
42614 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
42615 return false;
42616 } else {
42617 assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode");
42618 LHS = V2->getOperand(0); RHS = V2->getOperand(1);
42619 if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) &&
42620 (V1->getOperand(0) != RHS || V1->getOperand(1) != LHS))
42621 return false;
42622 }
42623
42624 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
42625 bool Op0Even;
42626 if (!isAddSubOrSubAddMask(Mask, Op0Even))
42627 return false;
42628
42629 // It's a subadd if the vector in the even parity is an FADD.
42630 IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD
42631 : V2->getOpcode() == ISD::FADD;
42632
42633 Opnd0 = LHS;
42634 Opnd1 = RHS;
42635 return true;
42636}
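// A minimal sketch with hypothetical nodes:
//   V1 = fsub a, b
//   V2 = fadd a, b
//   N  = shuffle V1, V2, <0, 5, 2, 7>
// puts the FSUB in the even lanes and the FADD in the odd lanes, so the
// helper reports Opnd0 = a, Opnd1 = b and IsSubAdd = false - the classic
// ADDSUB pattern (subtract in even lanes, add in odd lanes).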
42637
42638/// Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
42639static SDValue combineShuffleToFMAddSub(SDNode *N,
42640 const X86Subtarget &Subtarget,
42641 SelectionDAG &DAG) {
42642 // We only handle target-independent shuffles.
42643 // FIXME: It would be easy and harmless to use the target shuffle mask
42644 // extraction tool to support more.
42645 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
42646 return SDValue();
42647
42648 MVT VT = N->getSimpleValueType(0);
42649 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42650 if (!Subtarget.hasAnyFMA() || !TLI.isTypeLegal(VT))
42651 return SDValue();
42652
42653 // We're trying to match (shuffle fma(a, b, c), X86Fmsub(a, b, c)).
42654 SDValue Op0 = N->getOperand(0);
42655 SDValue Op1 = N->getOperand(1);
42656 SDValue FMAdd = Op0, FMSub = Op1;
42657 if (FMSub.getOpcode() != X86ISD::FMSUB)
42658 std::swap(FMAdd, FMSub);
42659
42660 if (FMAdd.getOpcode() != ISD::FMA || FMSub.getOpcode() != X86ISD::FMSUB ||
42661 FMAdd.getOperand(0) != FMSub.getOperand(0) || !FMAdd.hasOneUse() ||
42662 FMAdd.getOperand(1) != FMSub.getOperand(1) || !FMSub.hasOneUse() ||
42663 FMAdd.getOperand(2) != FMSub.getOperand(2))
42664 return SDValue();
42665
42666 // Check for correct shuffle mask.
42667 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
42668 bool Op0Even;
42669 if (!isAddSubOrSubAddMask(Mask, Op0Even))
42670 return SDValue();
42671
42672 // FMAddSub takes zeroth operand from FMSub node.
42673 SDLoc DL(N);
42674 bool IsSubAdd = Op0Even ? Op0 == FMAdd : Op1 == FMAdd;
42675 unsigned Opcode = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
42676 return DAG.getNode(Opcode, DL, VT, FMAdd.getOperand(0), FMAdd.getOperand(1),
42677 FMAdd.getOperand(2));
42678}
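// A minimal sketch with hypothetical nodes:
//   F0 = fma a, b, c            (a*b + c)
//   F1 = X86ISD::FMSUB a, b, c  (a*b - c)
//   N  = shuffle F0, F1, <4, 1, 6, 3>
// takes the FMSUB result in the even lanes and the FMA result in the odd
// lanes, so the shuffle folds to X86ISD::FMADDSUB a, b, c.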
42679
42680/// Try to combine a shuffle into a target-specific add-sub or
42681/// mul-add-sub node.
42682static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
42683 const X86Subtarget &Subtarget,
42684 SelectionDAG &DAG) {
42685 if (SDValue V = combineShuffleToFMAddSub(N, Subtarget, DAG))
42686 return V;
42687
42688 SDValue Opnd0, Opnd1;
42689 bool IsSubAdd;
42690 if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd))
42691 return SDValue();
42692
42693 MVT VT = N->getSimpleValueType(0);
42694 SDLoc DL(N);
42695
42696 // Try to generate X86ISD::FMADDSUB node here.
42697 SDValue Opnd2;
42698 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2)) {
42699 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
42700 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
42701 }
42702
42703 if (IsSubAdd)
42704 return SDValue();
42705
42706 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
42707 // the ADDSUB idiom has been successfully recognized. There are no known
42708 // X86 targets with 512-bit ADDSUB instructions!
42709 if (VT.is512BitVector())
42710 return SDValue();
42711
42712 // Do not generate X86ISD::ADDSUB node for FP16's vector types even though
42713 // the ADDSUB idiom has been successfully recognized. There are no known
42714 // X86 targets with FP16 ADDSUB instructions!
42715 if (VT.getVectorElementType() == MVT::f16)
42716 return SDValue();
42717
42718 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
42719}
42720
42721// We are looking for a shuffle where both sources are concatenated with undef
42722// and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
42723// if we can express this as a single-source shuffle, that's preferable.
42724static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
42725 const X86Subtarget &Subtarget) {
42726 if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
42727 return SDValue();
42728
42729 EVT VT = N->getValueType(0);
42730
42731 // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
42732 if (!VT.is128BitVector() && !VT.is256BitVector())
42733 return SDValue();
42734
42735 if (VT.getVectorElementType() != MVT::i32 &&
42736 VT.getVectorElementType() != MVT::i64 &&
42737 VT.getVectorElementType() != MVT::f32 &&
42738 VT.getVectorElementType() != MVT::f64)
42739 return SDValue();
42740
42741 SDValue N0 = N->getOperand(0);
42742 SDValue N1 = N->getOperand(1);
42743
42744 // Check that both sources are concats with undef.
42745 if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
42746 N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
42747 N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
42748 !N1.getOperand(1).isUndef())
42749 return SDValue();
42750
42751 // Construct the new shuffle mask. Elements from the first source retain their
42752 // index, but elements from the second source no longer need to skip an undef.
42753 SmallVector<int, 8> Mask;
42754 int NumElts = VT.getVectorNumElements();
42755
42756 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
42757 for (int Elt : SVOp->getMask())
42758 Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
42759
42760 SDLoc DL(N);
42761 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
42762 N1.getOperand(0));
42763 return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
42764}
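// For illustration, with invented values: for a v8i32 output where t1/t2 are
// v4i32 and the original mask is <0,1,8,9,2,3,10,11>, indices >= 8 referred
// to t2 inside concat(t2, undef); after building concat(t1, t2) they shift
// down by NumElts/2 = 4, giving the single-source mask <0,1,4,5,2,3,6,7>,
// which AVX2 can lower with a single VPERMD.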
42765
42766/// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the
42767/// low half of each source vector and does not set any high half elements in
42768/// the destination vector, narrow the shuffle to half its original size.
42769static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG) {
42770 if (!Shuf->getValueType(0).isSimple())
42771 return SDValue();
42772 MVT VT = Shuf->getSimpleValueType(0);
42773 if (!VT.is256BitVector() && !VT.is512BitVector())
42774 return SDValue();
42775
42776 // See if we can ignore all of the high elements of the shuffle.
42777 ArrayRef<int> Mask = Shuf->getMask();
42778 if (!isUndefUpperHalf(Mask))
42779 return SDValue();
42780
42781 // Check if the shuffle mask accesses only the low half of each input vector
42782 // (half-index output is 0 or 2).
42783 int HalfIdx1, HalfIdx2;
42784 SmallVector<int, 8> HalfMask(Mask.size() / 2);
42785 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2) ||
42786 (HalfIdx1 % 2 == 1) || (HalfIdx2 % 2 == 1))
42787 return SDValue();
42788
42789 // Create a half-width shuffle to replace the unnecessarily wide shuffle.
42790 // The trick is knowing that all of the insert/extract are actually free
42791 // subregister (zmm<->ymm or ymm<->xmm) ops. That leaves us with a shuffle
42792 // of narrow inputs into a narrow output, and that is always cheaper than
42793 // the wide shuffle that we started with.
42794 return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0),
42795 Shuf->getOperand(1), HalfMask, HalfIdx1,
42796 HalfIdx2, false, DAG, /*UseConcat*/true);
42797}
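// A sketch with an invented mask: a v8f32 shuffle with mask
// <0, 9, 1, 8, -1, -1, -1, -1> reads only the low v4f32 half of each source
// and leaves the upper half of the result undef, so it can be rebuilt as a
// v4f32 shuffle <0, 5, 1, 4> of the two extracted low halves, concatenated
// with undef to restore the v8f32 type.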
42798
42799static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
42800 TargetLowering::DAGCombinerInfo &DCI,
42801 const X86Subtarget &Subtarget) {
42802 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N))
42803 if (SDValue V = narrowShuffle(Shuf, DAG))
42804 return V;
42805
42806 // If we have legalized the vector types, look for blends of FADD and FSUB
42807 // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
42808 SDLoc dl(N);
42809 EVT VT = N->getValueType(0);
42810 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42811 if (TLI.isTypeLegal(VT))
42812 if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
42813 return AddSub;
42814
42815 // Attempt to combine into a vector load/broadcast.
42816 if (SDValue LD = combineToConsecutiveLoads(
42817 VT, SDValue(N, 0), dl, DAG, Subtarget, /*IsAfterLegalize*/ true))
42818 return LD;
42819
42820 // For AVX2, we sometimes want to combine
42821 // (vector_shuffle <mask> (concat_vectors t1, undef)
42822 // (concat_vectors t2, undef))
42823 // Into:
42824 // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
42825 // Since the latter can be efficiently lowered with VPERMD/VPERMQ
42826 if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
42827 return ShufConcat;
42828
42829 if (isTargetShuffle(N->getOpcode())) {
42830 SDValue Op(N, 0);
42831 if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
42832 return Shuffle;
42833
42834 // Try recursively combining arbitrary sequences of x86 shuffle
42835 // instructions into higher-order shuffles. We do this after combining
42836 // specific PSHUF instruction sequences into their minimal form so that we
42837 // can evaluate how many specialized shuffle instructions are involved in
42838 // a particular chain.
42839 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
42840 return Res;
42841
42842 // Simplify source operands based on shuffle mask.
42843 // TODO - merge this into combineX86ShufflesRecursively.
42844 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
42845 if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, DCI))
42846 return SDValue(N, 0);
42847
42848 // Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
42849 // Perform this after other shuffle combines to allow inner shuffles to be
42850 // combined away first.
42851 if (SDValue BinOp = canonicalizeShuffleWithBinOps(Op, DAG, dl))
42852 return BinOp;
42853 }
42854
42855 return SDValue();
42856}
42857
42858// Simplify variable target shuffle masks based on the demanded elements.
42859// TODO: Handle DemandedBits in mask indices as well?
42860bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetShuffle(
42861 SDValue Op, const APInt &DemandedElts, unsigned MaskIndex,
42862 TargetLowering::TargetLoweringOpt &TLO, unsigned Depth) const {
42863 // If we're demanding all elements don't bother trying to simplify the mask.
42864 unsigned NumElts = DemandedElts.getBitWidth();
42865 if (DemandedElts.isAllOnes())
42866 return false;
42867
42868 SDValue Mask = Op.getOperand(MaskIndex);
42869 if (!Mask.hasOneUse())
42870 return false;
42871
42872 // Attempt to generically simplify the variable shuffle mask.
42873 APInt MaskUndef, MaskZero;
42874 if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
42875 Depth + 1))
42876 return true;
42877
42878 // Attempt to extract+simplify a (constant pool load) shuffle mask.
42879 // TODO: Support other types from getTargetShuffleMaskIndices?
42880 SDValue BC = peekThroughOneUseBitcasts(Mask);
42881 EVT BCVT = BC.getValueType();
42882 auto *Load = dyn_cast<LoadSDNode>(BC);
42883 if (!Load)
42884 return false;
42885
42886 const Constant *C = getTargetConstantFromNode(Load);
42887 if (!C)
42888 return false;
42889
42890 Type *CTy = C->getType();
42891 if (!CTy->isVectorTy() ||
42892 CTy->getPrimitiveSizeInBits() != Mask.getValueSizeInBits())
42893 return false;
42894
42895 // Handle scaling for i64 elements on 32-bit targets.
42896 unsigned NumCstElts = cast<FixedVectorType>(CTy)->getNumElements();
42897 if (NumCstElts != NumElts && NumCstElts != (NumElts * 2))
42898 return false;
42899 unsigned Scale = NumCstElts / NumElts;
42900
42901 // Simplify mask if we have an undemanded element that is not undef.
42902 bool Simplified = false;
42903 SmallVector<Constant *, 32> ConstVecOps;
42904 for (unsigned i = 0; i != NumCstElts; ++i) {
42905 Constant *Elt = C->getAggregateElement(i);
42906 if (!DemandedElts[i / Scale] && !isa<UndefValue>(Elt)) {
42907 ConstVecOps.push_back(UndefValue::get(Elt->getType()));
42908 Simplified = true;
42909 continue;
42910 }
42911 ConstVecOps.push_back(Elt);
42912 }
42913 if (!Simplified)
42914 return false;
42915
42916 // Generate new constant pool entry + legalize immediately for the load.
42917 SDLoc DL(Op);
42918 SDValue CV = TLO.DAG.getConstantPool(ConstantVector::get(ConstVecOps), BCVT);
42919 SDValue LegalCV = LowerConstantPool(CV, TLO.DAG);
42920 SDValue NewMask = TLO.DAG.getLoad(
42921 BCVT, DL, TLO.DAG.getEntryNode(), LegalCV,
42922 MachinePointerInfo::getConstantPool(TLO.DAG.getMachineFunction()),
42923 Load->getAlign());
42924 return TLO.CombineTo(Mask, TLO.DAG.getBitcast(Mask.getValueType(), NewMask));
42925}
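// A sketch with an invented mask constant: for a v16i8 PSHUFB whose control
// vector is a one-use constant-pool load of <0, 1, ..., 15>, if only the low
// eight result bytes are demanded, the upper eight mask bytes are rewritten
// to undef and a fresh constant-pool load of the simplified mask is
// substituted, which later combines can exploit.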
42926
42927bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
42928 SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
42929 TargetLoweringOpt &TLO, unsigned Depth) const {
42930 int NumElts = DemandedElts.getBitWidth();
42931 unsigned Opc = Op.getOpcode();
42932 EVT VT = Op.getValueType();
42933
42934 // Handle special case opcodes.
42935 switch (Opc) {
42936 case X86ISD::PMULDQ:
42937 case X86ISD::PMULUDQ: {
42938 APInt LHSUndef, LHSZero;
42939 APInt RHSUndef, RHSZero;
42940 SDValue LHS = Op.getOperand(0);
42941 SDValue RHS = Op.getOperand(1);
42942 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
42943 Depth + 1))
42944 return true;
42945 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
42946 Depth + 1))
42947 return true;
42948 // Multiply by zero.
42949 KnownZero = LHSZero | RHSZero;
42950 break;
42951 }
42952 case X86ISD::VPMADDWD: {
42953 APInt LHSUndef, LHSZero;
42954 APInt RHSUndef, RHSZero;
42955 SDValue LHS = Op.getOperand(0);
42956 SDValue RHS = Op.getOperand(1);
42957 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, 2 * NumElts);
42958
42959 if (SimplifyDemandedVectorElts(LHS, DemandedSrcElts, LHSUndef, LHSZero, TLO,
42960 Depth + 1))
42961 return true;
42962 if (SimplifyDemandedVectorElts(RHS, DemandedSrcElts, RHSUndef, RHSZero, TLO,
42963 Depth + 1))
42964 return true;
42965
42966 // TODO: Multiply by zero.
42967
42968 // If RHS/LHS elements are known zero then we don't need the LHS/RHS equivalent.
42969 APInt DemandedLHSElts = DemandedSrcElts & ~RHSZero;
42970 if (SimplifyDemandedVectorElts(LHS, DemandedLHSElts, LHSUndef, LHSZero, TLO,
42971 Depth + 1))
42972 return true;
42973 APInt DemandedRHSElts = DemandedSrcElts & ~LHSZero;
42974 if (SimplifyDemandedVectorElts(RHS, DemandedRHSElts, RHSUndef, RHSZero, TLO,
42975 Depth + 1))
42976 return true;
42977 break;
42978 }
42979 case X86ISD::PSADBW: {
42980 SDValue LHS = Op.getOperand(0);
42981 SDValue RHS = Op.getOperand(1);
42982 assert(VT.getScalarType() == MVT::i64 &&
42983 LHS.getValueType() == RHS.getValueType() &&
42984 LHS.getValueType().getScalarType() == MVT::i8 &&
42985 "Unexpected PSADBW types");
42986
42987 // Aggressively peek through ops to get at the demanded elts.
42988 if (!DemandedElts.isAllOnes()) {
42989 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
42990 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
42991 SDValue NewLHS = SimplifyMultipleUseDemandedVectorElts(
42992 LHS, DemandedSrcElts, TLO.DAG, Depth + 1);
42993 SDValue NewRHS = SimplifyMultipleUseDemandedVectorElts(
42994 RHS, DemandedSrcElts, TLO.DAG, Depth + 1);
42995 if (NewLHS || NewRHS) {
42996 NewLHS = NewLHS ? NewLHS : LHS;
42997 NewRHS = NewRHS ? NewRHS : RHS;
42998 return TLO.CombineTo(
42999 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
43000 }
43001 }
43002 break;
43003 }
43004 case X86ISD::VSHL:
43005 case X86ISD::VSRL:
43006 case X86ISD::VSRA: {
43007 // We only need the bottom 64-bits of the (128-bit) shift amount.
43008 SDValue Amt = Op.getOperand(1);
43009 MVT AmtVT = Amt.getSimpleValueType();
43010 assert(AmtVT.is128BitVector() && "Unexpected value type");
43011
43012 // If the shift amount is only ever used as an SSE shift amount then we know
43013 // that only the bottom 64-bits are ever used.
43014 bool AssumeSingleUse = llvm::all_of(Amt->uses(), [&Amt](SDNode *Use) {
43015 unsigned UseOpc = Use->getOpcode();
43016 return (UseOpc == X86ISD::VSHL || UseOpc == X86ISD::VSRL ||
43017 UseOpc == X86ISD::VSRA) &&
43018 Use->getOperand(0) != Amt;
43019 });
43020
43021 APInt AmtUndef, AmtZero;
43022 unsigned NumAmtElts = AmtVT.getVectorNumElements();
43023 APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2);
43024 if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO,
43025 Depth + 1, AssumeSingleUse))
43026 return true;
43027 [[fallthrough]];
43028 }
43029 case X86ISD::VSHLI:
43030 case X86ISD::VSRLI:
43031 case X86ISD::VSRAI: {
43032 SDValue Src = Op.getOperand(0);
43033 APInt SrcUndef;
43034 if (SimplifyDemandedVectorElts(Src, DemandedElts, SrcUndef, KnownZero, TLO,
43035 Depth + 1))
43036 return true;
43037
43038 // Fold shift(0,x) -> 0
43039 if (DemandedElts.isSubsetOf(KnownZero))
43040 return TLO.CombineTo(
43041 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
43042
43043 // Aggressively peek through ops to get at the demanded elts.
43044 if (!DemandedElts.isAllOnes())
43045 if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
43046 Src, DemandedElts, TLO.DAG, Depth + 1))
43047 return TLO.CombineTo(
43048 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc, Op.getOperand(1)));
43049 break;
43050 }
43051 case X86ISD::VPSHA:
43052 case X86ISD::VPSHL:
43053 case X86ISD::VSHLV:
43054 case X86ISD::VSRLV:
43055 case X86ISD::VSRAV: {
43056 APInt LHSUndef, LHSZero;
43057 APInt RHSUndef, RHSZero;
43058 SDValue LHS = Op.getOperand(0);
43059 SDValue RHS = Op.getOperand(1);
43060 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
43061 Depth + 1))
43062 return true;
43063
43064 // Fold shift(0,x) -> 0
43065 if (DemandedElts.isSubsetOf(LHSZero))
43066 return TLO.CombineTo(
43067 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
43068
43069 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
43070 Depth + 1))
43071 return true;
43072
43073 KnownZero = LHSZero;
43074 break;
43075 }
43076 case X86ISD::KSHIFTL: {
43077 SDValue Src = Op.getOperand(0);
43078 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
43079 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
43080 unsigned ShiftAmt = Amt->getZExtValue();
43081
43082 if (ShiftAmt == 0)
43083 return TLO.CombineTo(Op, Src);
43084
43085 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
43086 // single shift. We can do this if the bottom bits (which are shifted
43087 // out) are never demanded.
43088 if (Src.getOpcode() == X86ISD::KSHIFTR) {
43089 if (!DemandedElts.intersects(APInt::getLowBitsSet(NumElts, ShiftAmt))) {
43090 unsigned C1 = Src.getConstantOperandVal(1);
43091 unsigned NewOpc = X86ISD::KSHIFTL;
43092 int Diff = ShiftAmt - C1;
43093 if (Diff < 0) {
43094 Diff = -Diff;
43095 NewOpc = X86ISD::KSHIFTR;
43096 }
43097
43098 SDLoc dl(Op);
43099 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
43100 return TLO.CombineTo(
43101 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
43102 }
43103 }
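// Worked example with invented amounts: for kshiftl (kshiftr X, 3), 2 where
// the two low result elements are not demanded, Diff = 2 - 3 = -1, so the
// pair collapses to a single kshiftr X, 1; the elements the left shift would
// have zeroed are exactly the ones nobody reads.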
43104
43105 APInt DemandedSrc = DemandedElts.lshr(ShiftAmt);
43106 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
43107 Depth + 1))
43108 return true;
43109
43110 KnownUndef <<= ShiftAmt;
43111 KnownZero <<= ShiftAmt;
43112 KnownZero.setLowBits(ShiftAmt);
43113 break;
43114 }
43115 case X86ISD::KSHIFTR: {
43116 SDValue Src = Op.getOperand(0);
43117 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
43118 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
43119 unsigned ShiftAmt = Amt->getZExtValue();
43120
43121 if (ShiftAmt == 0)
43122 return TLO.CombineTo(Op, Src);
43123
43124 // If this is ((X << C1) >>u ShAmt), see if we can simplify this into a
43125 // single shift. We can do this if the top bits (which are shifted
43126 // out) are never demanded.
43127 if (Src.getOpcode() == X86ISD::KSHIFTL) {
43128 if (!DemandedElts.intersects(APInt::getHighBitsSet(NumElts, ShiftAmt))) {
43129 unsigned C1 = Src.getConstantOperandVal(1);
43130 unsigned NewOpc = X86ISD::KSHIFTR;
43131 int Diff = ShiftAmt - C1;
43132 if (Diff < 0) {
43133 Diff = -Diff;
43134 NewOpc = X86ISD::KSHIFTL;
43135 }
43136
43137 SDLoc dl(Op);
43138 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
43139 return TLO.CombineTo(
43140 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
43141 }
43142 }
43143
43144 APInt DemandedSrc = DemandedElts.shl(ShiftAmt);
43145 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
43146 Depth + 1))
43147 return true;
43148
43149 KnownUndef.lshrInPlace(ShiftAmt);
43150 KnownZero.lshrInPlace(ShiftAmt);
43151 KnownZero.setHighBits(ShiftAmt);
43152 break;
43153 }
43154 case X86ISD::ANDNP: {
43155 // ANDNP = (~LHS & RHS);
43156 SDValue LHS = Op.getOperand(0);
43157 SDValue RHS = Op.getOperand(1);
43158
43159 auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
43160 APInt UndefElts;
43161 SmallVector<APInt> EltBits;
43162 int NumElts = VT.getVectorNumElements();
43163 int EltSizeInBits = VT.getScalarSizeInBits();
43164 APInt OpBits = APInt::getAllOnes(EltSizeInBits);
43165 APInt OpElts = DemandedElts;
43166 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
43167 EltBits)) {
43168 OpBits.clearAllBits();
43169 OpElts.clearAllBits();
43170 for (int I = 0; I != NumElts; ++I) {
43171 if (!DemandedElts[I])
43172 continue;
43173 if (UndefElts[I]) {
43174 // We can't assume an undef src element gives an undef dst - the
43175 // other src might be zero.
43176 OpBits.setAllBits();
43177 OpElts.setBit(I);
43178 } else if ((Invert && !EltBits[I].isAllOnes()) ||
43179 (!Invert && !EltBits[I].isZero())) {
43180 OpBits |= Invert ? ~EltBits[I] : EltBits[I];
43181 OpElts.setBit(I);
43182 }
43183 }
43184 }
43185 return std::make_pair(OpBits, OpElts);
43186 };
43187 APInt BitsLHS, EltsLHS;
43188 APInt BitsRHS, EltsRHS;
43189 std::tie(BitsLHS, EltsLHS) = GetDemandedMasks(RHS);
43190 std::tie(BitsRHS, EltsRHS) = GetDemandedMasks(LHS, true);
43191
43192 APInt LHSUndef, LHSZero;
43193 APInt RHSUndef, RHSZero;
43194 if (SimplifyDemandedVectorElts(LHS, EltsLHS, LHSUndef, LHSZero, TLO,
43195 Depth + 1))
43196 return true;
43197 if (SimplifyDemandedVectorElts(RHS, EltsRHS, RHSUndef, RHSZero, TLO,
43198 Depth + 1))
43199 return true;
43200
43201 if (!DemandedElts.isAllOnes()) {
43202 SDValue NewLHS = SimplifyMultipleUseDemandedBits(LHS, BitsLHS, EltsLHS,
43203 TLO.DAG, Depth + 1);
43204 SDValue NewRHS = SimplifyMultipleUseDemandedBits(RHS, BitsRHS, EltsRHS,
43205 TLO.DAG, Depth + 1);
43206 if (NewLHS || NewRHS) {
43207 NewLHS = NewLHS ? NewLHS : LHS;
43208 NewRHS = NewRHS ? NewRHS : RHS;
43209 return TLO.CombineTo(
43210 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
43211 }
43212 }
43213 break;
43214 }
43215 case X86ISD::CVTSI2P:
43216 case X86ISD::CVTUI2P: {
43217 SDValue Src = Op.getOperand(0);
43218 MVT SrcVT = Src.getSimpleValueType();
43219 APInt SrcUndef, SrcZero;
43220 APInt SrcElts = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
43221 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
43222 Depth + 1))
43223 return true;
43224 break;
43225 }
43226 case X86ISD::PACKSS:
43227 case X86ISD::PACKUS: {
43228 SDValue N0 = Op.getOperand(0);
43229 SDValue N1 = Op.getOperand(1);
43230
43231 APInt DemandedLHS, DemandedRHS;
43232 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
43233
43234 APInt LHSUndef, LHSZero;
43235 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
43236 Depth + 1))
43237 return true;
43238 APInt RHSUndef, RHSZero;
43239 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
43240 Depth + 1))
43241 return true;
43242
43243 // TODO - pass on known zero/undef.
43244
43245 // Aggressively peek through ops to get at the demanded elts.
43246 // TODO - we should do this for all target/faux shuffles ops.
43247 if (!DemandedElts.isAllOnes()) {
43248 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
43249 TLO.DAG, Depth + 1);
43250 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
43251 TLO.DAG, Depth + 1);
43252 if (NewN0 || NewN1) {
43253 NewN0 = NewN0 ? NewN0 : N0;
43254 NewN1 = NewN1 ? NewN1 : N1;
43255 return TLO.CombineTo(Op,
43256 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
43257 }
43258 }
43259 break;
43260 }
43261 case X86ISD::HADD:
43262 case X86ISD::HSUB:
43263 case X86ISD::FHADD:
43264 case X86ISD::FHSUB: {
43265 SDValue N0 = Op.getOperand(0);
43266 SDValue N1 = Op.getOperand(1);
43267
43268 APInt DemandedLHS, DemandedRHS;
43269 getHorizDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
43270
43271 APInt LHSUndef, LHSZero;
43272 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
43273 Depth + 1))
43274 return true;
43275 APInt RHSUndef, RHSZero;
43276 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
43277 Depth + 1))
43278 return true;
43279
43280 // TODO - pass on known zero/undef.
43281
43282 // Aggressively peek through ops to get at the demanded elts.
43283 // TODO: Handle repeated operands.
43284 if (N0 != N1 && !DemandedElts.isAllOnes()) {
43285 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
43286 TLO.DAG, Depth + 1);
43287 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
43288 TLO.DAG, Depth + 1);
43289 if (NewN0 || NewN1) {
43290 NewN0 = NewN0 ? NewN0 : N0;
43291 NewN1 = NewN1 ? NewN1 : N1;
43292 return TLO.CombineTo(Op,
43293 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
43294 }
43295 }
43296 break;
43297 }
43298 case X86ISD::VTRUNC:
43299 case X86ISD::VTRUNCS:
43300 case X86ISD::VTRUNCUS: {
43301 SDValue Src = Op.getOperand(0);
43302 MVT SrcVT = Src.getSimpleValueType();
43303 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
43304 APInt SrcUndef, SrcZero;
43305 if (SimplifyDemandedVectorElts(Src, DemandedSrc, SrcUndef, SrcZero, TLO,
43306 Depth + 1))
43307 return true;
43308 KnownZero = SrcZero.zextOrTrunc(NumElts);
43309 KnownUndef = SrcUndef.zextOrTrunc(NumElts);
43310 break;
43311 }
43312 case X86ISD::BLENDV: {
43313 APInt SelUndef, SelZero;
43314 if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef,
43315 SelZero, TLO, Depth + 1))
43316 return true;
43317
43318 // TODO: Use SelZero to adjust LHS/RHS DemandedElts.
43319 APInt LHSUndef, LHSZero;
43320 if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, LHSUndef,
43321 LHSZero, TLO, Depth + 1))
43322 return true;
43323
43324 APInt RHSUndef, RHSZero;
43325 if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedElts, RHSUndef,
43326 RHSZero, TLO, Depth + 1))
43327 return true;
43328
43329 KnownZero = LHSZero & RHSZero;
43330 KnownUndef = LHSUndef & RHSUndef;
43331 break;
43332 }
43333 case X86ISD::VZEXT_MOVL: {
43334 // If upper demanded elements are already zero then we have nothing to do.
43335 SDValue Src = Op.getOperand(0);
43336 APInt DemandedUpperElts = DemandedElts;
43337 DemandedUpperElts.clearLowBits(1);
43338 if (TLO.DAG.MaskedVectorIsZero(Src, DemandedUpperElts, Depth + 1))
43339 return TLO.CombineTo(Op, Src);
43340 break;
43341 }
43342 case X86ISD::VBROADCAST: {
43343 SDValue Src = Op.getOperand(0);
43344 MVT SrcVT = Src.getSimpleValueType();
43345 if (!SrcVT.isVector())
43346 break;
43347 // Don't bother broadcasting if we just need the 0'th element.
43348 if (DemandedElts == 1) {
43349 if (Src.getValueType() != VT)
43350 Src = widenSubVector(VT.getSimpleVT(), Src, false, Subtarget, TLO.DAG,
43351 SDLoc(Op));
43352 return TLO.CombineTo(Op, Src);
43353 }
43354 APInt SrcUndef, SrcZero;
43355 APInt SrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0);
43356 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
43357 Depth + 1))
43358 return true;
43359 // Aggressively peek through src to get at the demanded elt.
43360 // TODO - we should do this for all target/faux shuffles ops.
43361 if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
43362 Src, SrcElts, TLO.DAG, Depth + 1))
43363 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
43364 break;
43365 }
43366 case X86ISD::VPERMV:
43367 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 0, TLO,
43368 Depth))
43369 return true;
43370 break;
43371 case X86ISD::PSHUFB:
43372 case X86ISD::VPERMV3:
43373 case X86ISD::VPERMILPV:
43374 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 1, TLO,
43375 Depth))
43376 return true;
43377 break;
43378 case X86ISD::VPPERM:
43379 case X86ISD::VPERMIL2:
43380 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 2, TLO,
43381 Depth))
43382 return true;
43383 break;
43384 }
43385
43386 // For 256/512-bit ops that are 128/256-bit ops glued together, if we do not
43387 // demand any of the high elements, then narrow the op to 128/256-bits: e.g.
43388 // (op ymm0, ymm1) --> insert undef, (op xmm0, xmm1), 0
43389 if ((VT.is256BitVector() || VT.is512BitVector()) &&
43390 DemandedElts.lshr(NumElts / 2) == 0) {
43391 unsigned SizeInBits = VT.getSizeInBits();
43392 unsigned ExtSizeInBits = SizeInBits / 2;
43393
43394 // See if 512-bit ops only use the bottom 128-bits.
43395 if (VT.is512BitVector() && DemandedElts.lshr(NumElts / 4) == 0)
43396 ExtSizeInBits = SizeInBits / 4;
43397
43398 switch (Opc) {
43399 // Scalar broadcast.
43400 case X86ISD::VBROADCAST: {
43401 SDLoc DL(Op);
43402 SDValue Src = Op.getOperand(0);
43403 if (Src.getValueSizeInBits() > ExtSizeInBits)
43404 Src = extractSubVector(Src, 0, TLO.DAG, DL, ExtSizeInBits);
43405 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
43406 ExtSizeInBits / VT.getScalarSizeInBits());
43407 SDValue Bcst = TLO.DAG.getNode(X86ISD::VBROADCAST, DL, BcstVT, Src);
43408 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
43409 TLO.DAG, DL, ExtSizeInBits));
43410 }
43411 case X86ISD::VBROADCAST_LOAD: {
43412 SDLoc DL(Op);
43413 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
43414 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
43415 ExtSizeInBits / VT.getScalarSizeInBits());
43416 SDVTList Tys = TLO.DAG.getVTList(BcstVT, MVT::Other);
43417 SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)};
43418 SDValue Bcst = TLO.DAG.getMemIntrinsicNode(
43419 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MemIntr->getMemoryVT(),
43420 MemIntr->getMemOperand());
43421 TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
43422 Bcst.getValue(1));
43423 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
43424 TLO.DAG, DL, ExtSizeInBits));
43425 }
43426 // Subvector broadcast.
43427 case X86ISD::SUBV_BROADCAST_LOAD: {
43428 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
43429 EVT MemVT = MemIntr->getMemoryVT();
43430 if (ExtSizeInBits == MemVT.getStoreSizeInBits()) {
43431 SDLoc DL(Op);
43432 SDValue Ld =
43433 TLO.DAG.getLoad(MemVT, DL, MemIntr->getChain(),
43434 MemIntr->getBasePtr(), MemIntr->getMemOperand());
43435 TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
43436 Ld.getValue(1));
43437 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Ld, 0,
43438 TLO.DAG, DL, ExtSizeInBits));
43439 } else if ((ExtSizeInBits % MemVT.getStoreSizeInBits()) == 0) {
43440 SDLoc DL(Op);
43441 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
43442 ExtSizeInBits / VT.getScalarSizeInBits());
43443 if (SDValue BcstLd =
43444 getBROADCAST_LOAD(Opc, DL, BcstVT, MemVT, MemIntr, 0, TLO.DAG))
43445 return TLO.CombineTo(Op,
43446 insertSubVector(TLO.DAG.getUNDEF(VT), BcstLd, 0,
43447 TLO.DAG, DL, ExtSizeInBits));
43448 }
43449 break;
43450 }
43451 // Byte shifts by immediate.
43452 case X86ISD::VSHLDQ:
43453 case X86ISD::VSRLDQ:
43454 // Shift by uniform.
43455 case X86ISD::VSHL:
43456 case X86ISD::VSRL:
43457 case X86ISD::VSRA:
43458 // Shift by immediate.
43459 case X86ISD::VSHLI:
43460 case X86ISD::VSRLI:
43461 case X86ISD::VSRAI: {
43462 SDLoc DL(Op);
43463 SDValue Ext0 =
43464 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
43465 SDValue ExtOp =
43466 TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0, Op.getOperand(1));
43467 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
43468 SDValue Insert =
43469 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
43470 return TLO.CombineTo(Op, Insert);
43471 }
43472 case X86ISD::VPERMI: {
43473 // Simplify PERMPD/PERMQ to extract_subvector.
43474 // TODO: This should be done in shuffle combining.
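// For example: with only the low 128 bits demanded, a v4i64 VPERMI whose
// immediate encodes mask <2,3,...> just moves the source's upper 128-bit half
// into the lower half, which is exactly extract_subvector(src, 2).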
43475 if (VT == MVT::v4f64 || VT == MVT::v4i64) {
43476 SmallVector<int, 4> Mask;
43477 DecodeVPERMMask(NumElts, Op.getConstantOperandVal(1), Mask);
43478 if (isUndefOrEqual(Mask[0], 2) && isUndefOrEqual(Mask[1], 3)) {
43479 SDLoc DL(Op);
43480 SDValue Ext = extractSubVector(Op.getOperand(0), 2, TLO.DAG, DL, 128);
43481 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
43482 SDValue Insert = insertSubVector(UndefVec, Ext, 0, TLO.DAG, DL, 128);
43483 return TLO.CombineTo(Op, Insert);
43484 }
43485 }
43486 break;
43487 }
43488 case X86ISD::VPERM2X128: {
43489 // Simplify VPERM2F128/VPERM2I128 to extract_subvector.
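// The low nibble of the immediate selects the result's low 128-bit lane:
// bit 3 zeroes the lane, bit 1 picks the source operand, and bit 0 picks the
// lower/upper half of that source (e.g. an immediate with low nibble 0x3
// selects the upper half of the second operand).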
43490 SDLoc DL(Op);
43491 unsigned LoMask = Op.getConstantOperandVal(2) & 0xF;
43492 if (LoMask & 0x8)
43493 return TLO.CombineTo(
43494 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, DL));
43495 unsigned EltIdx = (LoMask & 0x1) * (NumElts / 2);
43496 unsigned SrcIdx = (LoMask & 0x2) >> 1;
43497 SDValue ExtOp =
43498 extractSubVector(Op.getOperand(SrcIdx), EltIdx, TLO.DAG, DL, 128);
43499 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
43500 SDValue Insert =
43501 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
43502 return TLO.CombineTo(Op, Insert);
43503 }
43504 // Zero upper elements.
43505 case X86ISD::VZEXT_MOVL:
43506 // Target unary shuffles by immediate:
43507 case X86ISD::PSHUFD:
43508 case X86ISD::PSHUFLW:
43509 case X86ISD::PSHUFHW:
43510 case X86ISD::VPERMILPI:
43511 // (Non-Lane Crossing) Target Shuffles.
43512 case X86ISD::VPERMILPV:
43513 case X86ISD::VPERMIL2:
43514 case X86ISD::PSHUFB:
43515 case X86ISD::UNPCKL:
43516 case X86ISD::UNPCKH:
43517 case X86ISD::BLENDI:
43518 // Integer ops.
43519 case X86ISD::PACKSS:
43520 case X86ISD::PACKUS:
43521 // Horizontal Ops.
43522 case X86ISD::HADD:
43523 case X86ISD::HSUB:
43524 case X86ISD::FHADD:
43525 case X86ISD::FHSUB: {
43526 SDLoc DL(Op);
43527 SmallVector<SDValue, 4> Ops;
43528 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
43529 SDValue SrcOp = Op.getOperand(i);
43530 EVT SrcVT = SrcOp.getValueType();
43531 assert((!SrcVT.isVector() || SrcVT.getSizeInBits() == SizeInBits) &&
43532 "Unsupported vector size");
43533 Ops.push_back(SrcVT.isVector() ? extractSubVector(SrcOp, 0, TLO.DAG, DL,
43534 ExtSizeInBits)
43535 : SrcOp);
43536 }
43537 MVT ExtVT = VT.getSimpleVT();
43538 ExtVT = MVT::getVectorVT(ExtVT.getScalarType(),
43539 ExtSizeInBits / ExtVT.getScalarSizeInBits());
43540 SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ops);
43541 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
43542 SDValue Insert =
43543 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
43544 return TLO.CombineTo(Op, Insert);
43545 }
43546 }
43547 }
43548
43549 // For splats, unless we *only* demand the 0'th element,
43550 // stop attempting simplification here - we aren't going to improve things,
43551 // and keeping the splat is better than any potential shuffle.
43552 if (!DemandedElts.isOne() && TLO.DAG.isSplatValue(Op, /*AllowUndefs*/false))
43553 return false;
43554
43555 // Get target/faux shuffle mask.
43556 APInt OpUndef, OpZero;
43557 SmallVector<int, 64> OpMask;
43558 SmallVector<SDValue, 2> OpInputs;
43559 if (!getTargetShuffleInputs(Op, DemandedElts, OpInputs, OpMask, OpUndef,
43560 OpZero, TLO.DAG, Depth, false))
43561 return false;
43562
43563 // Shuffle inputs must be the same size as the result.
43564 if (OpMask.size() != (unsigned)NumElts ||
43565 llvm::any_of(OpInputs, [VT](SDValue V) {
43566 return VT.getSizeInBits() != V.getValueSizeInBits() ||
43567 !V.getValueType().isVector();
43568 }))
43569 return false;
43570
43571 KnownZero = OpZero;
43572 KnownUndef = OpUndef;
43573
43574 // Check if shuffle mask can be simplified to undef/zero/identity.
43575 int NumSrcs = OpInputs.size();
43576 for (int i = 0; i != NumElts; ++i)
43577 if (!DemandedElts[i])
43578 OpMask[i] = SM_SentinelUndef;
43579
43580 if (isUndefInRange(OpMask, 0, NumElts)) {
43581 KnownUndef.setAllBits();
43582 return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
43583 }
43584 if (isUndefOrZeroInRange(OpMask, 0, NumElts)) {
43585 KnownZero.setAllBits();
43586 return TLO.CombineTo(
43587 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
43588 }
43589 for (int Src = 0; Src != NumSrcs; ++Src)
43590 if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts))
43591 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, OpInputs[Src]));
43592
43593 // Attempt to simplify inputs.
43594 for (int Src = 0; Src != NumSrcs; ++Src) {
43595 // TODO: Support inputs of different types.
43596 if (OpInputs[Src].getValueType() != VT)
43597 continue;
43598
43599 int Lo = Src * NumElts;
43600 APInt SrcElts = APInt::getZero(NumElts);
43601 for (int i = 0; i != NumElts; ++i)
43602 if (DemandedElts[i]) {
43603 int M = OpMask[i] - Lo;
43604 if (0 <= M && M < NumElts)
43605 SrcElts.setBit(M);
43606 }
43607
43608 // TODO - Propagate input undef/zero elts.
43609 APInt SrcUndef, SrcZero;
43610 if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero,
43611 TLO, Depth + 1))
43612 return true;
43613 }
43614
43615 // If we don't demand all elements, then attempt to combine to a simpler
43616 // shuffle.
43617 // We need to convert the depth to something combineX86ShufflesRecursively
43618 // can handle - so pretend it's Depth == 0 again, and reduce the max depth
43619 // to match. This prevents combineX86ShuffleChain from returning a
43620 // combined shuffle that's the same as the original root, causing an
43621 // infinite loop.
43622 if (!DemandedElts.isAllOnes()) {
43623 assert(Depth < X86::MaxShuffleCombineDepth && "Depth out of range");
43624
43625 SmallVector<int, 64> DemandedMask(NumElts, SM_SentinelUndef);
43626 for (int i = 0; i != NumElts; ++i)
43627 if (DemandedElts[i])
43628 DemandedMask[i] = i;
43629
43630 SDValue NewShuffle = combineX86ShufflesRecursively(
43631 {Op}, 0, Op, DemandedMask, {}, 0, X86::MaxShuffleCombineDepth - Depth,
43632 /*HasVarMask*/ false,
43633 /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, TLO.DAG,
43634 Subtarget);
43635 if (NewShuffle)
43636 return TLO.CombineTo(Op, NewShuffle);
43637 }
43638
43639 return false;
43640}
43641
43642bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
43643 SDValue Op, const APInt &OriginalDemandedBits,
43644 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
43645 unsigned Depth) const {
43646 EVT VT = Op.getValueType();
43647 unsigned BitWidth = OriginalDemandedBits.getBitWidth();
43648 unsigned Opc = Op.getOpcode();
43649 switch(Opc) {
43650 case X86ISD::VTRUNC: {
43651 KnownBits KnownOp;
43652 SDValue Src = Op.getOperand(0);
43653 MVT SrcVT = Src.getSimpleValueType();
43654
43655 // Simplify the input, using demanded bit information.
43656 APInt TruncMask = OriginalDemandedBits.zext(SrcVT.getScalarSizeInBits());
43657 APInt DemandedElts = OriginalDemandedElts.trunc(SrcVT.getVectorNumElements());
43658 if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, KnownOp, TLO, Depth + 1))
43659 return true;
43660 break;
43661 }
43662 case X86ISD::PMULDQ:
43663 case X86ISD::PMULUDQ: {
43664 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
43665 KnownBits KnownLHS, KnownRHS;
43666 SDValue LHS = Op.getOperand(0);
43667 SDValue RHS = Op.getOperand(1);
43668
43669 // Don't mask bits on 32-bit AVX512 targets which might lose a broadcast.
43670 // FIXME: Can we bound this better?
43671 APInt DemandedMask = APInt::getLowBitsSet(64, 32);
43672 APInt DemandedMaskLHS = APInt::getAllOnes(64);
43673 APInt DemandedMaskRHS = APInt::getAllOnes(64);
43674
43675 bool Is32BitAVX512 = !Subtarget.is64Bit() && Subtarget.hasAVX512();
43676 if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(LHS))
43677 DemandedMaskLHS = DemandedMask;
43678 if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(RHS))
43679 DemandedMaskRHS = DemandedMask;
43680
43681 if (SimplifyDemandedBits(LHS, DemandedMaskLHS, OriginalDemandedElts,
43682 KnownLHS, TLO, Depth + 1))
43683 return true;
43684 if (SimplifyDemandedBits(RHS, DemandedMaskRHS, OriginalDemandedElts,
43685 KnownRHS, TLO, Depth + 1))
43686 return true;
43687
43688 // PMULUDQ(X,1) -> AND(X,(1<<32)-1) 'getZeroExtendInReg'.
43689 KnownRHS = KnownRHS.trunc(32);
43690 if (Opc == X86ISD::PMULUDQ && KnownRHS.isConstant() &&
43691 KnownRHS.getConstant().isOne()) {
43692 SDLoc DL(Op);
43693 SDValue Mask = TLO.DAG.getConstant(DemandedMask, DL, VT);
43694 return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::AND, DL, VT, LHS, Mask));
43695 }
43696
43697 // Aggressively peek through ops to get at the demanded low bits.
43698 SDValue DemandedLHS = SimplifyMultipleUseDemandedBits(
43699 LHS, DemandedMaskLHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
43700 SDValue DemandedRHS = SimplifyMultipleUseDemandedBits(
43701 RHS, DemandedMaskRHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
43702 if (DemandedLHS || DemandedRHS) {
43703 DemandedLHS = DemandedLHS ? DemandedLHS : LHS;
43704 DemandedRHS = DemandedRHS ? DemandedRHS : RHS;
43705 return TLO.CombineTo(
43706 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, DemandedLHS, DemandedRHS));
43707 }
43708 break;
43709 }
43710 case X86ISD::VSHLI: {
43711 SDValue Op0 = Op.getOperand(0);
43712
43713 unsigned ShAmt = Op.getConstantOperandVal(1);
43714 if (ShAmt >= BitWidth)
43715 break;
43716
43717 APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt);
43718
43719 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
43720 // single shift. We can do this if the bottom bits (which are shifted
43721 // out) are never demanded.
43722 if (Op0.getOpcode() == X86ISD::VSRLI &&
43723 OriginalDemandedBits.countr_zero() >= ShAmt) {
43724 unsigned Shift2Amt = Op0.getConstantOperandVal(1);
43725 if (Shift2Amt < BitWidth) {
43726 int Diff = ShAmt - Shift2Amt;
43727 if (Diff == 0)
43728 return TLO.CombineTo(Op, Op0.getOperand(0));
43729
43730 unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI;
43731 SDValue NewShift = TLO.DAG.getNode(
43732 NewOpc, SDLoc(Op), VT, Op0.getOperand(0),
43733 TLO.DAG.getTargetConstant(std::abs(Diff), SDLoc(Op), MVT::i8));
43734 return TLO.CombineTo(Op, NewShift);
43735 }
43736 }
43737
43738 // If we are only demanding sign bits then we can use the shift source directly.
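// For example, if Op0 has 20 sign bits and ShAmt is 8, the upper 12 bits of
// (Op0 << 8) are still all copies of Op0's sign bit, so a demand confined to
// those upper bits is already satisfied by Op0 itself.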
43739 unsigned NumSignBits =
43740 TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1);
43741 unsigned UpperDemandedBits = BitWidth - OriginalDemandedBits.countr_zero();
43742 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
43743 return TLO.CombineTo(Op, Op0);
43744
43745 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
43746 TLO, Depth + 1))
43747 return true;
43748
43749 assert(!Known.hasConflict() && "Bits known to be one AND zero?");
43750 Known.Zero <<= ShAmt;
43751 Known.One <<= ShAmt;
43752
43753 // Low bits known zero.
43754 Known.Zero.setLowBits(ShAmt);
43755 return false;
43756 }
43757 case X86ISD::VSRLI: {
43758 unsigned ShAmt = Op.getConstantOperandVal(1);
43759 if (ShAmt >= BitWidth)
43760 break;
43761
43762 APInt DemandedMask = OriginalDemandedBits << ShAmt;
43763
43764 if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask,
43765 OriginalDemandedElts, Known, TLO, Depth + 1))
43766 return true;
43767
43768 assert(!Known.hasConflict() && "Bits known to be one AND zero?");
43769 Known.Zero.lshrInPlace(ShAmt);
43770 Known.One.lshrInPlace(ShAmt);
43771
43772 // High bits known zero.
43773 Known.Zero.setHighBits(ShAmt);
43774 return false;
43775 }
43776 case X86ISD::VSRAI: {
43777 SDValue Op0 = Op.getOperand(0);
43778 SDValue Op1 = Op.getOperand(1);
43779
43780 unsigned ShAmt = cast<ConstantSDNode>(Op1)->getZExtValue();
43781 if (ShAmt >= BitWidth)
43782 break;
43783
43784 APInt DemandedMask = OriginalDemandedBits << ShAmt;
43785
43786 // If we just want the sign bit then we don't need to shift it.
43787 if (OriginalDemandedBits.isSignMask())
43788 return TLO.CombineTo(Op, Op0);
43789
43790 // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
43791 if (Op0.getOpcode() == X86ISD::VSHLI &&
43792 Op.getOperand(1) == Op0.getOperand(1)) {
43793 SDValue Op00 = Op0.getOperand(0);
43794 unsigned NumSignBits =
43795 TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts);
43796 if (ShAmt < NumSignBits)
43797 return TLO.CombineTo(Op, Op00);
43798 }
43799
43800 // If any of the demanded bits are produced by the sign extension, we also
43801 // demand the input sign bit.
43802 if (OriginalDemandedBits.countl_zero() < ShAmt)
43803 DemandedMask.setSignBit();
43804
43805 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
43806 TLO, Depth + 1))
43807 return true;
43808
43809 assert(!Known.hasConflict() && "Bits known to be one AND zero?");
43810 Known.Zero.lshrInPlace(ShAmt);
43811 Known.One.lshrInPlace(ShAmt);
43812
43813 // If the input sign bit is known to be zero, or if none of the top bits
43814 // are demanded, turn this into an unsigned shift right.
43815 if (Known.Zero[BitWidth - ShAmt - 1] ||
43816 OriginalDemandedBits.countl_zero() >= ShAmt)
43817 return TLO.CombineTo(
43818 Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1));
43819
43820 // High bits are known one.
43821 if (Known.One[BitWidth - ShAmt - 1])
43822 Known.One.setHighBits(ShAmt);
43823 return false;
43824 }
43825 case X86ISD::BLENDV: {
43826 SDValue Sel = Op.getOperand(0);
43827 SDValue LHS = Op.getOperand(1);
43828 SDValue RHS = Op.getOperand(2);
43829
43830 APInt SignMask = APInt::getSignMask(BitWidth);
43831 SDValue NewSel = SimplifyMultipleUseDemandedBits(
43832 Sel, SignMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
43833 SDValue NewLHS = SimplifyMultipleUseDemandedBits(
43834 LHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
43835 SDValue NewRHS = SimplifyMultipleUseDemandedBits(
43836 RHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
43837
43838 if (NewSel || NewLHS || NewRHS) {
43839 NewSel = NewSel ? NewSel : Sel;
43840 NewLHS = NewLHS ? NewLHS : LHS;
43841 NewRHS = NewRHS ? NewRHS : RHS;
43842 return TLO.CombineTo(Op, TLO.DAG.getNode(X86ISD::BLENDV, SDLoc(Op), VT,
43843 NewSel, NewLHS, NewRHS));
43844 }
43845 break;
43846 }
43847 case X86ISD::PEXTRB:
43848 case X86ISD::PEXTRW: {
43849 SDValue Vec = Op.getOperand(0);
43850 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
43851 MVT VecVT = Vec.getSimpleValueType();
43852 unsigned NumVecElts = VecVT.getVectorNumElements();
43853
43854 if (CIdx && CIdx->getAPIntValue().ult(NumVecElts)) {
43855 unsigned Idx = CIdx->getZExtValue();
43856 unsigned VecBitWidth = VecVT.getScalarSizeInBits();
43857
43858 // If we demand no bits from the vector then we must have demanded
43859 // bits from the implicit zext - simplify to zero.
43860 APInt DemandedVecBits = OriginalDemandedBits.trunc(VecBitWidth);
43861 if (DemandedVecBits == 0)
43862 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
43863
43864 APInt KnownUndef, KnownZero;
43865 APInt DemandedVecElts = APInt::getOneBitSet(NumVecElts, Idx);
43866 if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef,
43867 KnownZero, TLO, Depth + 1))
43868 return true;
43869
43870 KnownBits KnownVec;
43871 if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts,
43872 KnownVec, TLO, Depth + 1))
43873 return true;
43874
43875 if (SDValue V = SimplifyMultipleUseDemandedBits(
43876 Vec, DemandedVecBits, DemandedVecElts, TLO.DAG, Depth + 1))
43877 return TLO.CombineTo(
43878 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, V, Op.getOperand(1)));
43879
43880 Known = KnownVec.zext(BitWidth);
43881 return false;
43882 }
43883 break;
43884 }
43885 case X86ISD::PINSRB:
43886 case X86ISD::PINSRW: {
43887 SDValue Vec = Op.getOperand(0);
43888 SDValue Scl = Op.getOperand(1);
43889 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
43890 MVT VecVT = Vec.getSimpleValueType();
43891
43892 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) {
43893 unsigned Idx = CIdx->getZExtValue();
43894 if (!OriginalDemandedElts[Idx])
43895 return TLO.CombineTo(Op, Vec);
43896
43897 KnownBits KnownVec;
43898 APInt DemandedVecElts(OriginalDemandedElts);
43899 DemandedVecElts.clearBit(Idx);
43900 if (SimplifyDemandedBits(Vec, OriginalDemandedBits, DemandedVecElts,
43901 KnownVec, TLO, Depth + 1))
43902 return true;
43903
43904 KnownBits KnownScl;
43905 unsigned NumSclBits = Scl.getScalarValueSizeInBits();
43906 APInt DemandedSclBits = OriginalDemandedBits.zext(NumSclBits);
43907 if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1))
43908 return true;
43909
43910 KnownScl = KnownScl.trunc(VecVT.getScalarSizeInBits());
43911 Known = KnownBits::commonBits(KnownVec, KnownScl);
43912 return false;
43913 }
43914 break;
43915 }
43916 case X86ISD::PACKSS:
43917 // PACKSS saturates to MIN/MAX integer values. So if we just want the
43918 // sign bit then we can just ask for the source operands' sign bit.
43919 // TODO - add known bits handling.
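// Why this is safe: signed saturation preserves the sign - a negative input
// saturates to (or stays at) a negative value and a non-negative input to a
// non-negative one - so each result element's sign bit equals the sign bit
// (i.e. the MSB of the wider element) of its source.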
43920 if (OriginalDemandedBits.isSignMask()) {
43921 APInt DemandedLHS, DemandedRHS;
43922 getPackDemandedElts(VT, OriginalDemandedElts, DemandedLHS, DemandedRHS);
43923
43924 KnownBits KnownLHS, KnownRHS;
43925 APInt SignMask = APInt::getSignMask(BitWidth * 2);
43926 if (SimplifyDemandedBits(Op.getOperand(0), SignMask, DemandedLHS,
43927 KnownLHS, TLO, Depth + 1))
43928 return true;
43929 if (SimplifyDemandedBits(Op.getOperand(1), SignMask, DemandedRHS,
43930 KnownRHS, TLO, Depth + 1))
43931 return true;
43932
43933 // Attempt to avoid multi-use ops if we don't need anything from them.
43934 SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
43935 Op.getOperand(0), SignMask, DemandedLHS, TLO.DAG, Depth + 1);
43936 SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits(
43937 Op.getOperand(1), SignMask, DemandedRHS, TLO.DAG, Depth + 1);
43938 if (DemandedOp0 || DemandedOp1) {
43939 SDValue Op0 = DemandedOp0 ? DemandedOp0 : Op.getOperand(0);
43940 SDValue Op1 = DemandedOp1 ? DemandedOp1 : Op.getOperand(1);
43941 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, Op0, Op1));
43942 }
43943 }
43944 // TODO - add general PACKSS/PACKUS SimplifyDemandedBits support.
43945 break;
43946 case X86ISD::VBROADCAST: {
43947 SDValue Src = Op.getOperand(0);
43948 MVT SrcVT = Src.getSimpleValueType();
43949 APInt DemandedElts = APInt::getOneBitSet(
43950 SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1, 0);
43951 if (SimplifyDemandedBits(Src, OriginalDemandedBits, DemandedElts, Known,
43952 TLO, Depth + 1))
43953 return true;
43954 // If we don't need the upper bits, attempt to narrow the broadcast source.
43955 // Don't attempt this on AVX512 as it might affect broadcast folding.
43956 // TODO: Should we attempt this for i32/i16 splats? They tend to be slower.
43957 if ((BitWidth == 64) && SrcVT.isScalarInteger() && !Subtarget.hasAVX512() &&
43958 OriginalDemandedBits.countl_zero() >= (BitWidth / 2) &&
43959 Src->hasOneUse()) {
43960 MVT NewSrcVT = MVT::getIntegerVT(BitWidth / 2);
43961 SDValue NewSrc =
43962 TLO.DAG.getNode(ISD::TRUNCATE, SDLoc(Src), NewSrcVT, Src);
43963 MVT NewVT = MVT::getVectorVT(NewSrcVT, VT.getVectorNumElements() * 2);
43964 SDValue NewBcst =
43965 TLO.DAG.getNode(X86ISD::VBROADCAST, SDLoc(Op), NewVT, NewSrc);
43966 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, NewBcst));
43967 }
43968 break;
43969 }
43970 case X86ISD::PCMPGT:
43971 // icmp sgt(0, R) == ashr(R, BitWidth-1).
43972 // iff we only need the sign bit then we can use R directly.
43973 if (OriginalDemandedBits.isSignMask() &&
43974 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
43975 return TLO.CombineTo(Op, Op.getOperand(1));
43976 break;
43977 case X86ISD::MOVMSK: {
43978 SDValue Src = Op.getOperand(0);
43979 MVT SrcVT = Src.getSimpleValueType();
43980 unsigned SrcBits = SrcVT.getScalarSizeInBits();
43981 unsigned NumElts = SrcVT.getVectorNumElements();
43982
43983 // If we don't need the sign bits at all just return zero.
43984 if (OriginalDemandedBits.countr_zero() >= NumElts)
43985 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
43986
43987 // See if we only demand bits from the lower 128-bit vector.
43988 if (SrcVT.is256BitVector() &&
43989 OriginalDemandedBits.getActiveBits() <= (NumElts / 2)) {
43990 SDValue NewSrc = extract128BitVector(Src, 0, TLO.DAG, SDLoc(Src));
43991 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
43992 }
43993
43994 // Only demand the vector elements of the sign bits we need.
43995 APInt KnownUndef, KnownZero;
43996 APInt DemandedElts = OriginalDemandedBits.zextOrTrunc(NumElts);
43997 if (SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
43998 TLO, Depth + 1))
43999 return true;
44000
44001 Known.Zero = KnownZero.zext(BitWidth);
44002 Known.Zero.setHighBits(BitWidth - NumElts);
44003
44004 // MOVMSK only uses the MSB from each vector element.
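// For example, with a v4f32 source MOVMSKPS packs the four lane sign bits
// into bits 3:0 of the i32 result, so only the MSB of each f32 lane is
// demanded from Src.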
44005 KnownBits KnownSrc;
44006 APInt DemandedSrcBits = APInt::getSignMask(SrcBits);
44007 if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, KnownSrc, TLO,
44008 Depth + 1))
44009 return true;
44010
44011 if (KnownSrc.One[SrcBits - 1])
44012 Known.One.setLowBits(NumElts);
44013 else if (KnownSrc.Zero[SrcBits - 1])
44014 Known.Zero.setLowBits(NumElts);
44015
44016 // Attempt to avoid multi-use ops if we don't need anything from it.
44017 if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(
44018 Src, DemandedSrcBits, DemandedElts, TLO.DAG, Depth + 1))
44019 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
44020 return false;
44021 }
44022 case X86ISD::TESTP: {
44023 SDValue Op0 = Op.getOperand(0);
44024 SDValue Op1 = Op.getOperand(1);
44025 MVT OpVT = Op0.getSimpleValueType();
44026 assert((OpVT.getVectorElementType() == MVT::f32 ||
44027 OpVT.getVectorElementType() == MVT::f64) &&
44028 "Illegal vector type for X86ISD::TESTP");
44029
44030 // TESTPS/TESTPD only demands the sign bits of ALL the elements.
44031 KnownBits KnownSrc;
44032 APInt SignMask = APInt::getSignMask(OpVT.getScalarSizeInBits());
44033 bool AssumeSingleUse = (Op0 == Op1) && Op->isOnlyUserOf(Op0.getNode());
44034 return SimplifyDemandedBits(Op0, SignMask, KnownSrc, TLO, Depth + 1,
44035 AssumeSingleUse) ||
44036 SimplifyDemandedBits(Op1, SignMask, KnownSrc, TLO, Depth + 1,
44037 AssumeSingleUse);
44038 }
44039 case X86ISD::BEXTR:
44040 case X86ISD::BEXTRI: {
44041 SDValue Op0 = Op.getOperand(0);
44042 SDValue Op1 = Op.getOperand(1);
44043
44044 // Only bottom 16-bits of the control bits are required.
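// BEXTR control encoding: bits [7:0] hold the start bit and bits [15:8] hold
// the extraction length, e.g. a control of 0x0804 extracts 8 bits starting at
// bit 4. Bits above 15 are ignored by the instruction, so they can be masked.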
44045 if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
44046 // NOTE: SimplifyDemandedBits won't do this for constants.
44047 uint64_t Val1 = Cst1->getZExtValue();
44048 uint64_t MaskedVal1 = Val1 & 0xFFFF;
44049 if (Opc == X86ISD::BEXTR && MaskedVal1 != Val1) {
44050 SDLoc DL(Op);
44051 return TLO.CombineTo(
44052 Op, TLO.DAG.getNode(X86ISD::BEXTR, DL, VT, Op0,
44053 TLO.DAG.getConstant(MaskedVal1, DL, VT)));
44054 }
44055
44056 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
44057 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
44058
44059 // If the length is 0, the result is 0.
44060 if (Length == 0) {
44061 Known.setAllZero();
44062 return false;
44063 }
44064
44065 if ((Shift + Length) <= BitWidth) {
44066 APInt DemandedMask = APInt::getBitsSet(BitWidth, Shift, Shift + Length);
44067 if (SimplifyDemandedBits(Op0, DemandedMask, Known, TLO, Depth + 1))
44068 return true;
44069
44070 Known = Known.extractBits(Length, Shift);
44071 Known = Known.zextOrTrunc(BitWidth);
44072 return false;
44073 }
44074 } else {
44075 assert(Opc == X86ISD::BEXTR && "Unexpected opcode!");
44076 KnownBits Known1;
44077 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, 16));
44078 if (SimplifyDemandedBits(Op1, DemandedMask, Known1, TLO, Depth + 1))
44079 return true;
44080
44081 // If the length is 0, replace with 0.
44082 KnownBits LengthBits = Known1.extractBits(8, 8);
44083 if (LengthBits.isZero())
44084 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
44085 }
44086
44087 break;
44088 }
44089 case X86ISD::PDEP: {
44090 SDValue Op0 = Op.getOperand(0);
44091 SDValue Op1 = Op.getOperand(1);
44092
44093 unsigned DemandedBitsLZ = OriginalDemandedBits.countl_zero();
44094 APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ);
44095
44096 // If the demanded bits have leading zeroes, we don't demand those from the
44097 // mask.
44098 if (SimplifyDemandedBits(Op1, LoMask, Known, TLO, Depth + 1))
44099 return true;
44100
44101 // The number of possible 1s in the mask determines the number of LSBs of
44102 // operand 0 used. Undemanded bits from the mask don't matter so filter
44103 // them before counting.
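// Worked example: with a mask of 0b00100110 (set bits at positions 1, 2 and
// 5), PDEP deposits the three lowest source bits into those positions, so
// only the 3 LSBs of operand 0 are demanded.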
44104 KnownBits Known2;
44105 uint64_t Count = (~Known.Zero & LoMask).popcount();
44106 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Count));
44107 if (SimplifyDemandedBits(Op0, DemandedMask, Known2, TLO, Depth + 1))
44108 return true;
44109
44110 // Zeroes are retained from the mask, but not ones.
44111 Known.One.clearAllBits();
44112 // The result will have at least as many trailing zeros as the non-mask
44113 // operand since bits can only map to the same or higher bit position.
44114 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
44115 return false;
44116 }
44117 }
44118
44119 return TargetLowering::SimplifyDemandedBitsForTargetNode(
44120 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
44121}
44122
44123SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
44124 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
44125 SelectionDAG &DAG, unsigned Depth) const {
44126 int NumElts = DemandedElts.getBitWidth();
44127 unsigned Opc = Op.getOpcode();
44128 EVT VT = Op.getValueType();
44129
44130 switch (Opc) {
44131 case X86ISD::PINSRB:
44132 case X86ISD::PINSRW: {
44133 // If we don't demand the inserted element, return the base vector.
44134 SDValue Vec = Op.getOperand(0);
44135 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
44136 MVT VecVT = Vec.getSimpleValueType();
44137 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) &&
44138 !DemandedElts[CIdx->getZExtValue()])
44139 return Vec;
44140 break;
44141 }
44142 case X86ISD::VSHLI: {
44143 // If we are only demanding sign bits then we can use the shift source
44144 // directly.
44145 SDValue Op0 = Op.getOperand(0);
44146 unsigned ShAmt = Op.getConstantOperandVal(1);
44147 unsigned BitWidth = DemandedBits.getBitWidth();
44148 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1);
44149 unsigned UpperDemandedBits = BitWidth - DemandedBits.countr_zero();
44150 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
44151 return Op0;
44152 break;
44153 }
44154 case X86ISD::VSRAI:
44155 // iff we only need the sign bit then we can use the source directly.
44156 // TODO: generalize where we only demand extended signbits.
44157 if (DemandedBits.isSignMask())
44158 return Op.getOperand(0);
44159 break;
44160 case X86ISD::PCMPGT:
44161 // icmp sgt(0, R) == ashr(R, BitWidth-1).
44162 // iff we only need the sign bit then we can use R directly.
44163 if (DemandedBits.isSignMask() &&
44164 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
44165 return Op.getOperand(1);
44166 break;
44167 case X86ISD::ANDNP: {
44168 // ANDNP = (~LHS & RHS);
44169 SDValue LHS = Op.getOperand(0);
44170 SDValue RHS = Op.getOperand(1);
44171
44172 KnownBits LHSKnown = DAG.computeKnownBits(LHS, DemandedElts, Depth + 1);
44173 KnownBits RHSKnown = DAG.computeKnownBits(RHS, DemandedElts, Depth + 1);
44174
44175 // If every demanded bit is known 0 on the LHS or known 0 on the RHS, then
44176 // the 'andn' result matches RHS on all demanded bits (the inverted LHS bit
44177 // is either 1 or irrelevant), so return RHS.
44178 if (DemandedBits.isSubsetOf(RHSKnown.Zero | LHSKnown.Zero))
44179 return RHS;
44180 break;
44181 }
44182 }
44183
44184 APInt ShuffleUndef, ShuffleZero;
44185 SmallVector<int, 16> ShuffleMask;
44186 SmallVector<SDValue, 2> ShuffleOps;
44187 if (getTargetShuffleInputs(Op, DemandedElts, ShuffleOps, ShuffleMask,
44188 ShuffleUndef, ShuffleZero, DAG, Depth, false)) {
44189 // If all the demanded elts are from one operand and are inline,
44190 // then we can use the operand directly.
44191 int NumOps = ShuffleOps.size();
44192 if (ShuffleMask.size() == (unsigned)NumElts &&
44193 llvm::all_of(ShuffleOps, [VT](SDValue V) {
44194 return VT.getSizeInBits() == V.getValueSizeInBits();
44195 })) {
44196
44197 if (DemandedElts.isSubsetOf(ShuffleUndef))
44198 return DAG.getUNDEF(VT);
44199 if (DemandedElts.isSubsetOf(ShuffleUndef | ShuffleZero))
44200 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(Op));
44201
44202 // Bitmask that indicates which ops have only been accessed 'inline'.
44203 APInt IdentityOp = APInt::getAllOnes(NumOps);
44204 for (int i = 0; i != NumElts; ++i) {
44205 int M = ShuffleMask[i];
44206 if (!DemandedElts[i] || ShuffleUndef[i])
44207 continue;
44208 int OpIdx = M / NumElts;
44209 int EltIdx = M % NumElts;
44210 if (M < 0 || EltIdx != i) {
44211 IdentityOp.clearAllBits();
44212 break;
44213 }
44214 IdentityOp &= APInt::getOneBitSet(NumOps, OpIdx);
44215 if (IdentityOp == 0)
44216 break;
44217 }
44218 assert((IdentityOp == 0 || IdentityOp.popcount() == 1) &&
44219 "Multiple identity shuffles detected");
44220
44221 if (IdentityOp != 0)
44222 return DAG.getBitcast(VT, ShuffleOps[IdentityOp.countr_zero()]);
44223 }
44224 }
44225
44226 return TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
44227 Op, DemandedBits, DemandedElts, DAG, Depth);
44228}
44229
44230bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
44231 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
44232 bool PoisonOnly, unsigned Depth) const {
44233 unsigned EltsBits = Op.getScalarValueSizeInBits();
44234 unsigned NumElts = DemandedElts.getBitWidth();
44235
44236 // TODO: Add more target shuffles.
44237 switch (Op.getOpcode()) {
44238 case X86ISD::PSHUFD:
44239 case X86ISD::VPERMILPI: {
44240 SmallVector<int, 8> Mask;
44241 DecodePSHUFMask(NumElts, EltsBits, Op.getConstantOperandVal(1), Mask);
44242
44243 APInt DemandedSrcElts = APInt::getZero(NumElts);
44244 for (unsigned I = 0; I != NumElts; ++I)
44245 if (DemandedElts[I])
44246 DemandedSrcElts.setBit(Mask[I]);
44247
44248 return DAG.isGuaranteedNotToBeUndefOrPoison(
44249 Op.getOperand(0), DemandedSrcElts, PoisonOnly, Depth + 1);
44250 }
44251 }
44252 return TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
44253 Op, DemandedElts, DAG, PoisonOnly, Depth);
44254}
44255
44256bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode(
44257 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
44258 bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
44259
44260 // TODO: Add more target shuffles.
44261 switch (Op.getOpcode()) {
44262 case X86ISD::PSHUFD:
44263 case X86ISD::VPERMILPI:
44264 return false;
44265 }
44266 return TargetLowering::canCreateUndefOrPoisonForTargetNode(
44267 Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
44268}
44269
44270bool X86TargetLowering::isSplatValueForTargetNode(SDValue Op,
44271 const APInt &DemandedElts,
44272 APInt &UndefElts,
44273 const SelectionDAG &DAG,
44274 unsigned Depth) const {
44275 unsigned NumElts = DemandedElts.getBitWidth();
44276 unsigned Opc = Op.getOpcode();
44277
44278 switch (Opc) {
44279 case X86ISD::VBROADCAST:
44280 case X86ISD::VBROADCAST_LOAD:
44281 UndefElts = APInt::getZero(NumElts);
44282 return true;
44283 }
44284
44285 return TargetLowering::isSplatValueForTargetNode(Op, DemandedElts, UndefElts,
44286 DAG, Depth);
44287}
44288
44289// Helper to peek through bitops/trunc/setcc to determine size of source vector.
44290// Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.
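// For example, for Src = (v8i1 setcc (v8i32 a, v8i32 b)) a query with
// Size == 256 returns true, since the compared vectors are 256 bits wide.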
44291static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size,
44292 bool AllowTruncate) {
44293 switch (Src.getOpcode()) {
44294 case ISD::TRUNCATE:
44295 if (!AllowTruncate)
44296 return false;
44297 [[fallthrough]];
44298 case ISD::SETCC:
44299 return Src.getOperand(0).getValueSizeInBits() == Size;
44300 case ISD::AND:
44301 case ISD::XOR:
44302 case ISD::OR:
44303 return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate) &&
44304 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate);
44305 case ISD::SELECT:
44306 case ISD::VSELECT:
44307 return Src.getOperand(0).getScalarValueSizeInBits() == 1 &&
44308 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate) &&
44309 checkBitcastSrcVectorSize(Src.getOperand(2), Size, AllowTruncate);
44310 case ISD::BUILD_VECTOR:
44311 return ISD::isBuildVectorAllZeros(Src.getNode()) ||
44312 ISD::isBuildVectorAllOnes(Src.getNode());
44313 }
44314 return false;
44315}
44316
44317// Helper to flip between AND/OR/XOR opcodes and their X86ISD FP equivalents.
44318static unsigned getAltBitOpcode(unsigned Opcode) {
44319 switch(Opcode) {
44320 case ISD::AND: return X86ISD::FAND;
44321 case ISD::OR: return X86ISD::FOR;
44322 case ISD::XOR: return X86ISD::FXOR;
44323 case X86ISD::ANDNP: return X86ISD::FANDN;
44324 }
44325 llvm_unreachable("Unknown bitwise opcode");
44326}
44327
44328// Helper to adjust v4i32 MOVMSK expansion to work with SSE1-only targets.
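// SSE1 has no integer compares or integer MOVMSK, but a v4i32 signed
// less-than-zero compare only reads each lane's sign bit, and those sign bits
// sit in the same positions in the v4f32 bit pattern - so the operand can be
// reinterpreted as v4f32 and fed to MOVMSKPS instead.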
44329static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src,
44330 const SDLoc &DL) {
44331 EVT SrcVT = Src.getValueType();
44332 if (SrcVT != MVT::v4i1)
44333 return SDValue();
44334
44335 switch (Src.getOpcode()) {
44336 case ISD::SETCC:
44337 if (Src.getOperand(0).getValueType() == MVT::v4i32 &&
44338 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode()) &&
44339 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT) {
44340 SDValue Op0 = Src.getOperand(0);
44341 if (ISD::isNormalLoad(Op0.getNode()))
44342 return DAG.getBitcast(MVT::v4f32, Op0);
44343 if (Op0.getOpcode() == ISD::BITCAST &&
44344 Op0.getOperand(0).getValueType() == MVT::v4f32)
44345 return Op0.getOperand(0);
44346 }
44347 break;
44348 case ISD::AND:
44349 case ISD::XOR:
44350 case ISD::OR: {
44351 SDValue Op0 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(0), DL);
44352 SDValue Op1 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(1), DL);
44353 if (Op0 && Op1)
44354 return DAG.getNode(getAltBitOpcode(Src.getOpcode()), DL, MVT::v4f32, Op0,
44355 Op1);
44356 break;
44357 }
44358 }
44359 return SDValue();
44360}
44361
44362// Helper to push sign extension of vXi1 SETCC result through bitops.
44363static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT,
44364 SDValue Src, const SDLoc &DL) {
44365 switch (Src.getOpcode()) {
44366 case ISD::SETCC:
44367 case ISD::TRUNCATE:
44368 case ISD::BUILD_VECTOR:
44369 return DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
44370 case ISD::AND:
44371 case ISD::XOR:
44372 case ISD::OR:
44373 return DAG.getNode(
44374 Src.getOpcode(), DL, SExtVT,
44375 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(0), DL),
44376 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL));
44377 case ISD::SELECT:
44378 case ISD::VSELECT:
44379 return DAG.getSelect(
44380 DL, SExtVT, Src.getOperand(0),
44381 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL),
44382 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(2), DL));
44383 }
44384 llvm_unreachable("Unexpected node type for vXi1 sign extension");
44385}
44386
44387// Try to match patterns such as
44388// (i16 bitcast (v16i1 x))
44389// ->
44390// (i16 movmsk (16i8 sext (v16i1 x)))
44391// before the illegal vector is scalarized on subtargets that don't have legal
44392// vxi1 types.
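// For example, on SSE2 (i16 bitcast (v16i1 setcc (v16i8 a, b, eq))) can be
// lowered as PCMPEQB feeding PMOVMSKB rather than scalarizing sixteen i1
// extractions.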
44393static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
44394 const SDLoc &DL,
44395 const X86Subtarget &Subtarget) {
44396 EVT SrcVT = Src.getValueType();
44397 if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1)
44398 return SDValue();
44399
44400 // Recognize the IR pattern for the movmsk intrinsic under SSE1 before type
44401 // legalization destroys the v4i32 type.
44402 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2()) {
44403 if (SDValue V = adjustBitcastSrcVectorSSE1(DAG, Src, DL)) {
44404 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32,
44405 DAG.getBitcast(MVT::v4f32, V));
44406 return DAG.getZExtOrTrunc(V, DL, VT);
44407 }
44408 }
44409
44410 // If the input is a truncate from v16i8 or v32i8 go ahead and use a
44411 // movmskb even with avx512. This will be better than truncating to vXi1 and
44412 // using a kmov. This can especially help KNL if the input is a v16i8/v32i8
44413 // vpcmpeqb/vpcmpgtb.
44414 bool PreferMovMsk = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
44415 (Src.getOperand(0).getValueType() == MVT::v16i8 ||
44416 Src.getOperand(0).getValueType() == MVT::v32i8 ||
44417 Src.getOperand(0).getValueType() == MVT::v64i8);
44418
44419 // Prefer movmsk for AVX512 for (bitcast (setlt X, 0)) which can be handled
44420 // directly with vpmovmskb/vmovmskps/vmovmskpd.
44421 if (Src.getOpcode() == ISD::SETCC && Src.hasOneUse() &&
44422 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT &&
44423 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode())) {
44424 EVT CmpVT = Src.getOperand(0).getValueType();
44425 EVT EltVT = CmpVT.getVectorElementType();
44426 if (CmpVT.getSizeInBits() <= 256 &&
44427 (EltVT == MVT::i8 || EltVT == MVT::i32 || EltVT == MVT::i64))
44428 PreferMovMsk = true;
44429 }
44430
44431 // With AVX512 vxi1 types are legal and we prefer using k-regs.
44432 // MOVMSK is supported in SSE2 or later.
44433 if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !PreferMovMsk))
44434 return SDValue();
44435
44436 // If the upper ops of a concatenation are undef, then try to bitcast the
44437 // lower op and extend.
44438 SmallVector<SDValue, 4> SubSrcOps;
44439 if (collectConcatOps(Src.getNode(), SubSrcOps, DAG) &&
44440 SubSrcOps.size() >= 2) {
44441 SDValue LowerOp = SubSrcOps[0];
44442 ArrayRef<SDValue> UpperOps(std::next(SubSrcOps.begin()), SubSrcOps.end());
44443 if (LowerOp.getOpcode() == ISD::SETCC &&
44444 all_of(UpperOps, [](SDValue Op) { return Op.isUndef(); })) {
44445 EVT SubVT = VT.getIntegerVT(
44446 *DAG.getContext(), LowerOp.getValueType().getVectorMinNumElements());
44447 if (SDValue V = combineBitcastvxi1(DAG, SubVT, LowerOp, DL, Subtarget)) {
44448 EVT IntVT = VT.getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
44449 return DAG.getBitcast(VT, DAG.getNode(ISD::ANY_EXTEND, DL, IntVT, V));
44450 }
44451 }
44452 }
44453
44454 // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v2f64 and
44455 // v4f64. So all legal 128-bit and 256-bit vectors are covered except for
44456 // v8i16 and v16i16.
44457 // For these two cases, we can shuffle the upper element bytes to a
44458 // consecutive sequence at the start of the vector and treat the results as
44459 // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
44460 // for v16i16 this is not the case, because the shuffle is expensive, so we
44461 // avoid sign-extending to this type entirely.
44462 // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
44463 // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
44464 MVT SExtVT;
44465 bool PropagateSExt = false;
44466 switch (SrcVT.getSimpleVT().SimpleTy) {
44467 default:
44468 return SDValue();
44469 case MVT::v2i1:
44470 SExtVT = MVT::v2i64;
44471 break;
44472 case MVT::v4i1:
44473 SExtVT = MVT::v4i32;
44474 // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
44475 // sign-extend to a 256-bit operation to avoid truncation.
44476 if (Subtarget.hasAVX() &&
44477 checkBitcastSrcVectorSize(Src, 256, Subtarget.hasAVX2())) {
44478 SExtVT = MVT::v4i64;
44479 PropagateSExt = true;
44480 }
44481 break;
44482 case MVT::v8i1:
44483 SExtVT = MVT::v8i16;
44484 // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
44485 // sign-extend to a 256-bit operation to match the compare.
44486 // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
44487 // 256-bit because the shuffle is cheaper than sign extending the result of
44488 // the compare.
44489 if (Subtarget.hasAVX() && (checkBitcastSrcVectorSize(Src, 256, true) ||
44490 checkBitcastSrcVectorSize(Src, 512, true))) {
44491 SExtVT = MVT::v8i32;
44492 PropagateSExt = true;
44493 }
44494 break;
44495 case MVT::v16i1:
44496 SExtVT = MVT::v16i8;
44497 // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
44498 // it is not profitable to sign-extend to 256-bit because this will
44499 // require an extra cross-lane shuffle which is more expensive than
44500 // truncating the result of the compare to 128-bits.
44501 break;
44502 case MVT::v32i1:
44503 SExtVT = MVT::v32i8;
44504 break;
44505 case MVT::v64i1:
44506 // If we have AVX512F but not AVX512BW, and the input is a truncate from
44507 // v64i8 (checked earlier), then split the input and make two pmovmskbs.
44508 if (Subtarget.hasAVX512()) {
44509 if (Subtarget.hasBWI())
44510 return SDValue();
44511 SExtVT = MVT::v64i8;
44512 break;
44513 }
44514 // Split if this is a <64 x i8> comparison result.
44515 if (checkBitcastSrcVectorSize(Src, 512, false)) {
44516 SExtVT = MVT::v64i8;
44517 break;
44518 }
44519 return SDValue();
44520 };
44521
44522 SDValue V = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL)
44523 : DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
44524
44525 if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8 || SExtVT == MVT::v64i8) {
44526 V = getPMOVMSKB(DL, V, DAG, Subtarget);
44527 } else {
44528 if (SExtVT == MVT::v8i16)
44529 V = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, V,
44530 DAG.getUNDEF(MVT::v8i16));
44531 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
44532 }
44533
44534 EVT IntVT =
44535 EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements());
44536 V = DAG.getZExtOrTrunc(V, DL, IntVT);
44537 return DAG.getBitcast(VT, V);
44538}
44539
44540// Convert a vXi1 constant build vector to the same width scalar integer.
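// For example, (v4i1 <1,0,1,1>) becomes the i4 constant 0b1101, with vector
// element 0 mapping to bit 0.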
44541static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
44542 EVT SrcVT = Op.getValueType();
44543 assert(SrcVT.getVectorElementType() == MVT::i1 &&
44544 "Expected a vXi1 vector");
44545 assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
44546 "Expected a constant build vector");
44547
44548 APInt Imm(SrcVT.getVectorNumElements(), 0);
44549 for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) {
44550 SDValue In = Op.getOperand(Idx);
44551 if (!In.isUndef() && (cast<ConstantSDNode>(In)->getZExtValue() & 0x1))
44552 Imm.setBit(Idx);
44553 }
44554 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth());
44555 return DAG.getConstant(Imm, SDLoc(Op), IntVT);
44556}
44557
44558static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
44559 TargetLowering::DAGCombinerInfo &DCI,
44560 const X86Subtarget &Subtarget) {
44561 assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");
44562
44563 if (!DCI.isBeforeLegalizeOps())
44564 return SDValue();
44565
44566 // Only do this if we have k-registers.
44567 if (!Subtarget.hasAVX512())
44568 return SDValue();
44569
44570 EVT DstVT = N->getValueType(0);
44571 SDValue Op = N->getOperand(0);
44572 EVT SrcVT = Op.getValueType();
44573
44574 if (!Op.hasOneUse())
44575 return SDValue();
44576
44577 // Look for logic ops.
44578 if (Op.getOpcode() != ISD::AND &&
44579 Op.getOpcode() != ISD::OR &&
44580 Op.getOpcode() != ISD::XOR)
44581 return SDValue();
44582
44583 // Make sure we have a bitcast between mask registers and a scalar type.
44584 if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
44585 DstVT.isScalarInteger()) &&
44586 !(DstVT.isVector() && DstVT.getVectorElementType() == MVT::i1 &&
44587 SrcVT.isScalarInteger()))
44588 return SDValue();
44589
44590 SDValue LHS = Op.getOperand(0);
44591 SDValue RHS = Op.getOperand(1);
44592
44593 if (LHS.hasOneUse() && LHS.getOpcode() == ISD::BITCAST &&
44594 LHS.getOperand(0).getValueType() == DstVT)
44595 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT, LHS.getOperand(0),
44596 DAG.getBitcast(DstVT, RHS));
44597
44598 if (RHS.hasOneUse() && RHS.getOpcode() == ISD::BITCAST &&
44599 RHS.getOperand(0).getValueType() == DstVT)
44600 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
44601 DAG.getBitcast(DstVT, LHS), RHS.getOperand(0));
44602
44603 // If the RHS is a vXi1 build vector, this is a good reason to flip too.
44604 // Most of these have to move a constant from the scalar domain anyway.
44605 if (ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) {
44606 RHS = combinevXi1ConstantToInteger(RHS, DAG);
44607 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
44608 DAG.getBitcast(DstVT, LHS), RHS);
44609 }
44610
44611 return SDValue();
44612}
44613
44614static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
44615 const X86Subtarget &Subtarget) {
44616 SDLoc DL(BV);
44617 unsigned NumElts = BV->getNumOperands();
44618 SDValue Splat = BV->getSplatValue();
44619
44620 // Build MMX element from integer GPR or SSE float values.
44621 auto CreateMMXElement = [&](SDValue V) {
44622 if (V.isUndef())
44623 return DAG.getUNDEF(MVT::x86mmx);
44624 if (V.getValueType().isFloatingPoint()) {
44625 if (Subtarget.hasSSE1() && !isa<ConstantFPSDNode>(V)) {
44626 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, V);
44627 V = DAG.getBitcast(MVT::v2i64, V);
44628 return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, V);
44629 }
44630 V = DAG.getBitcast(MVT::i32, V);
44631 } else {
44632 V = DAG.getAnyExtOrTrunc(V, DL, MVT::i32);
44633 }
44634 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, V);
44635 };
44636
44637 // Convert build vector ops to MMX data in the bottom elements.
44638 SmallVector<SDValue, 8> Ops;
44639
44640 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44641
44642 // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
44643 if (Splat) {
44644 if (Splat.isUndef())
44645 return DAG.getUNDEF(MVT::x86mmx);
44646
44647 Splat = CreateMMXElement(Splat);
44648
44649 if (Subtarget.hasSSE1()) {
44650 // Unpack v8i8 to splat i8 elements to lowest 16-bits.
44651 if (NumElts == 8)
44652 Splat = DAG.getNode(
44653 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
44654 DAG.getTargetConstant(Intrinsic::x86_mmx_punpcklbw, DL,
44655 TLI.getPointerTy(DAG.getDataLayout())),
44656 Splat, Splat);
44657
44658 // Use PSHUFW to repeat 16-bit elements.
44659 unsigned ShufMask = (NumElts > 2 ? 0 : 0x44);
44660 return DAG.getNode(
44661 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
44662 DAG.getTargetConstant(Intrinsic::x86_sse_pshuf_w, DL,
44663 TLI.getPointerTy(DAG.getDataLayout())),
44664 Splat, DAG.getTargetConstant(ShufMask, DL, MVT::i8));
44665 }
44666 Ops.append(NumElts, Splat);
44667 } else {
44668 for (unsigned i = 0; i != NumElts; ++i)
44669 Ops.push_back(CreateMMXElement(BV->getOperand(i)));
44670 }
44671
44672 // Use tree of PUNPCKLs to build up general MMX vector.
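// For example, building from 8 byte elements: punpcklbw merges byte pairs,
// punpcklwd merges the resulting word pairs, and a final punpckldq combines
// the two dwords into the full 64-bit MMX value.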
44673 while (Ops.size() > 1) {
44674 unsigned NumOps = Ops.size();
44675 unsigned IntrinOp =
44676 (NumOps == 2 ? Intrinsic::x86_mmx_punpckldq
44677 : (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd
44678 : Intrinsic::x86_mmx_punpcklbw));
44679 SDValue Intrin = DAG.getTargetConstant(
44680 IntrinOp, DL, TLI.getPointerTy(DAG.getDataLayout()));
44681 for (unsigned i = 0; i != NumOps; i += 2)
44682 Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin,
44683 Ops[i], Ops[i + 1]);
44684 Ops.resize(NumOps / 2);
44685 }
44686
44687 return Ops[0];
44688}
44689
44690// Recursive function that attempts to find if a bool vector node was originally
44691// a vector/float/double that got truncated/extended/bitcast to/from a scalar
44692// integer. If so, replace the scalar ops with bool vector equivalents back down
44693// the chain.
44694static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL,
44695 SelectionDAG &DAG,
44696 const X86Subtarget &Subtarget) {
44697 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44698 unsigned Opc = V.getOpcode();
44699 switch (Opc) {
44700 case ISD::BITCAST: {
44701 // Bitcast from a vector/float/double, we can cheaply bitcast to VT.
44702 SDValue Src = V.getOperand(0);
44703 EVT SrcVT = Src.getValueType();
44704 if (SrcVT.isVector() || SrcVT.isFloatingPoint())
44705 return DAG.getBitcast(VT, Src);
44706 break;
44707 }
44708 case ISD::TRUNCATE: {
44709 // If we find a suitable source, a truncated scalar becomes a subvector.
44710 SDValue Src = V.getOperand(0);
44711 EVT NewSrcVT =
44712 EVT::getVectorVT(*DAG.getContext(), MVT::i1, Src.getValueSizeInBits());
44713 if (TLI.isTypeLegal(NewSrcVT))
44714 if (SDValue N0 =
44715 combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
44716 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N0,
44717 DAG.getIntPtrConstant(0, DL));
44718 break;
44719 }
44720 case ISD::ANY_EXTEND:
44721 case ISD::ZERO_EXTEND: {
44722 // If we find a suitable source, an extended scalar becomes a subvector.
44723 SDValue Src = V.getOperand(0);
44724 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
44725 Src.getScalarValueSizeInBits());
44726 if (TLI.isTypeLegal(NewSrcVT))
44727 if (SDValue N0 =
44728 combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
44729 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
44730 Opc == ISD::ANY_EXTEND ? DAG.getUNDEF(VT)
44731 : DAG.getConstant(0, DL, VT),
44732 N0, DAG.getIntPtrConstant(0, DL));
44733 break;
44734 }
44735 case ISD::OR: {
44736 // If we find suitable sources, we can just move an OR to the vector domain.
44737 SDValue Src0 = V.getOperand(0);
44738 SDValue Src1 = V.getOperand(1);
44739 if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
44740 if (SDValue N1 = combineBitcastToBoolVector(VT, Src1, DL, DAG, Subtarget))
44741 return DAG.getNode(Opc, DL, VT, N0, N1);
44742 break;
44743 }
44744 case ISD::SHL: {
44745 // If we find a suitable source, a SHL becomes a KSHIFTL.
44746 SDValue Src0 = V.getOperand(0);
44747 if ((VT == MVT::v8i1 && !Subtarget.hasDQI()) ||
44748 ((VT == MVT::v32i1 || VT == MVT::v64i1) && !Subtarget.hasBWI()))
44749 break;
44750
44751 if (auto *Amt = dyn_cast<ConstantSDNode>(V.getOperand(1)))
44752 if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
44753 return DAG.getNode(
44754 X86ISD::KSHIFTL, DL, VT, N0,
44755 DAG.getTargetConstant(Amt->getZExtValue(), DL, MVT::i8));
44756 break;
44757 }
44758 }
44759 return SDValue();
44760}
44761
44762static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
44763 TargetLowering::DAGCombinerInfo &DCI,
44764 const X86Subtarget &Subtarget) {
44765 SDValue N0 = N->getOperand(0);
44766 EVT VT = N->getValueType(0);
44767 EVT SrcVT = N0.getValueType();
44768 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44769
44770 // Try to match patterns such as
44771 // (i16 bitcast (v16i1 x))
44772 // ->
44773  // (i16 movmsk (v16i8 sext (v16i1 x)))
44774 // before the setcc result is scalarized on subtargets that don't have legal
44775 // vxi1 types.
44776 if (DCI.isBeforeLegalize()) {
44777 SDLoc dl(N);
44778 if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget))
44779 return V;
44780
44781 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
44782 // type, widen both sides to avoid a trip through memory.
44783 if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
44784 Subtarget.hasAVX512()) {
44785 N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
44786 N0 = DAG.getBitcast(MVT::v8i1, N0);
44787 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
44788 DAG.getIntPtrConstant(0, dl));
44789 }
44790
44791 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
44792 // type, widen both sides to avoid a trip through memory.
44793 if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
44794 Subtarget.hasAVX512()) {
44795 // Use zeros for the widening if we already have some zeroes. This can
44796      // allow SimplifyDemandedBits to remove scalar ANDs that may be
44797      // downstream of this.
44798 // FIXME: It might make sense to detect a concat_vectors with a mix of
44799 // zeroes and undef and turn it into insert_subvector for i1 vectors as
44800 // a separate combine. What we can't do is canonicalize the operands of
44801 // such a concat or we'll get into a loop with SimplifyDemandedBits.
44802 if (N0.getOpcode() == ISD::CONCAT_VECTORS) {
44803 SDValue LastOp = N0.getOperand(N0.getNumOperands() - 1);
44804 if (ISD::isBuildVectorAllZeros(LastOp.getNode())) {
44805 SrcVT = LastOp.getValueType();
44806 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
44807 SmallVector<SDValue, 4> Ops(N0->op_begin(), N0->op_end());
44808 Ops.resize(NumConcats, DAG.getConstant(0, dl, SrcVT));
44809 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
44810 N0 = DAG.getBitcast(MVT::i8, N0);
44811 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
44812 }
44813 }
44814
44815 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
44816 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
44817 Ops[0] = N0;
44818 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
44819 N0 = DAG.getBitcast(MVT::i8, N0);
44820 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
44821 }
44822 } else {
44823 // If we're bitcasting from iX to vXi1, see if the integer originally
44824 // began as a vXi1 and whether we can remove the bitcast entirely.
44825 if (VT.isVector() && VT.getScalarType() == MVT::i1 &&
44826 SrcVT.isScalarInteger() && TLI.isTypeLegal(VT)) {
44827 if (SDValue V =
44828 combineBitcastToBoolVector(VT, N0, SDLoc(N), DAG, Subtarget))
44829 return V;
44830 }
44831 }
44832
44833 // Look for (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) and
44834 // replace with (i8 (trunc (i16 (bitcast (v16i1 X))))). This can occur
44835 // due to insert_subvector legalization on KNL. By promoting the copy to i16
44836 // we can help with known bits propagation from the vXi1 domain to the
44837 // scalar domain.
44838 if (VT == MVT::i8 && SrcVT == MVT::v8i1 && Subtarget.hasAVX512() &&
44839 !Subtarget.hasDQI() && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
44840 N0.getOperand(0).getValueType() == MVT::v16i1 &&
44841 isNullConstant(N0.getOperand(1)))
44842 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT,
44843 DAG.getBitcast(MVT::i16, N0.getOperand(0)));
44844
44845 // Canonicalize (bitcast (vbroadcast_load)) so that the output of the bitcast
44846 // and the vbroadcast_load are both integer or both fp. In some cases this
44847 // will remove the bitcast entirely.
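  // For example, (v4f32 bitcast (v4i32 vbroadcast_load <i32 ptr>)) is rebuilt
  // below as (v4f32 vbroadcast_load <f32 ptr>), after which the outer bitcast
  // folds away.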
44848 if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() &&
44849 VT.isFloatingPoint() != SrcVT.isFloatingPoint() && VT.isVector()) {
44850 auto *BCast = cast<MemIntrinsicSDNode>(N0);
44851 unsigned SrcVTSize = SrcVT.getScalarSizeInBits();
44852 unsigned MemSize = BCast->getMemoryVT().getScalarSizeInBits();
44853    // Don't swap i8/i16 since we don't have fp types of that size.
44854 if (MemSize >= 32) {
44855 MVT MemVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(MemSize)
44856 : MVT::getIntegerVT(MemSize);
44857 MVT LoadVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(SrcVTSize)
44858 : MVT::getIntegerVT(SrcVTSize);
44859 LoadVT = MVT::getVectorVT(LoadVT, SrcVT.getVectorNumElements());
44860
44861 SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
44862 SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() };
44863 SDValue ResNode =
44864 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
44865 MemVT, BCast->getMemOperand());
44866 DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1));
44867 return DAG.getBitcast(VT, ResNode);
44868 }
44869 }
44870
44871 // Since MMX types are special and don't usually play with other vector types,
44872 // it's better to handle them early to be sure we emit efficient code by
44873 // avoiding store-load conversions.
44874 if (VT == MVT::x86mmx) {
44875 // Detect MMX constant vectors.
44876 APInt UndefElts;
44877 SmallVector<APInt, 1> EltBits;
44878 if (getTargetConstantBitsFromNode(N0, 64, UndefElts, EltBits)) {
44879 SDLoc DL(N0);
44880 // Handle zero-extension of i32 with MOVD.
44881 if (EltBits[0].countl_zero() >= 32)
44882 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT,
44883 DAG.getConstant(EltBits[0].trunc(32), DL, MVT::i32));
44884 // Else, bitcast to a double.
44885 // TODO - investigate supporting sext 32-bit immediates on x86_64.
44886 APFloat F64(APFloat::IEEEdouble(), EltBits[0]);
44887 return DAG.getBitcast(VT, DAG.getConstantFP(F64, DL, MVT::f64));
44888 }
44889
44890 // Detect bitcasts to x86mmx low word.
44891 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
44892 (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) &&
44893 N0.getOperand(0).getValueType() == SrcVT.getScalarType()) {
44894 bool LowUndef = true, AllUndefOrZero = true;
44895 for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) {
44896 SDValue Op = N0.getOperand(i);
44897 LowUndef &= Op.isUndef() || (i >= e/2);
44898 AllUndefOrZero &= (Op.isUndef() || isNullConstant(Op));
44899 }
44900 if (AllUndefOrZero) {
44901 SDValue N00 = N0.getOperand(0);
44902 SDLoc dl(N00);
44903 N00 = LowUndef ? DAG.getAnyExtOrTrunc(N00, dl, MVT::i32)
44904 : DAG.getZExtOrTrunc(N00, dl, MVT::i32);
44905 return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00);
44906 }
44907 }
44908
44909 // Detect bitcasts of 64-bit build vectors and convert to a
44910 // MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the
44911 // lowest element.
44912 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
44913 (SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
44914 SrcVT == MVT::v8i8))
44915 return createMMXBuildVector(cast<BuildVectorSDNode>(N0), DAG, Subtarget);
44916
44917 // Detect bitcasts between element or subvector extraction to x86mmx.
44918 if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
44919 N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
44920 isNullConstant(N0.getOperand(1))) {
44921 SDValue N00 = N0.getOperand(0);
44922 if (N00.getValueType().is128BitVector())
44923 return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
44924 DAG.getBitcast(MVT::v2i64, N00));
44925 }
44926
44927 // Detect bitcasts from FP_TO_SINT to x86mmx.
44928 if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) {
44929 SDLoc DL(N0);
44930 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
44931 DAG.getUNDEF(MVT::v2i32));
44932 return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
44933 DAG.getBitcast(MVT::v2i64, Res));
44934 }
44935 }
44936
44937 // Try to remove a bitcast of constant vXi1 vector. We have to legalize
44938 // most of these to scalar anyway.
44939 if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
44940 SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
44941 ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
44942 return combinevXi1ConstantToInteger(N0, DAG);
44943 }
44944
44945 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
44946 VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
44947 isa<ConstantSDNode>(N0)) {
44948 auto *C = cast<ConstantSDNode>(N0);
44949 if (C->isAllOnes())
44950 return DAG.getConstant(1, SDLoc(N0), VT);
44951 if (C->isZero())
44952 return DAG.getConstant(0, SDLoc(N0), VT);
44953 }
44954
44955 // Look for MOVMSK that is maybe truncated and then bitcasted to vXi1.
44956 // Turn it into a sign bit compare that produces a k-register. This avoids
44957 // a trip through a GPR.
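  // Worked sketch: (v4i1 bitcast (i4 trunc (i32 movmsk (v4f32 X)))) becomes
  // (v4i1 setcc (v4i32 bitcast X), zero, setlt), i.e. a sign-bit compare whose
  // result stays in the predicate domain.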
44958 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
44959 VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
44960 isPowerOf2_32(VT.getVectorNumElements())) {
44961 unsigned NumElts = VT.getVectorNumElements();
44962 SDValue Src = N0;
44963
44964 // Peek through truncate.
44965 if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
44966 Src = N0.getOperand(0);
44967
44968 if (Src.getOpcode() == X86ISD::MOVMSK && Src.hasOneUse()) {
44969 SDValue MovmskIn = Src.getOperand(0);
44970 MVT MovmskVT = MovmskIn.getSimpleValueType();
44971 unsigned MovMskElts = MovmskVT.getVectorNumElements();
44972
44973 // We allow extra bits of the movmsk to be used since they are known zero.
44974 // We can't convert a VPMOVMSKB without avx512bw.
44975 if (MovMskElts <= NumElts &&
44976 (Subtarget.hasBWI() || MovmskVT.getVectorElementType() != MVT::i8)) {
44977 EVT IntVT = EVT(MovmskVT).changeVectorElementTypeToInteger();
44978 MovmskIn = DAG.getBitcast(IntVT, MovmskIn);
44979 SDLoc dl(N);
44980 MVT CmpVT = MVT::getVectorVT(MVT::i1, MovMskElts);
44981 SDValue Cmp = DAG.getSetCC(dl, CmpVT, MovmskIn,
44982 DAG.getConstant(0, dl, IntVT), ISD::SETLT);
44983 if (EVT(CmpVT) == VT)
44984 return Cmp;
44985
44986 // Pad with zeroes up to original VT to replace the zeroes that were
44987 // being used from the MOVMSK.
44988 unsigned NumConcats = NumElts / MovMskElts;
44989 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, CmpVT));
44990 Ops[0] = Cmp;
44991 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Ops);
44992 }
44993 }
44994 }
44995
44996 // Try to remove bitcasts from input and output of mask arithmetic to
44997 // remove GPR<->K-register crossings.
44998 if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
44999 return V;
45000
45001 // Convert a bitcasted integer logic operation that has one bitcasted
45002 // floating-point operand into a floating-point logic operation. This may
45003 // create a load of a constant, but that is cheaper than materializing the
45004 // constant in an integer register and transferring it to an SSE register or
45005 // transferring the SSE operand to integer register and back.
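  // For instance, with SSE1 the scalar fabs idiom
  //   (f32 bitcast (i32 and (i32 bitcast X), 0x7fffffff))
  // becomes (FAND X, (f32 bitcast 0x7fffffff)) below, keeping the value in an
  // SSE register at the cost of a constant-pool load.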
45006 unsigned FPOpcode;
45007 switch (N0.getOpcode()) {
45008 case ISD::AND: FPOpcode = X86ISD::FAND; break;
45009 case ISD::OR: FPOpcode = X86ISD::FOR; break;
45010 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
45011 default: return SDValue();
45012 }
45013
45014 // Check if we have a bitcast from another integer type as well.
45015 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
45016 (Subtarget.hasSSE2() && VT == MVT::f64) ||
45017 (Subtarget.hasFP16() && VT == MVT::f16) ||
45018 (Subtarget.hasSSE2() && VT.isInteger() && VT.isVector() &&
45019 TLI.isTypeLegal(VT))))
45020 return SDValue();
45021
45022 SDValue LogicOp0 = N0.getOperand(0);
45023 SDValue LogicOp1 = N0.getOperand(1);
45024 SDLoc DL0(N0);
45025
45026 // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
45027 if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
45028 LogicOp0.hasOneUse() && LogicOp0.getOperand(0).hasOneUse() &&
45029 LogicOp0.getOperand(0).getValueType() == VT &&
45030 !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
45031 SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
45032 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
45033 return DAG.getNode(Opcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
45034 }
45035 // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
45036 if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
45037 LogicOp1.hasOneUse() && LogicOp1.getOperand(0).hasOneUse() &&
45038 LogicOp1.getOperand(0).getValueType() == VT &&
45039 !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
45040 SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
45041 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
45042 return DAG.getNode(Opcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
45043 }
45044
45045 return SDValue();
45046}
45047
45048 // (mul (zext a), (sext b))
45049static bool detectExtMul(SelectionDAG &DAG, const SDValue &Mul, SDValue &Op0,
45050 SDValue &Op1) {
45051 Op0 = Mul.getOperand(0);
45052 Op1 = Mul.getOperand(1);
45053
45054  // Operand 1 should be the sign-extended value; swap if it arrived in Op0.
45055 if (Op0.getOpcode() == ISD::SIGN_EXTEND)
45056 std::swap(Op0, Op1);
45057
45058 auto IsFreeTruncation = [](SDValue &Op) -> bool {
45059 if ((Op.getOpcode() == ISD::ZERO_EXTEND ||
45060 Op.getOpcode() == ISD::SIGN_EXTEND) &&
45061 Op.getOperand(0).getScalarValueSizeInBits() <= 8)
45062 return true;
45063
45064 auto *BV = dyn_cast<BuildVectorSDNode>(Op);
45065 return (BV && BV->isConstant());
45066 };
45067
45068  // (dpbusd (zext a), (sext b)). The first operand must be an unsigned value,
45069  // so check that Op0 is a zero-extended value; Op1 must be a signed value, so
45070  // we just check its significant sign bits.
45071 if ((IsFreeTruncation(Op0) &&
45072 DAG.computeKnownBits(Op0).countMaxActiveBits() <= 8) &&
45073 (IsFreeTruncation(Op1) && DAG.ComputeMaxSignificantBits(Op1) <= 8))
45074 return true;
45075
45076 return false;
45077}
45078
45079 // Given an ABS node, detect the following pattern:
45080// (ABS (SUB (ZERO_EXTEND a), (ZERO_EXTEND b))).
45081// This is useful as it is the input into a SAD pattern.
45082static bool detectZextAbsDiff(const SDValue &Abs, SDValue &Op0, SDValue &Op1) {
45083 SDValue AbsOp1 = Abs->getOperand(0);
45084 if (AbsOp1.getOpcode() != ISD::SUB)
45085 return false;
45086
45087 Op0 = AbsOp1.getOperand(0);
45088 Op1 = AbsOp1.getOperand(1);
45089
45090 // Check if the operands of the sub are zero-extended from vectors of i8.
45091 if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
45092 Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
45093 Op1.getOpcode() != ISD::ZERO_EXTEND ||
45094 Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
45095 return false;
45096
45097 return true;
45098}
45099
45100static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS,
45101 unsigned &LogBias, const SDLoc &DL,
45102 const X86Subtarget &Subtarget) {
45103 // Extend or truncate to MVT::i8 first.
45104 MVT Vi8VT =
45105 MVT::getVectorVT(MVT::i8, LHS.getValueType().getVectorElementCount());
45106 LHS = DAG.getZExtOrTrunc(LHS, DL, Vi8VT);
45107 RHS = DAG.getSExtOrTrunc(RHS, DL, Vi8VT);
45108
45109 // VPDPBUSD(<16 x i32>C, <16 x i8>A, <16 x i8>B). For each dst element
45110 // C[0] = C[0] + A[0]B[0] + A[1]B[1] + A[2]B[2] + A[3]B[3].
45111 // The src A, B element type is i8, but the dst C element type is i32.
45112  // The caller counts its reduction stages from the vXi8 source element count,
45113  // so a log-bias of 2 is needed to skip the 2 stages VPDPBUSD already folds.
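  // Sketch: for an assumed v16i8 x v16i8 dot product the caller would need
  // log2(16) = 4 shuffle+add stages over a plain vXi8 reduction, but VPDPBUSD
  // already sums each group of four byte products into one i32 lane, leaving
  // only 4 - 2 = 2 stages.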
45114 LogBias = 2;
45115
45116 unsigned RegSize = std::max(128u, (unsigned)Vi8VT.getSizeInBits());
45117 if (Subtarget.hasVNNI() && !Subtarget.hasVLX())
45118 RegSize = std::max(512u, RegSize);
45119
45120 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
45121 // fill in the missing vector elements with 0.
45122 unsigned NumConcat = RegSize / Vi8VT.getSizeInBits();
45123 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, Vi8VT));
45124 Ops[0] = LHS;
45125 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
45126 SDValue DpOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
45127 Ops[0] = RHS;
45128 SDValue DpOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
45129
45130 // Actually build the DotProduct, split as 256/512 bits for
45131 // AVXVNNI/AVX512VNNI.
45132 auto DpBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
45133 ArrayRef<SDValue> Ops) {
45134 MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
45135 return DAG.getNode(X86ISD::VPDPBUSD, DL, VT, Ops);
45136 };
45137 MVT DpVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
45138 SDValue Zero = DAG.getConstant(0, DL, DpVT);
45139
45140 return SplitOpsAndApply(DAG, Subtarget, DL, DpVT, {Zero, DpOp0, DpOp1},
45141 DpBuilder, false);
45142}
45143
45144// Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
45145// to these zexts.
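// Sketch for two assumed v8i8 inputs: each is placed in the low half of a
// zeroed v16i8, and PSADBW then yields a v2i64 whose element 0 holds
// sum(|a[i] - b[i]|) over those eight bytes (the zeroed upper half contributes
// nothing to element 1).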
45146static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
45147 const SDValue &Zext1, const SDLoc &DL,
45148 const X86Subtarget &Subtarget) {
45149 // Find the appropriate width for the PSADBW.
45150 EVT InVT = Zext0.getOperand(0).getValueType();
45151 unsigned RegSize = std::max(128u, (unsigned)InVT.getSizeInBits());
45152
45153 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
45154 // fill in the missing vector elements with 0.
45155 unsigned NumConcat = RegSize / InVT.getSizeInBits();
45156 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
45157 Ops[0] = Zext0.getOperand(0);
45158 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
45159 SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
45160 Ops[0] = Zext1.getOperand(0);
45161 SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
45162
45163 // Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
45164 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
45165 ArrayRef<SDValue> Ops) {
45166 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
45167 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops);
45168 };
45169 MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
45170 return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, { SadOp0, SadOp1 },
45171 PSADBWBuilder);
45172}
45173
45174 // Attempt to replace a min/max v8i16/v16i8 horizontal reduction with
45175// PHMINPOSUW.
45176static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG,
45177 const X86Subtarget &Subtarget) {
45178 // Bail without SSE41.
45179 if (!Subtarget.hasSSE41())
45180 return SDValue();
45181
45182 EVT ExtractVT = Extract->getValueType(0);
45183 if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
45184 return SDValue();
45185
45186 // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
45187 ISD::NodeType BinOp;
45188 SDValue Src = DAG.matchBinOpReduction(
45189 Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN}, true);
45190 if (!Src)
45191 return SDValue();
45192
45193 EVT SrcVT = Src.getValueType();
45194 EVT SrcSVT = SrcVT.getScalarType();
45195 if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
45196 return SDValue();
45197
45198 SDLoc DL(Extract);
45199 SDValue MinPos = Src;
45200
45201 // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
45202 while (SrcVT.getSizeInBits() > 128) {
45203 SDValue Lo, Hi;
45204 std::tie(Lo, Hi) = splitVector(MinPos, DAG, DL);
45205 SrcVT = Lo.getValueType();
45206 MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
45207 }
45208  assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
45209          (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
45210         "Unexpected value type");
45211
45212 // PHMINPOSUW applies to UMIN(v8i16), for SMIN/SMAX/UMAX we must apply a mask
45213 // to flip the value accordingly.
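  // Worked sketch for a UMAX reduction of v8i16 X: every element is XORed
  // with all-ones (i.e. complemented), PHMINPOSUW leaves umin(~X) in element
  // 0, and since umin(~x) == ~umax(x) the second XOR below recovers umax(X)
  // before the scalar extract.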
45214 SDValue Mask;
45215 unsigned MaskEltsBits = ExtractVT.getSizeInBits();
45216 if (BinOp == ISD::SMAX)
45217 Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
45218 else if (BinOp == ISD::SMIN)
45219 Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
45220 else if (BinOp == ISD::UMAX)
45221 Mask = DAG.getAllOnesConstant(DL, SrcVT);
45222
45223 if (Mask)
45224 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
45225
45226 // For v16i8 cases we need to perform UMIN on pairs of byte elements,
45227  // shuffling each upper element down and inserting zeros. This means that the
45228 // v16i8 UMIN will leave the upper element as zero, performing zero-extension
45229 // ready for the PHMINPOS.
45230 if (ExtractVT == MVT::i8) {
45231 SDValue Upper = DAG.getVectorShuffle(
45232 SrcVT, DL, MinPos, DAG.getConstant(0, DL, MVT::v16i8),
45233 {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
45234 MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
45235 }
45236
45237  // Perform the PHMINPOS on a v8i16 vector.
45238 MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
45239 MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
45240 MinPos = DAG.getBitcast(SrcVT, MinPos);
45241
45242 if (Mask)
45243 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
45244
45245 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
45246 DAG.getIntPtrConstant(0, DL));
45247}
45248
45249// Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK.
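// Worked sketch: an any_of over (v4i32 X < Y) reaches here as an OR reduction
// of all-ones/zero lanes; MOVMSKPS packs the four sign bits into a GPR and the
// reduction becomes (movmsk != 0), while an all_of would instead compare the
// movmsk against 0xF (all four low bits set).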
45250static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG,
45251 const X86Subtarget &Subtarget) {
45252 // Bail without SSE2.
45253 if (!Subtarget.hasSSE2())
45254 return SDValue();
45255
45256 EVT ExtractVT = Extract->getValueType(0);
45257 unsigned BitWidth = ExtractVT.getSizeInBits();
45258 if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
45259 ExtractVT != MVT::i8 && ExtractVT != MVT::i1)
45260 return SDValue();
45261
45262 // Check for OR(any_of)/AND(all_of)/XOR(parity) horizontal reduction patterns.
45263 ISD::NodeType BinOp;
45264 SDValue Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
45265 if (!Match && ExtractVT == MVT::i1)
45266 Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::XOR});
45267 if (!Match)
45268 return SDValue();
45269
45270 // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
45271 // which we can't support here for now.
45272 if (Match.getScalarValueSizeInBits() != BitWidth)
45273 return SDValue();
45274
45275 SDValue Movmsk;
45276 SDLoc DL(Extract);
45277 EVT MatchVT = Match.getValueType();
45278 unsigned NumElts = MatchVT.getVectorNumElements();
45279 unsigned MaxElts = Subtarget.hasInt256() ? 32 : 16;
45280 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45281 LLVMContext &Ctx = *DAG.getContext();
45282
45283 if (ExtractVT == MVT::i1) {
45284 // Special case for (pre-legalization) vXi1 reductions.
45285 if (NumElts > 64 || !isPowerOf2_32(NumElts))
45286 return SDValue();
45287 if (Match.getOpcode() == ISD::SETCC) {
45288 ISD::CondCode CC = cast<CondCodeSDNode>(Match.getOperand(2))->get();
45289 if ((BinOp == ISD::AND && CC == ISD::CondCode::SETEQ) ||
45290 (BinOp == ISD::OR && CC == ISD::CondCode::SETNE)) {
45291 // For all_of(setcc(x,y,eq)) - use (iX)x == (iX)y.
45292 // For any_of(setcc(x,y,ne)) - use (iX)x != (iX)y.
45293 X86::CondCode X86CC;
45294 SDValue LHS = DAG.getFreeze(Match.getOperand(0));
45295 SDValue RHS = DAG.getFreeze(Match.getOperand(1));
45296 APInt Mask = APInt::getAllOnes(LHS.getScalarValueSizeInBits());
45297 if (SDValue V = LowerVectorAllEqual(DL, LHS, RHS, CC, Mask, Subtarget,
45298 DAG, X86CC))
45299 return DAG.getNode(ISD::TRUNCATE, DL, ExtractVT,
45300 getSETCC(X86CC, V, DL, DAG));
45301 }
45302 }
45303 if (TLI.isTypeLegal(MatchVT)) {
45304 // If this is a legal AVX512 predicate type then we can just bitcast.
45305 EVT MovmskVT = EVT::getIntegerVT(Ctx, NumElts);
45306 Movmsk = DAG.getBitcast(MovmskVT, Match);
45307 } else {
45308 // Use combineBitcastvxi1 to create the MOVMSK.
45309 while (NumElts > MaxElts) {
45310 SDValue Lo, Hi;
45311 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
45312 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
45313 NumElts /= 2;
45314 }
45315 EVT MovmskVT = EVT::getIntegerVT(Ctx, NumElts);
45316 Movmsk = combineBitcastvxi1(DAG, MovmskVT, Match, DL, Subtarget);
45317 }
45318 if (!Movmsk)
45319 return SDValue();
45320 Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, NumElts > 32 ? MVT::i64 : MVT::i32);
45321 } else {
45322 // FIXME: Better handling of k-registers or 512-bit vectors?
45323 unsigned MatchSizeInBits = Match.getValueSizeInBits();
45324 if (!(MatchSizeInBits == 128 ||
45325 (MatchSizeInBits == 256 && Subtarget.hasAVX())))
45326 return SDValue();
45327
45328 // Make sure this isn't a vector of 1 element. The perf win from using
45329    // MOVMSK diminishes with fewer elements in the reduction, but it is
45330 // generally better to get the comparison over to the GPRs as soon as
45331 // possible to reduce the number of vector ops.
45332 if (Match.getValueType().getVectorNumElements() < 2)
45333 return SDValue();
45334
45335 // Check that we are extracting a reduction of all sign bits.
45336 if (DAG.ComputeNumSignBits(Match) != BitWidth)
45337 return SDValue();
45338
45339 if (MatchSizeInBits == 256 && BitWidth < 32 && !Subtarget.hasInt256()) {
45340 SDValue Lo, Hi;
45341 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
45342 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
45343 MatchSizeInBits = Match.getValueSizeInBits();
45344 }
45345
45346 // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
45347 MVT MaskSrcVT;
45348 if (64 == BitWidth || 32 == BitWidth)
45349 MaskSrcVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
45350 MatchSizeInBits / BitWidth);
45351 else
45352 MaskSrcVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
45353
45354 SDValue BitcastLogicOp = DAG.getBitcast(MaskSrcVT, Match);
45355 Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget);
45356 NumElts = MaskSrcVT.getVectorNumElements();
45357 }
45358  assert((NumElts <= 32 || NumElts == 64) &&
45359         "Not expecting more than 64 elements");
45360
45361 MVT CmpVT = NumElts == 64 ? MVT::i64 : MVT::i32;
45362 if (BinOp == ISD::XOR) {
45363 // parity -> (PARITY(MOVMSK X))
45364 SDValue Result = DAG.getNode(ISD::PARITY, DL, CmpVT, Movmsk);
45365 return DAG.getZExtOrTrunc(Result, DL, ExtractVT);
45366 }
45367
45368 SDValue CmpC;
45369 ISD::CondCode CondCode;
45370 if (BinOp == ISD::OR) {
45371 // any_of -> MOVMSK != 0
45372 CmpC = DAG.getConstant(0, DL, CmpVT);
45373 CondCode = ISD::CondCode::SETNE;
45374 } else {
45375 // all_of -> MOVMSK == ((1 << NumElts) - 1)
45376 CmpC = DAG.getConstant(APInt::getLowBitsSet(CmpVT.getSizeInBits(), NumElts),
45377 DL, CmpVT);
45378 CondCode = ISD::CondCode::SETEQ;
45379 }
45380
45381 // The setcc produces an i8 of 0/1, so extend that to the result width and
45382 // negate to get the final 0/-1 mask value.
45383 EVT SetccVT = TLI.getSetCCResultType(DAG.getDataLayout(), Ctx, CmpVT);
45384 SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode);
45385 SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT);
45386 SDValue Zero = DAG.getConstant(0, DL, ExtractVT);
45387 return DAG.getNode(ISD::SUB, DL, ExtractVT, Zero, Zext);
45388}
45389
45390static SDValue combineVPDPBUSDPattern(SDNode *Extract, SelectionDAG &DAG,
45391 const X86Subtarget &Subtarget) {
45392 if (!Subtarget.hasVNNI() && !Subtarget.hasAVXVNNI())
45393 return SDValue();
45394
45395 EVT ExtractVT = Extract->getValueType(0);
45396 // Verify the type we're extracting is i32, as the output element type of
45397 // vpdpbusd is i32.
45398 if (ExtractVT != MVT::i32)
45399 return SDValue();
45400
45401 EVT VT = Extract->getOperand(0).getValueType();
45402 if (!isPowerOf2_32(VT.getVectorNumElements()))
45403 return SDValue();
45404
45405 // Match shuffle + add pyramid.
45406 ISD::NodeType BinOp;
45407 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
45408
45409  // We can't combine to vpdpbusd for zext, because each of the 4 multiplies
45410  // done by vpdpbusd computes a signed 16-bit product that is sign-extended
45411  // before being added into the accumulator.
45412 // TODO:
45413 // We also need to verify that the multiply has at least 2x the number of bits
45414 // of the input. We shouldn't match
45415  // (sign_extend (mul (vXi9 (zext (vXi8 X))), (vXi9 (zext (vXi8 Y))))).
45416 // if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND))
45417 // Root = Root.getOperand(0);
45418
45419 // If there was a match, we want Root to be a mul.
45420 if (!Root || Root.getOpcode() != ISD::MUL)
45421 return SDValue();
45422
45423 // Check whether we have an extend and mul pattern
45424 SDValue LHS, RHS;
45425 if (!detectExtMul(DAG, Root, LHS, RHS))
45426 return SDValue();
45427
45428 // Create the dot product instruction.
45429 SDLoc DL(Extract);
45430 unsigned StageBias;
45431 SDValue DP = createVPDPBUSD(DAG, LHS, RHS, StageBias, DL, Subtarget);
45432
45433 // If the original vector was wider than 4 elements, sum over the results
45434 // in the DP vector.
45435 unsigned Stages = Log2_32(VT.getVectorNumElements());
45436 EVT DpVT = DP.getValueType();
45437
45438 if (Stages > StageBias) {
45439 unsigned DpElems = DpVT.getVectorNumElements();
45440
45441 for (unsigned i = Stages - StageBias; i > 0; --i) {
45442 SmallVector<int, 16> Mask(DpElems, -1);
45443 for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
45444 Mask[j] = MaskEnd + j;
45445
45446 SDValue Shuffle =
45447 DAG.getVectorShuffle(DpVT, DL, DP, DAG.getUNDEF(DpVT), Mask);
45448 DP = DAG.getNode(ISD::ADD, DL, DpVT, DP, Shuffle);
45449 }
45450 }
45451
45452  // Return the lowest ExtractVT.getSizeInBits() bits.
45453 EVT ResVT =
45454 EVT::getVectorVT(*DAG.getContext(), ExtractVT,
45455 DpVT.getSizeInBits() / ExtractVT.getSizeInBits());
45456 DP = DAG.getBitcast(ResVT, DP);
45457 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, DP,
45458 Extract->getOperand(1));
45459}
45460
45461static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
45462 const X86Subtarget &Subtarget) {
45463 // PSADBW is only supported on SSE2 and up.
45464 if (!Subtarget.hasSSE2())
45465 return SDValue();
45466
45467 EVT ExtractVT = Extract->getValueType(0);
45468 // Verify the type we're extracting is either i32 or i64.
45469 // FIXME: Could support other types, but this is what we have coverage for.
45470 if (ExtractVT != MVT::i32 && ExtractVT != MVT::i64)
45471 return SDValue();
45472
45473 EVT VT = Extract->getOperand(0).getValueType();
45474 if (!isPowerOf2_32(VT.getVectorNumElements()))
45475 return SDValue();
45476
45477 // Match shuffle + add pyramid.
45478 ISD::NodeType BinOp;
45479 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
45480
45481  // The operand is expected to be zero extended from i8
45482  // (verified in detectZextAbsDiff).
45483  // In order to convert to i64 and above, an additional any/zero/sign
45484  // extend is expected.
45485  // A zero extend from 32 bits has no mathematical effect on the result.
45486  // The sign extend also behaves like a zero extend here
45487  // (it extends a sign bit that is known to be zero),
45488  // so it is correct to skip the sign/zero extend instruction.
45489 if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
45490 Root.getOpcode() == ISD::ZERO_EXTEND ||
45491 Root.getOpcode() == ISD::ANY_EXTEND))
45492 Root = Root.getOperand(0);
45493
45494  // If there was a match, we want Root to be the ABS node at the root of an
45495  // abs-diff pattern.
45496 if (!Root || Root.getOpcode() != ISD::ABS)
45497 return SDValue();
45498
45499 // Check whether we have an abs-diff pattern feeding into the select.
45500 SDValue Zext0, Zext1;
45501 if (!detectZextAbsDiff(Root, Zext0, Zext1))
45502 return SDValue();
45503
45504 // Create the SAD instruction.
45505 SDLoc DL(Extract);
45506 SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL, Subtarget);
45507
45508 // If the original vector was wider than 8 elements, sum over the results
45509 // in the SAD vector.
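  // Sketch: for a reduction over |a - b| of two v16i8 inputs (zero-extended to
  // v16i32), Stages = log2(16) = 4, so the loop below runs once and a single
  // shuffle+add folds the high half of the v2i64 SAD result into element 0.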
45510 unsigned Stages = Log2_32(VT.getVectorNumElements());
45511 EVT SadVT = SAD.getValueType();
45512 if (Stages > 3) {
45513 unsigned SadElems = SadVT.getVectorNumElements();
45514
45515 for(unsigned i = Stages - 3; i > 0; --i) {
45516 SmallVector<int, 16> Mask(SadElems, -1);
45517 for(unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
45518 Mask[j] = MaskEnd + j;
45519
45520 SDValue Shuffle =
45521 DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
45522 SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
45523 }
45524 }
45525
45526 unsigned ExtractSizeInBits = ExtractVT.getSizeInBits();
45527 // Return the lowest ExtractSizeInBits bits.
45528 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), ExtractVT,
45529 SadVT.getSizeInBits() / ExtractSizeInBits);
45530 SAD = DAG.getBitcast(ResVT, SAD);
45531 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, SAD,
45532 Extract->getOperand(1));
45533}
45534
45535// Attempt to peek through a target shuffle and extract the scalar from the
45536// source.
45537static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
45538 TargetLowering::DAGCombinerInfo &DCI,
45539 const X86Subtarget &Subtarget) {
45540 if (DCI.isBeforeLegalizeOps())
45541 return SDValue();
45542
45543 SDLoc dl(N);
45544 SDValue Src = N->getOperand(0);
45545 SDValue Idx = N->getOperand(1);
45546
45547 EVT VT = N->getValueType(0);
45548 EVT SrcVT = Src.getValueType();
45549 EVT SrcSVT = SrcVT.getVectorElementType();
45550 unsigned SrcEltBits = SrcSVT.getSizeInBits();
45551 unsigned NumSrcElts = SrcVT.getVectorNumElements();
45552
45553 // Don't attempt this for boolean mask vectors or unknown extraction indices.
45554 if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
45555 return SDValue();
45556
45557 const APInt &IdxC = N->getConstantOperandAPInt(1);
45558 if (IdxC.uge(NumSrcElts))
45559 return SDValue();
45560
45561 SDValue SrcBC = peekThroughBitcasts(Src);
45562
45563 // Handle extract(bitcast(broadcast(scalar_value))).
45564 if (X86ISD::VBROADCAST == SrcBC.getOpcode()) {
45565 SDValue SrcOp = SrcBC.getOperand(0);
45566 EVT SrcOpVT = SrcOp.getValueType();
45567 if (SrcOpVT.isScalarInteger() && VT.isInteger() &&
45568 (SrcOpVT.getSizeInBits() % SrcEltBits) == 0) {
45569 unsigned Scale = SrcOpVT.getSizeInBits() / SrcEltBits;
45570 unsigned Offset = IdxC.urem(Scale) * SrcEltBits;
45571 // TODO support non-zero offsets.
45572 if (Offset == 0) {
45573 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, SrcVT.getScalarType());
45574 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, VT);
45575 return SrcOp;
45576 }
45577 }
45578 }
45579
45580 // If we're extracting a single element from a broadcast load and there are
45581 // no other users, just create a single load.
45582 if (SrcBC.getOpcode() == X86ISD::VBROADCAST_LOAD && SrcBC.hasOneUse()) {
45583 auto *MemIntr = cast<MemIntrinsicSDNode>(SrcBC);
45584 unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits();
45585 if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth &&
45586 VT.getSizeInBits() == SrcBCWidth && SrcEltBits == SrcBCWidth) {
45587 SDValue Load = DAG.getLoad(VT, dl, MemIntr->getChain(),
45588 MemIntr->getBasePtr(),
45589 MemIntr->getPointerInfo(),
45590 MemIntr->getOriginalAlign(),
45591 MemIntr->getMemOperand()->getFlags());
45592 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
45593 return Load;
45594 }
45595 }
45596
45597 // Handle extract(bitcast(scalar_to_vector(scalar_value))) for integers.
45598 // TODO: Move to DAGCombine?
45599 if (SrcBC.getOpcode() == ISD::SCALAR_TO_VECTOR && VT.isInteger() &&
45600 SrcBC.getValueType().isInteger() &&
45601 (SrcBC.getScalarValueSizeInBits() % SrcEltBits) == 0 &&
45602 SrcBC.getScalarValueSizeInBits() ==
45603 SrcBC.getOperand(0).getValueSizeInBits()) {
45604 unsigned Scale = SrcBC.getScalarValueSizeInBits() / SrcEltBits;
45605 if (IdxC.ult(Scale)) {
45606 unsigned Offset = IdxC.getZExtValue() * SrcVT.getScalarSizeInBits();
45607 SDValue Scl = SrcBC.getOperand(0);
45608 EVT SclVT = Scl.getValueType();
45609 if (Offset) {
45610 Scl = DAG.getNode(ISD::SRL, dl, SclVT, Scl,
45611 DAG.getShiftAmountConstant(Offset, SclVT, dl));
45612 }
45613 Scl = DAG.getZExtOrTrunc(Scl, dl, SrcVT.getScalarType());
45614 Scl = DAG.getZExtOrTrunc(Scl, dl, VT);
45615 return Scl;
45616 }
45617 }
45618
45619 // Handle extract(truncate(x)) for 0'th index.
45620 // TODO: Treat this as a faux shuffle?
45621 // TODO: When can we use this for general indices?
45622 if (ISD::TRUNCATE == Src.getOpcode() && IdxC == 0 &&
45623 (SrcVT.getSizeInBits() % 128) == 0) {
45624 Src = extract128BitVector(Src.getOperand(0), 0, DAG, dl);
45625 MVT ExtractVT = MVT::getVectorVT(SrcSVT.getSimpleVT(), 128 / SrcEltBits);
45626 return DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(ExtractVT, Src),
45627 Idx);
45628 }
45629
45630 // We can only legally extract other elements from 128-bit vectors and in
45631 // certain circumstances, depending on SSE-level.
45632 // TODO: Investigate float/double extraction if it will be just stored.
45633 auto GetLegalExtract = [&Subtarget, &DAG, &dl](SDValue Vec, EVT VecVT,
45634 unsigned Idx) {
45635 EVT VecSVT = VecVT.getScalarType();
45636 if ((VecVT.is256BitVector() || VecVT.is512BitVector()) &&
45637 (VecSVT == MVT::i8 || VecSVT == MVT::i16 || VecSVT == MVT::i32 ||
45638 VecSVT == MVT::i64)) {
45639 unsigned EltSizeInBits = VecSVT.getSizeInBits();
45640 unsigned NumEltsPerLane = 128 / EltSizeInBits;
45641 unsigned LaneOffset = (Idx & ~(NumEltsPerLane - 1)) * EltSizeInBits;
45642 unsigned LaneIdx = LaneOffset / Vec.getScalarValueSizeInBits();
45643 VecVT = EVT::getVectorVT(*DAG.getContext(), VecSVT, NumEltsPerLane);
45644 Vec = extract128BitVector(Vec, LaneIdx, DAG, dl);
45645 Idx &= (NumEltsPerLane - 1);
45646 }
45647 if ((VecVT == MVT::v4i32 || VecVT == MVT::v2i64) &&
45648 ((Idx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
45649 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VecVT.getScalarType(),
45650 DAG.getBitcast(VecVT, Vec),
45651 DAG.getIntPtrConstant(Idx, dl));
45652 }
45653 if ((VecVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
45654 (VecVT == MVT::v16i8 && Subtarget.hasSSE41())) {
45655 unsigned OpCode = (VecVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
45656 return DAG.getNode(OpCode, dl, MVT::i32, DAG.getBitcast(VecVT, Vec),
45657 DAG.getTargetConstant(Idx, dl, MVT::i8));
45658 }
45659 return SDValue();
45660 };
45661
45662 // Resolve the target shuffle inputs and mask.
45663 SmallVector<int, 16> Mask;
45664 SmallVector<SDValue, 2> Ops;
45665 if (!getTargetShuffleInputs(SrcBC, Ops, Mask, DAG))
45666 return SDValue();
45667
45668 // Shuffle inputs must be the same size as the result.
45669 if (llvm::any_of(Ops, [SrcVT](SDValue Op) {
45670 return SrcVT.getSizeInBits() != Op.getValueSizeInBits();
45671 }))
45672 return SDValue();
45673
45674 // Attempt to narrow/widen the shuffle mask to the correct size.
45675 if (Mask.size() != NumSrcElts) {
45676 if ((NumSrcElts % Mask.size()) == 0) {
45677 SmallVector<int, 16> ScaledMask;
45678 int Scale = NumSrcElts / Mask.size();
45679 narrowShuffleMaskElts(Scale, Mask, ScaledMask);
45680 Mask = std::move(ScaledMask);
45681 } else if ((Mask.size() % NumSrcElts) == 0) {
45682 // Simplify Mask based on demanded element.
45683 int ExtractIdx = (int)IdxC.getZExtValue();
45684 int Scale = Mask.size() / NumSrcElts;
45685 int Lo = Scale * ExtractIdx;
45686 int Hi = Scale * (ExtractIdx + 1);
45687 for (int i = 0, e = (int)Mask.size(); i != e; ++i)
45688 if (i < Lo || Hi <= i)
45689 Mask[i] = SM_SentinelUndef;
45690
45691 SmallVector<int, 16> WidenedMask;
45692 while (Mask.size() > NumSrcElts &&
45693 canWidenShuffleElements(Mask, WidenedMask))
45694 Mask = std::move(WidenedMask);
45695 }
45696 }
45697
45698 // If narrowing/widening failed, see if we can extract+zero-extend.
45699 int ExtractIdx;
45700 EVT ExtractVT;
45701 if (Mask.size() == NumSrcElts) {
45702 ExtractIdx = Mask[IdxC.getZExtValue()];
45703 ExtractVT = SrcVT;
45704 } else {
45705 unsigned Scale = Mask.size() / NumSrcElts;
45706 if ((Mask.size() % NumSrcElts) != 0 || SrcVT.isFloatingPoint())
45707 return SDValue();
45708 unsigned ScaledIdx = Scale * IdxC.getZExtValue();
45709 if (!isUndefOrZeroInRange(Mask, ScaledIdx + 1, Scale - 1))
45710 return SDValue();
45711 ExtractIdx = Mask[ScaledIdx];
45712 EVT ExtractSVT = EVT::getIntegerVT(*DAG.getContext(), SrcEltBits / Scale);
45713 ExtractVT = EVT::getVectorVT(*DAG.getContext(), ExtractSVT, Mask.size());
45714    assert(SrcVT.getSizeInBits() == ExtractVT.getSizeInBits() &&
45715           "Failed to widen vector type");
45716 }
45717
45718 // If the shuffle source element is undef/zero then we can just accept it.
45719 if (ExtractIdx == SM_SentinelUndef)
45720 return DAG.getUNDEF(VT);
45721
45722 if (ExtractIdx == SM_SentinelZero)
45723 return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
45724 : DAG.getConstant(0, dl, VT);
45725
45726 SDValue SrcOp = Ops[ExtractIdx / Mask.size()];
45727 ExtractIdx = ExtractIdx % Mask.size();
45728 if (SDValue V = GetLegalExtract(SrcOp, ExtractVT, ExtractIdx))
45729 return DAG.getZExtOrTrunc(V, dl, VT);
45730
45731 return SDValue();
45732}
45733
45734/// Extracting a scalar FP value from vector element 0 is free, so extract each
45735/// operand first, then perform the math as a scalar op.
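/// For example, (extractelt (fadd v4f32 X, Y), 0) becomes
/// (fadd (extractelt X, 0), (extractelt Y, 0)), trading the vector add for a
/// single scalar ADDSS since the element-0 extracts are free.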
45736static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG,
45737 const X86Subtarget &Subtarget) {
45738  assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract");
45739 SDValue Vec = ExtElt->getOperand(0);
45740 SDValue Index = ExtElt->getOperand(1);
45741 EVT VT = ExtElt->getValueType(0);
45742 EVT VecVT = Vec.getValueType();
45743
45744 // TODO: If this is a unary/expensive/expand op, allow extraction from a
45745 // non-zero element because the shuffle+scalar op will be cheaper?
45746 if (!Vec.hasOneUse() || !isNullConstant(Index) || VecVT.getScalarType() != VT)
45747 return SDValue();
45748
45749 // Vector FP compares don't fit the pattern of FP math ops (propagate, not
45750 // extract, the condition code), so deal with those as a special-case.
45751 if (Vec.getOpcode() == ISD::SETCC && VT == MVT::i1) {
45752 EVT OpVT = Vec.getOperand(0).getValueType().getScalarType();
45753 if (OpVT != MVT::f32 && OpVT != MVT::f64)
45754 return SDValue();
45755
45756 // extract (setcc X, Y, CC), 0 --> setcc (extract X, 0), (extract Y, 0), CC
45757 SDLoc DL(ExtElt);
45758 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
45759 Vec.getOperand(0), Index);
45760 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
45761 Vec.getOperand(1), Index);
45762 return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1, Vec.getOperand(2));
45763 }
45764
45765 if (!(VT == MVT::f16 && Subtarget.hasFP16()) && VT != MVT::f32 &&
45766 VT != MVT::f64)
45767 return SDValue();
45768
45769 // Vector FP selects don't fit the pattern of FP math ops (because the
45770 // condition has a different type and we have to change the opcode), so deal
45771 // with those here.
45772 // FIXME: This is restricted to pre type legalization by ensuring the setcc
45773 // has i1 elements. If we loosen this we need to convert vector bool to a
45774 // scalar bool.
45775 if (Vec.getOpcode() == ISD::VSELECT &&
45776 Vec.getOperand(0).getOpcode() == ISD::SETCC &&
45777 Vec.getOperand(0).getValueType().getScalarType() == MVT::i1 &&
45778 Vec.getOperand(0).getOperand(0).getValueType() == VecVT) {
45779 // ext (sel Cond, X, Y), 0 --> sel (ext Cond, 0), (ext X, 0), (ext Y, 0)
45780 SDLoc DL(ExtElt);
45781 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
45782 Vec.getOperand(0).getValueType().getScalarType(),
45783 Vec.getOperand(0), Index);
45784 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
45785 Vec.getOperand(1), Index);
45786 SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
45787 Vec.getOperand(2), Index);
45788 return DAG.getNode(ISD::SELECT, DL, VT, Ext0, Ext1, Ext2);
45789 }
45790
45791 // TODO: This switch could include FNEG and the x86-specific FP logic ops
45792 // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid
45793 // missed load folding and fma+fneg combining.
45794 switch (Vec.getOpcode()) {
45795 case ISD::FMA: // Begin 3 operands
45796 case ISD::FMAD:
45797 case ISD::FADD: // Begin 2 operands
45798 case ISD::FSUB:
45799 case ISD::FMUL:
45800 case ISD::FDIV:
45801 case ISD::FREM:
45802 case ISD::FCOPYSIGN:
45803 case ISD::FMINNUM:
45804 case ISD::FMAXNUM:
45805 case ISD::FMINNUM_IEEE:
45806 case ISD::FMAXNUM_IEEE:
45807 case ISD::FMAXIMUM:
45808 case ISD::FMINIMUM:
45809 case X86ISD::FMAX:
45810 case X86ISD::FMIN:
45811 case ISD::FABS: // Begin 1 operand
45812 case ISD::FSQRT:
45813 case ISD::FRINT:
45814 case ISD::FCEIL:
45815 case ISD::FTRUNC:
45816 case ISD::FNEARBYINT:
45817 case ISD::FROUND:
45818 case ISD::FFLOOR:
45819 case X86ISD::FRCP:
45820 case X86ISD::FRSQRT: {
45821 // extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ...
45822 SDLoc DL(ExtElt);
45823 SmallVector<SDValue, 4> ExtOps;
45824 for (SDValue Op : Vec->ops())
45825 ExtOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op, Index));
45826 return DAG.getNode(Vec.getOpcode(), DL, VT, ExtOps);
45827 }
45828 default:
45829 return SDValue();
45830 }
45831  llvm_unreachable("All opcodes should return within switch");
45832}
45833
45834/// Try to convert a vector reduction sequence composed of binops and shuffles
45835/// into horizontal ops.
45836static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
45837 const X86Subtarget &Subtarget) {
45838  assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
45839
45840  // We need at least SSE2 to do anything here.
45841 if (!Subtarget.hasSSE2())
45842 return SDValue();
45843
45844 ISD::NodeType Opc;
45845 SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc,
45846 {ISD::ADD, ISD::MUL, ISD::FADD}, true);
45847 if (!Rdx)
45848 return SDValue();
45849
45850 SDValue Index = ExtElt->getOperand(1);
45851  assert(isNullConstant(Index) &&
45852         "Reduction doesn't end in an extract from index 0");
45853
45854 EVT VT = ExtElt->getValueType(0);
45855 EVT VecVT = Rdx.getValueType();
45856 if (VecVT.getScalarType() != VT)
45857 return SDValue();
45858
45859 SDLoc DL(ExtElt);
45860 unsigned NumElts = VecVT.getVectorNumElements();
45861 unsigned EltSizeInBits = VecVT.getScalarSizeInBits();
45862
45863 // Extend v4i8/v8i8 vector to v16i8, with undef upper 64-bits.
45864 auto WidenToV16I8 = [&](SDValue V, bool ZeroExtend) {
45865 if (V.getValueType() == MVT::v4i8) {
45866 if (ZeroExtend && Subtarget.hasSSE41()) {
45867 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
45868 DAG.getConstant(0, DL, MVT::v4i32),
45869 DAG.getBitcast(MVT::i32, V),
45870 DAG.getIntPtrConstant(0, DL));
45871 return DAG.getBitcast(MVT::v16i8, V);
45872 }
45873 V = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, V,
45874 ZeroExtend ? DAG.getConstant(0, DL, MVT::v4i8)
45875 : DAG.getUNDEF(MVT::v4i8));
45876 }
45877 return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V,
45878 DAG.getUNDEF(MVT::v8i8));
45879 };
45880
45881 // vXi8 mul reduction - promote to vXi16 mul reduction.
45882 if (Opc == ISD::MUL) {
45883 if (VT != MVT::i8 || NumElts < 4 || !isPowerOf2_32(NumElts))
45884 return SDValue();
45885 if (VecVT.getSizeInBits() >= 128) {
45886 EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts / 2);
45887 SDValue Lo = getUnpackl(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
45888 SDValue Hi = getUnpackh(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
45889 Lo = DAG.getBitcast(WideVT, Lo);
45890 Hi = DAG.getBitcast(WideVT, Hi);
45891 Rdx = DAG.getNode(Opc, DL, WideVT, Lo, Hi);
45892 while (Rdx.getValueSizeInBits() > 128) {
45893 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
45894 Rdx = DAG.getNode(Opc, DL, Lo.getValueType(), Lo, Hi);
45895 }
45896 } else {
45897 Rdx = WidenToV16I8(Rdx, false);
45898 Rdx = getUnpackl(DAG, DL, MVT::v16i8, Rdx, DAG.getUNDEF(MVT::v16i8));
45899 Rdx = DAG.getBitcast(MVT::v8i16, Rdx);
45900 }
45901 if (NumElts >= 8)
45902 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
45903 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
45904 {4, 5, 6, 7, -1, -1, -1, -1}));
45905 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
45906 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
45907 {2, 3, -1, -1, -1, -1, -1, -1}));
45908 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
45909 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
45910 {1, -1, -1, -1, -1, -1, -1, -1}));
45911 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
45912 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
45913 }
45914
45915  // vXi8 add reduction - sub-128-bit vector.
45916 if (VecVT == MVT::v4i8 || VecVT == MVT::v8i8) {
45917 Rdx = WidenToV16I8(Rdx, true);
45918 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
45919 DAG.getConstant(0, DL, MVT::v16i8));
45920 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
45921 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
45922 }
45923
45924 // Must be a >=128-bit vector with pow2 elements.
45925 if ((VecVT.getSizeInBits() % 128) != 0 || !isPowerOf2_32(NumElts))
45926 return SDValue();
45927
45928 // vXi8 add reduction - sum lo/hi halves then use PSADBW.
45929 if (VT == MVT::i8) {
45930 while (Rdx.getValueSizeInBits() > 128) {
45931 SDValue Lo, Hi;
45932 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
45933 VecVT = Lo.getValueType();
45934 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
45935 }
45936    assert(VecVT == MVT::v16i8 && "v16i8 reduction expected");
45937
45938 SDValue Hi = DAG.getVectorShuffle(
45939 MVT::v16i8, DL, Rdx, Rdx,
45940 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
45941 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v16i8, Rdx, Hi);
45942 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
45943 getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
45944 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
45945 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
45946 }
45947
45948 // See if we can use vXi8 PSADBW add reduction for larger zext types.
45949 // If the source vector values are 0-255, then we can use PSADBW to
45950 // sum+zext v8i8 subvectors to vXi64, then perform the reduction.
45951  // TODO: See if it's worth avoiding vXi16/i32 truncations?
45952 if (Opc == ISD::ADD && NumElts >= 4 && EltSizeInBits >= 16 &&
45953 DAG.computeKnownBits(Rdx).getMaxValue().ule(255) &&
45954 (EltSizeInBits == 16 || Rdx.getOpcode() == ISD::ZERO_EXTEND ||
45955 Subtarget.hasAVX512())) {
45956 EVT ByteVT = VecVT.changeVectorElementType(MVT::i8);
45957 Rdx = DAG.getNode(ISD::TRUNCATE, DL, ByteVT, Rdx);
45958 if (ByteVT.getSizeInBits() < 128)
45959 Rdx = WidenToV16I8(Rdx, true);
45960
45961 // Build the PSADBW, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
45962 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
45963 ArrayRef<SDValue> Ops) {
45964 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
45965 SDValue Zero = DAG.getConstant(0, DL, Ops[0].getValueType());
45966 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops[0], Zero);
45967 };
45968 MVT SadVT = MVT::getVectorVT(MVT::i64, Rdx.getValueSizeInBits() / 64);
45969 Rdx = SplitOpsAndApply(DAG, Subtarget, DL, SadVT, {Rdx}, PSADBWBuilder);
45970
45971 // TODO: We could truncate to vXi16/vXi32 before performing the reduction.
45972 while (Rdx.getValueSizeInBits() > 128) {
45973 SDValue Lo, Hi;
45974 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
45975 VecVT = Lo.getValueType();
45976 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
45977 }
45978    assert(Rdx.getValueType() == MVT::v2i64 && "v2i64 reduction expected");
45979
45980 if (NumElts > 8) {
45981 SDValue RdxHi = DAG.getVectorShuffle(MVT::v2i64, DL, Rdx, Rdx, {1, -1});
45982 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v2i64, Rdx, RdxHi);
45983 }
45984
45985 VecVT = MVT::getVectorVT(VT.getSimpleVT(), 128 / VT.getSizeInBits());
45986 Rdx = DAG.getBitcast(VecVT, Rdx);
45987 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
45988 }
45989
45990 // Only use (F)HADD opcodes if they aren't microcoded or we are minimizing codesize.
45991 if (!shouldUseHorizontalOp(true, DAG, Subtarget))
45992 return SDValue();
45993
45994 unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;
45995
45996 // 256-bit horizontal instructions operate on 128-bit chunks rather than
45997 // across the whole vector, so we need an extract + hop preliminary stage.
45998 // This is the only step where the operands of the hop are not the same value.
45999 // TODO: We could extend this to handle 512-bit or even longer vectors.
46000 if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) ||
46001 ((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) {
46002 unsigned NumElts = VecVT.getVectorNumElements();
46003 SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL);
46004 SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);
46005 Rdx = DAG.getNode(HorizOpcode, DL, Lo.getValueType(), Hi, Lo);
46006 VecVT = Rdx.getValueType();
46007 }
46008 if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&
46009 !((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3()))
46010 return SDValue();
46011
46012 // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
46013 unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());
46014 for (unsigned i = 0; i != ReductionSteps; ++i)
46015 Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);
46016
46017 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
46018}
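The PSADBW trick used above is easiest to see with a scalar model. The sketch below is plain C++ for illustration only (the helper name psadbwAgainstZero is invented, not LLVM code): psadbw against an all-zero operand sums |x[i] - 0| = x[i] over each group of eight bytes into a 64-bit lane, so adding the two halves of a v16i8 first and then reading the low byte of the low lane yields the i8 (mod-256) horizontal sum.

// Illustrative scalar model only; PSADBW sums absolute differences per
// 8-byte group into a 64-bit lane, and |x - 0| == x for unsigned bytes.
#include <array>
#include <cstdint>
#include <cstdio>

static std::array<uint64_t, 2> psadbwAgainstZero(const std::array<uint8_t, 16> &V) {
  std::array<uint64_t, 2> Lanes{0, 0};
  for (int I = 0; I != 16; ++I)
    Lanes[I / 8] += V[I];
  return Lanes;
}

int main() {
  std::array<uint8_t, 16> Bytes{};
  for (int I = 0; I != 16; ++I)
    Bytes[I] = static_cast<uint8_t>(I + 1);

  // Mirror the v16i8 path above: add the high 8 bytes onto the low 8 bytes,
  // run PSADBW against zero, then read the low byte of the low lane.
  std::array<uint8_t, 16> Sum = Bytes;
  for (int I = 0; I != 8; ++I)
    Sum[I] = static_cast<uint8_t>(Sum[I] + Sum[I + 8]);
  uint8_t Reduced = static_cast<uint8_t>(psadbwAgainstZero(Sum)[0]);

  // Plain scalar reference, also truncated to i8.
  uint8_t Expected = 0;
  for (uint8_t B : Bytes)
    Expected = static_cast<uint8_t>(Expected + B);

  std::printf("psadbw reduction = %u, reference = %u\n", Reduced, Expected);
  return Reduced == Expected ? 0 : 1;
}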
46019
46020/// Detect vector gather/scatter index generation and convert it from being a
46021/// bunch of shuffles and extracts into a somewhat faster sequence.
46022/// For i686, the best sequence is apparently storing the value and loading
46023/// scalars back, while for x64 we should use 64-bit extracts and shifts.
46024static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
46025 TargetLowering::DAGCombinerInfo &DCI,
46026 const X86Subtarget &Subtarget) {
46027 if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
46028 return NewOp;
46029
46030 SDValue InputVector = N->getOperand(0);
46031 SDValue EltIdx = N->getOperand(1);
46032 auto *CIdx = dyn_cast<ConstantSDNode>(EltIdx);
46033
46034 EVT SrcVT = InputVector.getValueType();
46035 EVT VT = N->getValueType(0);
46036 SDLoc dl(InputVector);
46037 bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT;
46038 unsigned NumSrcElts = SrcVT.getVectorNumElements();
46039 unsigned NumEltBits = VT.getScalarSizeInBits();
46040 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46041
46042 if (CIdx && CIdx->getAPIntValue().uge(NumSrcElts))
46043 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
46044
46045 // Integer Constant Folding.
46046 if (CIdx && VT.isInteger()) {
46047 APInt UndefVecElts;
46048 SmallVector<APInt, 16> EltBits;
46049 unsigned VecEltBitWidth = SrcVT.getScalarSizeInBits();
46050 if (getTargetConstantBitsFromNode(InputVector, VecEltBitWidth, UndefVecElts,
46051 EltBits, true, false)) {
46052 uint64_t Idx = CIdx->getZExtValue();
46053 if (UndefVecElts[Idx])
46054 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
46055 return DAG.getConstant(EltBits[Idx].zext(NumEltBits), dl, VT);
46056 }
46057
46058 // Convert extract_element(bitcast(<X x i1>)) -> bitcast(extract_subvector()).
46059 // Improves lowering of bool masks in Rust, which splits them into a byte array.
46060 if (InputVector.getOpcode() == ISD::BITCAST && (NumEltBits % 8) == 0) {
46061 SDValue Src = peekThroughBitcasts(InputVector);
46062 if (Src.getValueType().getScalarType() == MVT::i1 &&
46063 TLI.isTypeLegal(Src.getValueType())) {
46064 MVT SubVT = MVT::getVectorVT(MVT::i1, NumEltBits);
46065 SDValue Sub = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, Src,
46066 DAG.getIntPtrConstant(CIdx->getZExtValue() * NumEltBits, dl));
46067 return DAG.getBitcast(VT, Sub);
46068 }
46069 }
46070 }
46071
46072 if (IsPextr) {
46073 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumEltBits),
46074 DCI))
46075 return SDValue(N, 0);
46076
46077 // PEXTR*(PINSR*(v, s, c), c) -> s (with implicit zext handling).
46078 if ((InputVector.getOpcode() == X86ISD::PINSRB ||
46079 InputVector.getOpcode() == X86ISD::PINSRW) &&
46080 InputVector.getOperand(2) == EltIdx) {
46081 assert(SrcVT == InputVector.getOperand(0).getValueType() &&
46082 "Vector type mismatch");
46083 SDValue Scl = InputVector.getOperand(1);
46084 Scl = DAG.getNode(ISD::TRUNCATE, dl, SrcVT.getScalarType(), Scl);
46085 return DAG.getZExtOrTrunc(Scl, dl, VT);
46086 }
46087
46088 // TODO - Remove this once we can handle the implicit zero-extension of
46089 // X86ISD::PEXTRW/X86ISD::PEXTRB in combinePredicateReduction and
46090 // combineBasicSADPattern.
46091 return SDValue();
46092 }
46093
46094 // Detect mmx extraction of all bits as an i64. It works better as a bitcast.
46095 if (VT == MVT::i64 && SrcVT == MVT::v1i64 &&
46096 InputVector.getOpcode() == ISD::BITCAST &&
46097 InputVector.getOperand(0).getValueType() == MVT::x86mmx &&
46098 isNullConstant(EltIdx) && InputVector.hasOneUse())
46099 return DAG.getBitcast(VT, InputVector);
46100
46101 // Detect mmx to i32 conversion through a v2i32 elt extract.
46102 if (VT == MVT::i32 && SrcVT == MVT::v2i32 &&
46103 InputVector.getOpcode() == ISD::BITCAST &&
46104 InputVector.getOperand(0).getValueType() == MVT::x86mmx &&
46105 isNullConstant(EltIdx) && InputVector.hasOneUse())
46106 return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32,
46107 InputVector.getOperand(0));
46108
46109 // Check whether this extract is the root of a sum of absolute differences
46110 // pattern. This has to be done here because we really want it to happen
46111 // pre-legalization.
46112 if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
46113 return SAD;
46114
46115 if (SDValue VPDPBUSD = combineVPDPBUSDPattern(N, DAG, Subtarget))
46116 return VPDPBUSD;
46117
46118 // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
46119 if (SDValue Cmp = combinePredicateReduction(N, DAG, Subtarget))
46120 return Cmp;
46121
46122 // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
46123 if (SDValue MinMax = combineMinMaxReduction(N, DAG, Subtarget))
46124 return MinMax;
46125
46126 // Attempt to optimize ADD/FADD/MUL reductions with HADD, promotion etc..
46127 if (SDValue V = combineArithReduction(N, DAG, Subtarget))
46128 return V;
46129
46130 if (SDValue V = scalarizeExtEltFP(N, DAG, Subtarget))
46131 return V;
46132
46133 // Attempt to extract an i1 element by using MOVMSK to extract the signbits
46134 // and then testing the relevant element.
46135 //
46136 // Note that we only combine extracts on the *same* result number, i.e.
46137 // t0 = merge_values a0, a1, a2, a3
46138 // i1 = extract_vector_elt t0, Constant:i64<2>
46139 // i1 = extract_vector_elt t0, Constant:i64<3>
46140 // but not
46141 // i1 = extract_vector_elt t0:1, Constant:i64<2>
46142 // since the latter would need its own MOVMSK.
46143 if (SrcVT.getScalarType() == MVT::i1) {
46144 bool IsVar = !CIdx;
46145 SmallVector<SDNode *, 16> BoolExtracts;
46146 unsigned ResNo = InputVector.getResNo();
46147 auto IsBoolExtract = [&BoolExtracts, &ResNo, &IsVar](SDNode *Use) {
46148 if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
46149 Use->getOperand(0).getResNo() == ResNo &&
46150 Use->getValueType(0) == MVT::i1) {
46151 BoolExtracts.push_back(Use);
46152 IsVar |= !isa<ConstantSDNode>(Use->getOperand(1));
46153 return true;
46154 }
46155 return false;
46156 };
46157 // TODO: Can we drop the oneuse check for constant extracts?
46158 if (all_of(InputVector->uses(), IsBoolExtract) &&
46159 (IsVar || BoolExtracts.size() > 1)) {
46160 EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts);
46161 if (SDValue BC =
46162 combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) {
46163 for (SDNode *Use : BoolExtracts) {
46164 // extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask
46165 // Mask = 1 << MaskIdx
46166 SDValue MaskIdx = DAG.getZExtOrTrunc(Use->getOperand(1), dl, MVT::i8);
46167 SDValue MaskBit = DAG.getConstant(1, dl, BCVT);
46168 SDValue Mask = DAG.getNode(ISD::SHL, dl, BCVT, MaskBit, MaskIdx);
46169 SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask);
46170 Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ);
46171 DCI.CombineTo(Use, Res);
46172 }
46173 return SDValue(N, 0);
46174 }
46175 }
46176 }
46177
46178 // If this extract is from a loaded vector value and will be used as an
46179 // integer, that requires a potentially expensive XMM -> GPR transfer.
46180 // Additionally, if we can convert to a scalar integer load, that will likely
46181 // be folded into a subsequent integer op.
46182 // Note: Unlike the related fold for this in DAGCombiner, this is not limited
46183 // to a single-use of the loaded vector. For the reasons above, we
46184 // expect this to be profitable even if it creates an extra load.
46185 bool LikelyUsedAsVector = any_of(N->uses(), [](SDNode *Use) {
46186 return Use->getOpcode() == ISD::STORE ||
46187 Use->getOpcode() == ISD::INSERT_VECTOR_ELT ||
46188 Use->getOpcode() == ISD::SCALAR_TO_VECTOR;
46189 });
46190 auto *LoadVec = dyn_cast<LoadSDNode>(InputVector);
46191 if (LoadVec && CIdx && ISD::isNormalLoad(LoadVec) && VT.isInteger() &&
46192 SrcVT.getVectorElementType() == VT && DCI.isAfterLegalizeDAG() &&
46193 !LikelyUsedAsVector && LoadVec->isSimple()) {
46194 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46195 SDValue NewPtr =
46196 TLI.getVectorElementPointer(DAG, LoadVec->getBasePtr(), SrcVT, EltIdx);
46197 unsigned PtrOff = VT.getSizeInBits() * CIdx->getZExtValue() / 8;
46198 MachinePointerInfo MPI = LoadVec->getPointerInfo().getWithOffset(PtrOff);
46199 Align Alignment = commonAlignment(LoadVec->getAlign(), PtrOff);
46200 SDValue Load =
46201 DAG.getLoad(VT, dl, LoadVec->getChain(), NewPtr, MPI, Alignment,
46202 LoadVec->getMemOperand()->getFlags(), LoadVec->getAAInfo());
46203 DAG.makeEquivalentMemoryOrdering(LoadVec, Load);
46204 return Load;
46205 }
46206
46207 return SDValue();
46208}
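One of the folds above rewrites i1 extracts through MOVMSK; the scalar sketch below (illustrative only, extractBoolElt is an invented name) shows the resulting bit test ((movmsk X) & Mask) == Mask with Mask = 1 << MaskIdx, matching the comment inside the loop.

// Illustrative only: once the vXi1 mask is packed into integer bits (as
// MOVMSK/combineBitcastvxi1 produce), extracting element Idx is a bit test.
#include <cstdint>
#include <cstdio>

static bool extractBoolElt(uint32_t MovmskBits, unsigned Idx) {
  uint32_t Mask = 1u << Idx;                  // Mask = 1 << MaskIdx
  return (MovmskBits & Mask) == Mask;         // ((movmsk X) & Mask) == Mask
}

int main() {
  // Suppose MOVMSK of a v8i1 mask <1,0,1,1,0,0,1,0> produced these bits.
  uint32_t Bits = 0b01001101u;
  for (unsigned I = 0; I != 8; ++I)
    std::printf("elt %u = %d\n", I, extractBoolElt(Bits, I));
  return 0;
}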
46209
46210// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
46211// This is more or less the reverse of combineBitcastvxi1.
46212static SDValue combineToExtendBoolVectorInReg(
46213 unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N0, SelectionDAG &DAG,
46214 TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) {
46215 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
46216 Opcode != ISD::ANY_EXTEND)
46217 return SDValue();
46218 if (!DCI.isBeforeLegalizeOps())
46219 return SDValue();
46220 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
46221 return SDValue();
46222
46223 EVT SVT = VT.getScalarType();
46224 EVT InSVT = N0.getValueType().getScalarType();
46225 unsigned EltSizeInBits = SVT.getSizeInBits();
46226
46227 // Input type must be extending a bool vector (bit-casted from a scalar
46228 // integer) to legal integer types.
46229 if (!VT.isVector())
46230 return SDValue();
46231 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
46232 return SDValue();
46233 if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
46234 return SDValue();
46235
46236 SDValue N00 = N0.getOperand(0);
46237 EVT SclVT = N00.getValueType();
46238 if (!SclVT.isScalarInteger())
46239 return SDValue();
46240
46241 SDValue Vec;
46242 SmallVector<int> ShuffleMask;
46243 unsigned NumElts = VT.getVectorNumElements();
46244 assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");
46245
46246 // Broadcast the scalar integer to the vector elements.
46247 if (NumElts > EltSizeInBits) {
46248 // If the scalar integer is greater than the vector element size, then we
46249 // must split it down into sub-sections for broadcasting. For example:
46250 // i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
46251 // i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
46252 assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
46253 unsigned Scale = NumElts / EltSizeInBits;
46254 EVT BroadcastVT = EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
46255 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
46256 Vec = DAG.getBitcast(VT, Vec);
46257
46258 for (unsigned i = 0; i != Scale; ++i)
46259 ShuffleMask.append(EltSizeInBits, i);
46260 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
46261 } else if (Subtarget.hasAVX2() && NumElts < EltSizeInBits &&
46262 (SclVT == MVT::i8 || SclVT == MVT::i16 || SclVT == MVT::i32)) {
46263 // If we have register broadcast instructions, use the scalar size as the
46264 // element type for the shuffle. Then cast to the wider element type. The
46265 // widened bits won't be used, and this might allow the use of a broadcast
46266 // load.
46267 assert((EltSizeInBits % NumElts) == 0 && "Unexpected integer scale");
46268 unsigned Scale = EltSizeInBits / NumElts;
46269 EVT BroadcastVT =
46270 EVT::getVectorVT(*DAG.getContext(), SclVT, NumElts * Scale);
46271 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
46272 ShuffleMask.append(NumElts * Scale, 0);
46273 Vec = DAG.getVectorShuffle(BroadcastVT, DL, Vec, Vec, ShuffleMask);
46274 Vec = DAG.getBitcast(VT, Vec);
46275 } else {
46276 // For smaller scalar integers, we can simply any-extend it to the vector
46277 // element size (we don't care about the upper bits) and broadcast it to all
46278 // elements.
46279 SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT);
46280 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
46281 ShuffleMask.append(NumElts, 0);
46282 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
46283 }
46284
46285 // Now, mask the relevant bit in each element.
46286 SmallVector<SDValue, 32> Bits;
46287 for (unsigned i = 0; i != NumElts; ++i) {
46288 int BitIdx = (i % EltSizeInBits);
46289 APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
46290 Bits.push_back(DAG.getConstant(Bit, DL, SVT));
46291 }
46292 SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
46293 Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);
46294
46295 // Compare against the bitmask and extend the result.
46296 EVT CCVT = VT.changeVectorElementType(MVT::i1);
46297 Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
46298 Vec = DAG.getSExtOrTrunc(Vec, DL, VT);
46299
46300 // For SEXT we are now done; otherwise, shift the result down for
46301 // zero-extension.
46302 if (Opcode == ISD::SIGN_EXTEND)
46303 return Vec;
46304 return DAG.getNode(ISD::SRL, DL, VT, Vec,
46305 DAG.getConstant(EltSizeInBits - 1, DL, VT));
46306}
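As a rough mental model of the sequence built above (broadcast the scalar, AND each element with a single-bit mask, compare against the mask, then sign- or zero-extend), here is a scalar sketch. It is illustrative only, assumes i32 elements, and uses the invented helper name extendBoolBits.

// Illustrative sketch with 32-bit elements: element i becomes all-ones
// (sign extend) or one (zero extend) exactly when bit (i % 32) of the
// broadcast scalar is set, mirroring the AND/SETEQ/SEXT chain above.
#include <cstdint>
#include <cstdio>
#include <vector>

static std::vector<int32_t> extendBoolBits(uint32_t Scl, unsigned NumElts, bool Signed) {
  std::vector<int32_t> Out(NumElts);
  for (unsigned I = 0; I != NumElts; ++I) {
    uint32_t Bit = 1u << (I % 32);     // per-element bit of the mask vector
    bool Set = (Scl & Bit) == Bit;     // the SETEQ against the bit mask
    Out[I] = Signed ? (Set ? -1 : 0)   // SIGN_EXTEND result
                    : (Set ? 1 : 0);   // ZERO_EXTEND result (SRL by width-1)
  }
  return Out;
}

int main() {
  for (int32_t V : extendBoolBits(0b1010u, /*NumElts=*/4, /*Signed=*/true))
    std::printf("%d ", V);             // prints: 0 -1 0 -1
  std::printf("\n");
  return 0;
}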
46307
46308/// If a vector select has an operand that is -1 or 0, try to simplify the
46309/// select to a bitwise logic operation.
46310/// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()?
46311static SDValue
46312combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
46313 TargetLowering::DAGCombinerInfo &DCI,
46314 const X86Subtarget &Subtarget) {
46315 SDValue Cond = N->getOperand(0);
46316 SDValue LHS = N->getOperand(1);
46317 SDValue RHS = N->getOperand(2);
46318 EVT VT = LHS.getValueType();
46319 EVT CondVT = Cond.getValueType();
46320 SDLoc DL(N);
46321 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46322
46323 if (N->getOpcode() != ISD::VSELECT)
46324 return SDValue();
46325
46326 assert(CondVT.isVector() && "Vector select expects a vector selector!");
46327
46328 // TODO: Use isNullOrNullSplat() to distinguish constants with undefs?
46329 // TODO: Can we assert that both operands are not zeros (because that should
46330 // get simplified at node creation time)?
46331 bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
46332 bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
46333
46334 // If both inputs are 0/undef, create a complete zero vector.
46335 // FIXME: As noted above this should be handled by DAGCombiner/getNode.
46336 if (TValIsAllZeros && FValIsAllZeros) {
46337 if (VT.isFloatingPoint())
46338 return DAG.getConstantFP(0.0, DL, VT);
46339 return DAG.getConstant(0, DL, VT);
46340 }
46341
46342 // To use the condition operand as a bitwise mask, it must have elements that
46343 // are the same size as the select elements. I.e., the condition operand must
46344 // have already been promoted from the IR select condition type <N x i1>.
46345 // Don't check if the types themselves are equal because that excludes
46346 // vector floating-point selects.
46347 if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
46348 return SDValue();
46349
46350 // Try to invert the condition if true value is not all 1s and false value is
46351 // not all 0s. Only do this if the condition has one use.
46352 bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
46353 if (!TValIsAllOnes && !FValIsAllZeros && Cond.hasOneUse() &&
46354 // Check if the selector will be produced by CMPP*/PCMP*.
46355 Cond.getOpcode() == ISD::SETCC &&
46356 // Check if SETCC has already been promoted.
46357 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
46358 CondVT) {
46359 bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
46360
46361 if (TValIsAllZeros || FValIsAllOnes) {
46362 SDValue CC = Cond.getOperand(2);
46363 ISD::CondCode NewCC = ISD::getSetCCInverse(
46364 cast<CondCodeSDNode>(CC)->get(), Cond.getOperand(0).getValueType());
46365 Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
46366 NewCC);
46367 std::swap(LHS, RHS);
46368 TValIsAllOnes = FValIsAllOnes;
46369 FValIsAllZeros = TValIsAllZeros;
46370 }
46371 }
46372
46373 // Cond value must be 'sign splat' to be converted to a logical op.
46374 if (DAG.ComputeNumSignBits(Cond) != CondVT.getScalarSizeInBits())
46375 return SDValue();
46376
46377 // vselect Cond, 111..., 000... -> Cond
46378 if (TValIsAllOnes && FValIsAllZeros)
46379 return DAG.getBitcast(VT, Cond);
46380
46381 if (!TLI.isTypeLegal(CondVT))
46382 return SDValue();
46383
46384 // vselect Cond, 111..., X -> or Cond, X
46385 if (TValIsAllOnes) {
46386 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
46387 SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
46388 return DAG.getBitcast(VT, Or);
46389 }
46390
46391 // vselect Cond, X, 000... -> and Cond, X
46392 if (FValIsAllZeros) {
46393 SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
46394 SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
46395 return DAG.getBitcast(VT, And);
46396 }
46397
46398 // vselect Cond, 000..., X -> andn Cond, X
46399 if (TValIsAllZeros) {
46400 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
46401 SDValue AndN;
46402 // The canonical form differs for i1 vectors - x86andnp is not used.
46403 if (CondVT.getScalarType() == MVT::i1)
46404 AndN = DAG.getNode(ISD::AND, DL, CondVT, DAG.getNOT(DL, Cond, CondVT),
46405 CastRHS);
46406 else
46407 AndN = DAG.getNode(X86ISD::ANDNP, DL, CondVT, Cond, CastRHS);
46408 return DAG.getBitcast(VT, AndN);
46409 }
46410
46411 return SDValue();
46412}
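The bitwise rewrites above only hold because the condition is a per-element sign splat (each lane is either all-zeros or all-ones). A minimal scalar check of the three folds, for illustration only:

// A per-element mask M that is 0 or all-ones satisfies:
//   vselect M, -1, X  ->  M | X
//   vselect M,  X, 0  ->  M & X
//   vselect M,  0, X  -> ~M & X
#include <cassert>
#include <cstdint>

static uint32_t vselect(uint32_t M, uint32_t T, uint32_t F) {
  return (M & T) | (~M & F); // element-wise select with a 0/all-ones mask
}

int main() {
  for (uint32_t M : {0u, ~0u})
    for (uint32_t X : {0u, 1u, 0xDEADBEEFu, ~0u}) {
      assert(vselect(M, ~0u, X) == (M | X));
      assert(vselect(M, X, 0u) == (M & X));
      assert(vselect(M, 0u, X) == (~M & X));
    }
  return 0;
}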
46413
46414/// If both arms of a vector select are concatenated vectors, split the select,
46415/// and concatenate the result to eliminate a wide (256-bit) vector instruction:
46416/// vselect Cond, (concat T0, T1), (concat F0, F1) -->
46417/// concat (vselect (split Cond), T0, F0), (vselect (split Cond), T1, F1)
46418static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG,
46419 const X86Subtarget &Subtarget) {
46420 unsigned Opcode = N->getOpcode();
46421 if (Opcode != X86ISD::BLENDV && Opcode != ISD::VSELECT)
46422 return SDValue();
46423
46424 // TODO: Split 512-bit vectors too?
46425 EVT VT = N->getValueType(0);
46426 if (!VT.is256BitVector())
46427 return SDValue();
46428
46429 // TODO: Split as long as any 2 of the 3 operands are concatenated?
46430 SDValue Cond = N->getOperand(0);
46431 SDValue TVal = N->getOperand(1);
46432 SDValue FVal = N->getOperand(2);
46433 SmallVector<SDValue, 4> CatOpsT, CatOpsF;
46434 if (!TVal.hasOneUse() || !FVal.hasOneUse() ||
46435 !collectConcatOps(TVal.getNode(), CatOpsT, DAG) ||
46436 !collectConcatOps(FVal.getNode(), CatOpsF, DAG))
46437 return SDValue();
46438
46439 auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL,
46440 ArrayRef<SDValue> Ops) {
46441 return DAG.getNode(Opcode, DL, Ops[1].getValueType(), Ops);
46442 };
46443 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { Cond, TVal, FVal },
46444 makeBlend, /*CheckBWI*/ false);
46445}
46446
46447static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
46448 SDValue Cond = N->getOperand(0);
46449 SDValue LHS = N->getOperand(1);
46450 SDValue RHS = N->getOperand(2);
46451 SDLoc DL(N);
46452
46453 auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
46454 auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
46455 if (!TrueC || !FalseC)
46456 return SDValue();
46457
46458 // Don't do this for crazy integer types.
46459 EVT VT = N->getValueType(0);
46460 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
46461 return SDValue();
46462
46463 // We're going to use the condition bit in math or logic ops. We could allow
46464 // this with a wider condition value (post-legalization it becomes an i8),
46465 // but if nothing is creating selects that late, it doesn't matter.
46466 if (Cond.getValueType() != MVT::i1)
46467 return SDValue();
46468
46469 // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
46470 // 3, 5, or 9 with i32/i64, so those get transformed too.
46471 // TODO: For constants that overflow or do not differ by power-of-2 or small
46472 // multiplier, convert to 'and' + 'add'.
46473 const APInt &TrueVal = TrueC->getAPIntValue();
46474 const APInt &FalseVal = FalseC->getAPIntValue();
46475
46476 // We have a more efficient lowering for "(X == 0) ? Y : -1" using SBB.
46477 if ((TrueVal.isAllOnes() || FalseVal.isAllOnes()) &&
46478 Cond.getOpcode() == ISD::SETCC && isNullConstant(Cond.getOperand(1))) {
46479 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
46480 if (CC == ISD::SETEQ || CC == ISD::SETNE)
46481 return SDValue();
46482 }
46483
46484 bool OV;
46485 APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
46486 if (OV)
46487 return SDValue();
46488
46489 APInt AbsDiff = Diff.abs();
46490 if (AbsDiff.isPowerOf2() ||
46491 ((VT == MVT::i32 || VT == MVT::i64) &&
46492 (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {
46493
46494 // We need a positive multiplier constant for shift/LEA codegen. The 'not'
46495 // of the condition can usually be folded into a compare predicate, but even
46496 // without that, the sequence should be cheaper than a CMOV alternative.
46497 if (TrueVal.slt(FalseVal)) {
46498 Cond = DAG.getNOT(DL, Cond, MVT::i1);
46499 std::swap(TrueC, FalseC);
46500 }
46501
46502 // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
46503 SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
46504
46505 // Multiply condition by the difference if non-one.
46506 if (!AbsDiff.isOne())
46507 R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));
46508
46509 // Add the base if non-zero.
46510 if (!FalseC->isZero())
46511 R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));
46512
46513 return R;
46514 }
46515
46516 return SDValue();
46517}
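A small scalar model of the constant-select fold above, for illustration (the helper name is invented): select(Cond, TC, FC) becomes zext(Cond) * (TC - FC) + FC, with the condition inverted and the constants swapped first when TC < FC so the multiplier stays positive (shift- or LEA-friendly).

// Illustrative scalar model of: select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
#include <cassert>
#include <cstdint>
#include <utility>

static int64_t selectOfTwoConstants(bool Cond, int64_t TC, int64_t FC) {
  if (TC < FC) {          // keep a positive multiplier: invert and swap
    Cond = !Cond;
    std::swap(TC, FC);
  }
  int64_t R = static_cast<int64_t>(Cond); // zext i1 -> iN
  int64_t Diff = TC - FC;
  if (Diff != 1)
    R *= Diff;            // power-of-2 becomes a shift, 3/5/9 become LEA
  if (FC != 0)
    R += FC;
  return R;
}

int main() {
  for (bool C : {false, true}) {
    assert(selectOfTwoConstants(C, 8, 0) == (C ? 8 : 0));
    assert(selectOfTwoConstants(C, 2, 10) == (C ? 2 : 10));
    assert(selectOfTwoConstants(C, -3, 5) == (C ? -3 : 5));
  }
  return 0;
}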
46518
46519/// If this is a *dynamic* select (non-constant condition) and we can match
46520/// this node with one of the variable blend instructions, restructure the
46521/// condition so that blends can use the high (sign) bit of each element.
46522/// This function will also call SimplifyDemandedBits on already created
46523/// BLENDV to perform additional simplifications.
46524static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
46525 TargetLowering::DAGCombinerInfo &DCI,
46526 const X86Subtarget &Subtarget) {
46527 SDValue Cond = N->getOperand(0);
46528 if ((N->getOpcode() != ISD::VSELECT &&
46529 N->getOpcode() != X86ISD::BLENDV) ||
46530 ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
46531 return SDValue();
46532
46533 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46534 unsigned BitWidth = Cond.getScalarValueSizeInBits();
46535 EVT VT = N->getValueType(0);
46536
46537 // We can only handle the cases where VSELECT is directly legal on the
46538 // subtarget. We custom lower VSELECT nodes with constant conditions and
46539 // this makes it hard to see whether a dynamic VSELECT will correctly
46540 // lower, so we both check the operation's status and explicitly handle the
46541 // cases where a *dynamic* blend will fail even though a constant-condition
46542 // blend could be custom lowered.
46543 // FIXME: We should find a better way to handle this class of problems.
46544 // Potentially, we should combine constant-condition vselect nodes
46545 // pre-legalization into shuffles and not mark as many types as custom
46546 // lowered.
46547 if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
46548 return SDValue();
46549 // FIXME: We don't support i16-element blends currently. We could and
46550 // should support them by making *all* the bits in the condition be set
46551 // rather than just the high bit and using an i8-element blend.
46552 if (VT.getVectorElementType() == MVT::i16)
46553 return SDValue();
46554 // Dynamic blending was only available from SSE4.1 onward.
46555 if (VT.is128BitVector() && !Subtarget.hasSSE41())
46556 return SDValue();
46557 // Byte blends are only available in AVX2.
46558 if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
46559 return SDValue();
46560 // There are no 512-bit blend instructions that use sign bits.
46561 if (VT.is512BitVector())
46562 return SDValue();
46563
46564 // Don't optimize before the condition has been transformed to a legal type
46565 // and don't ever optimize vector selects that map to AVX512 mask-registers.
46566 if (BitWidth < 8 || BitWidth > 64)
46567 return SDValue();
46568
46569 auto OnlyUsedAsSelectCond = [](SDValue Cond) {
46570 for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
46571 UI != UE; ++UI)
46572 if ((UI->getOpcode() != ISD::VSELECT &&
46573 UI->getOpcode() != X86ISD::BLENDV) ||
46574 UI.getOperandNo() != 0)
46575 return false;
46576
46577 return true;
46578 };
46579
46580 APInt DemandedBits(APInt::getSignMask(BitWidth));
46581
46582 if (OnlyUsedAsSelectCond(Cond)) {
46583 KnownBits Known;
46584 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
46585 !DCI.isBeforeLegalizeOps());
46586 if (!TLI.SimplifyDemandedBits(Cond, DemandedBits, Known, TLO, 0, true))
46587 return SDValue();
46588
46589 // If we changed the computation somewhere in the DAG, this change will
46590 // affect all users of Cond. Update all the nodes so that we do not use
46591 // the generic VSELECT anymore. Otherwise, we may perform wrong
46592 // optimizations as we messed with the actual expectation for the vector
46593 // boolean values.
46594 for (SDNode *U : Cond->uses()) {
46595 if (U->getOpcode() == X86ISD::BLENDV)
46596 continue;
46597
46598 SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0),
46599 Cond, U->getOperand(1), U->getOperand(2));
46600 DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
46601 DCI.AddToWorklist(U);
46602 }
46603 DCI.CommitTargetLoweringOpt(TLO);
46604 return SDValue(N, 0);
46605 }
46606
46607 // Otherwise we can still at least try to simplify multiple use bits.
46608 if (SDValue V = TLI.SimplifyMultipleUseDemandedBits(Cond, DemandedBits, DAG))
46609 return DAG.getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0), V,
46610 N->getOperand(1), N->getOperand(2));
46611
46612 return SDValue();
46613}
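The reason the code above asks SimplifyDemandedBits for only the sign mask of the condition is that BLENDV-style instructions consume just the high bit of each condition element. A scalar illustration (blendvElement is an invented name; the operand order follows the X86ISD::BLENDV node built above, where a set sign bit selects the first value):

// Illustrative per-element model: only the sign bit of Cond is observed.
#include <cassert>
#include <cstdint>

static int32_t blendvElement(int32_t Cond, int32_t TrueVal, int32_t FalseVal) {
  return (Cond < 0) ? TrueVal : FalseVal; // sign bit set -> true arm
}

int main() {
  assert(blendvElement(-1, 1, 2) == 1);         // all-ones mask
  assert(blendvElement(INT32_MIN, 1, 2) == 1);  // only the sign bit matters
  assert(blendvElement(0x7FFFFFFF, 1, 2) == 2); // sign bit clear -> false arm
  return 0;
}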
46614
46615// Try to match:
46616// (or (and (M, (sub 0, X)), (pandn M, X)))
46617// which is a special case of:
46618// (select M, (sub 0, X), X)
46619// Per:
46620// http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
46621// We know that, if fNegate is 0 or 1:
46622// (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
46623//
46624// Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
46625// ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
46626// ( M ? -X : X) == ((X ^ M ) + (M & 1))
46627// This lets us transform our vselect to:
46628// (add (xor X, M), (and M, 1))
46629// And further to:
46630// (sub (xor X, M), M)
46631static SDValue combineLogicBlendIntoConditionalNegate(
46632 EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL,
46633 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
46634 EVT MaskVT = Mask.getValueType();
46635 assert(MaskVT.isInteger() &&
46636 DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() &&
46637 "Mask must be zero/all-bits");
46638
46639 if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT)
46640 return SDValue();
46641 if (!DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT))
46642 return SDValue();
46643
46644 auto IsNegV = [](SDNode *N, SDValue V) {
46645 return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
46646 ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
46647 };
46648
46649 SDValue V;
46650 if (IsNegV(Y.getNode(), X))
46651 V = X;
46652 else if (IsNegV(X.getNode(), Y))
46653 V = Y;
46654 else
46655 return SDValue();
46656
46657 SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
46658 SDValue SubOp2 = Mask;
46659
46660 // If the negate was on the false side of the select, then
46661 // the operands of the SUB need to be swapped. PR 27251.
46662 // This is because the pattern being matched above is
46663 // (vselect M, (sub 0, X), X) -> (sub (xor X, M), M)
46664 // but if the pattern matched was
46665 // (vselect M, X, (sub 0, X)), that is really the negation of the pattern
46666 // above, -(vselect M, (sub 0, X), X), and therefore the replacement
46667 // pattern also needs to be a negation of the replacement pattern above.
46668 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
46669 // sub accomplishes the negation of the replacement pattern.
46670 if (V == Y)
46671 std::swap(SubOp1, SubOp2);
46672
46673 SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
46674 return DAG.getBitcast(VT, Res);
46675}
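The identity behind this transform is easy to spot-check: for a mask M that is 0 or all-ones, (M ? -X : X) == (X ^ M) - M, which is what the XOR+SUB built above computes (with the SUB operands swapped for the negated pattern). A brief scalar verification, for illustration only:

// Quick check of the conditional-negate identity for M in {0, all-ones}.
#include <cassert>
#include <cstdint>

int main() {
  for (int32_t X : {0, 1, -7, 12345, INT32_MIN + 1})
    for (int32_t M : {0, -1}) {
      int32_t Expected = M ? -X : X;
      int32_t Folded = (X ^ M) - M; // sub (xor X, M), M
      assert(Folded == Expected);
    }
  return 0;
}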
46676
46677/// Do target-specific dag combines on SELECT and VSELECT nodes.
46678static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
46679 TargetLowering::DAGCombinerInfo &DCI,
46680 const X86Subtarget &Subtarget) {
46681 SDLoc DL(N);
46682 SDValue Cond = N->getOperand(0);
46683 SDValue LHS = N->getOperand(1);
46684 SDValue RHS = N->getOperand(2);
46685
46686 // Try simplification again because we use this function to optimize
46687 // BLENDV nodes that are not handled by the generic combiner.
46688 if (SDValue V = DAG.simplifySelect(Cond, LHS, RHS))
46689 return V;
46690
46691 EVT VT = LHS.getValueType();
46692 EVT CondVT = Cond.getValueType();
46693 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46694 bool CondConstantVector = ISD::isBuildVectorOfConstantSDNodes(Cond.getNode());
46695
46696 // Attempt to combine (select M, (sub 0, X), X) -> (sub (xor X, M), M).
46697 // Limit this to cases of non-constant masks that createShuffleMaskFromVSELECT
46698 // can't catch, plus vXi8 cases where we'd likely end up with BLENDV.
46699 if (CondVT.isVector() && CondVT.isInteger() &&
46700 CondVT.getScalarSizeInBits() == VT.getScalarSizeInBits() &&
46701 (!CondConstantVector || CondVT.getScalarType() == MVT::i8) &&
46702 DAG.ComputeNumSignBits(Cond) == CondVT.getScalarSizeInBits())
46703 if (SDValue V = combineLogicBlendIntoConditionalNegate(VT, Cond, RHS, LHS,
46704 DL, DAG, Subtarget))
46705 return V;
46706
46707 // Convert vselects with constant condition into shuffles.
46708 if (CondConstantVector && DCI.isBeforeLegalizeOps() &&
46709 (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::BLENDV)) {
46710 SmallVector<int, 64> Mask;
46711 if (createShuffleMaskFromVSELECT(Mask, Cond,
46712 N->getOpcode() == X86ISD::BLENDV))
46713 return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask);
46714 }
46715
46716 // fold vselect(cond, pshufb(x), pshufb(y)) -> or (pshufb(x), pshufb(y))
46717 // by forcing the unselected elements to zero.
46718 // TODO: Can we handle more shuffles with this?
46719 if (N->getOpcode() == ISD::VSELECT && CondVT.isVector() &&
46720 LHS.getOpcode() == X86ISD::PSHUFB && RHS.getOpcode() == X86ISD::PSHUFB &&
46721 LHS.hasOneUse() && RHS.hasOneUse()) {
46722 MVT SimpleVT = VT.getSimpleVT();
46723 SmallVector<SDValue, 1> LHSOps, RHSOps;
46724 SmallVector<int, 64> LHSMask, RHSMask, CondMask;
46725 if (createShuffleMaskFromVSELECT(CondMask, Cond) &&
46726 getTargetShuffleMask(LHS.getNode(), SimpleVT, true, LHSOps, LHSMask) &&
46727 getTargetShuffleMask(RHS.getNode(), SimpleVT, true, RHSOps, RHSMask)) {
46728 int NumElts = VT.getVectorNumElements();
46729 for (int i = 0; i != NumElts; ++i) {
46730 // getConstVector sets negative shuffle mask values as undef, so ensure
46731 // we hardcode SM_SentinelZero values to zero (0x80).
46732 if (CondMask[i] < NumElts) {
46733 LHSMask[i] = isUndefOrZero(LHSMask[i]) ? 0x80 : LHSMask[i];
46734 RHSMask[i] = 0x80;
46735 } else {
46736 LHSMask[i] = 0x80;
46737 RHSMask[i] = isUndefOrZero(RHSMask[i]) ? 0x80 : RHSMask[i];
46738 }
46739 }
46740 LHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, LHS.getOperand(0),
46741 getConstVector(LHSMask, SimpleVT, DAG, DL, true));
46742 RHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, RHS.getOperand(0),
46743 getConstVector(RHSMask, SimpleVT, DAG, DL, true));
46744 return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
46745 }
46746 }
46747
46748 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
46749 // instructions match the semantics of the common C idiom x<y?x:y but not
46750 // x<=y?x:y, because of how they handle negative zero (which can be
46751 // ignored in unsafe-math mode).
46752 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
46753 if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
46754 VT != MVT::f80 && VT != MVT::f128 && !isSoftFP16(VT, Subtarget) &&
46755 (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
46756 (Subtarget.hasSSE2() ||
46757 (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
46758 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
46759
46760 unsigned Opcode = 0;
46761 // Check for x CC y ? x : y.
46762 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
46763 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
46764 switch (CC) {
46765 default: break;
46766 case ISD::SETULT:
46767 // Converting this to a min would handle NaNs incorrectly, and swapping
46768 // the operands would cause it to handle comparisons between positive
46769 // and negative zero incorrectly.
46770 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
46771 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
46772 !(DAG.isKnownNeverZeroFloat(LHS) ||
46773 DAG.isKnownNeverZeroFloat(RHS)))
46774 break;
46775 std::swap(LHS, RHS);
46776 }
46777 Opcode = X86ISD::FMIN;
46778 break;
46779 case ISD::SETOLE:
46780 // Converting this to a min would handle comparisons between positive
46781 // and negative zero incorrectly.
46782 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
46783 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
46784 break;
46785 Opcode = X86ISD::FMIN;
46786 break;
46787 case ISD::SETULE:
46788 // Converting this to a min would handle both negative zeros and NaNs
46789 // incorrectly, but we can swap the operands to fix both.
46790 std::swap(LHS, RHS);
46791 [[fallthrough]];
46792 case ISD::SETOLT:
46793 case ISD::SETLT:
46794 case ISD::SETLE:
46795 Opcode = X86ISD::FMIN;
46796 break;
46797
46798 case ISD::SETOGE:
46799 // Converting this to a max would handle comparisons between positive
46800 // and negative zero incorrectly.
46801 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
46802 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
46803 break;
46804 Opcode = X86ISD::FMAX;
46805 break;
46806 case ISD::SETUGT:
46807 // Converting this to a max would handle NaNs incorrectly, and swapping
46808 // the operands would cause it to handle comparisons between positive
46809 // and negative zero incorrectly.
46810 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
46811 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
46812 !(DAG.isKnownNeverZeroFloat(LHS) ||
46813 DAG.isKnownNeverZeroFloat(RHS)))
46814 break;
46815 std::swap(LHS, RHS);
46816 }
46817 Opcode = X86ISD::FMAX;
46818 break;
46819 case ISD::SETUGE:
46820 // Converting this to a max would handle both negative zeros and NaNs
46821 // incorrectly, but we can swap the operands to fix both.
46822 std::swap(LHS, RHS);
46823 [[fallthrough]];
46824 case ISD::SETOGT:
46825 case ISD::SETGT:
46826 case ISD::SETGE:
46827 Opcode = X86ISD::FMAX;
46828 break;
46829 }
46830 // Check for x CC y ? y : x -- a min/max with reversed arms.
46831 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
46832 DAG.isEqualTo(RHS, Cond.getOperand(0))) {
46833 switch (CC) {
46834 default: break;
46835 case ISD::SETOGE:
46836 // Converting this to a min would handle comparisons between positive
46837 // and negative zero incorrectly, and swapping the operands would
46838 // cause it to handle NaNs incorrectly.
46839 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
46840 !(DAG.isKnownNeverZeroFloat(LHS) ||
46841 DAG.isKnownNeverZeroFloat(RHS))) {
46842 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
46843 break;
46844 std::swap(LHS, RHS);
46845 }
46846 Opcode = X86ISD::FMIN;
46847 break;
46848 case ISD::SETUGT:
46849 // Converting this to a min would handle NaNs incorrectly.
46850 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
46851 break;
46852 Opcode = X86ISD::FMIN;
46853 break;
46854 case ISD::SETUGE:
46855 // Converting this to a min would handle both negative zeros and NaNs
46856 // incorrectly, but we can swap the operands to fix both.
46857 std::swap(LHS, RHS);
46858 [[fallthrough]];
46859 case ISD::SETOGT:
46860 case ISD::SETGT:
46861 case ISD::SETGE:
46862 Opcode = X86ISD::FMIN;
46863 break;
46864
46865 case ISD::SETULT:
46866 // Converting this to a max would handle NaNs incorrectly.
46867 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
46868 break;
46869 Opcode = X86ISD::FMAX;
46870 break;
46871 case ISD::SETOLE:
46872 // Converting this to a max would handle comparisons between positive
46873 // and negative zero incorrectly, and swapping the operands would
46874 // cause it to handle NaNs incorrectly.
46875 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
46876 !DAG.isKnownNeverZeroFloat(LHS) &&
46877 !DAG.isKnownNeverZeroFloat(RHS)) {
46878 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
46879 break;
46880 std::swap(LHS, RHS);
46881 }
46882 Opcode = X86ISD::FMAX;
46883 break;
46884 case ISD::SETULE:
46885 // Converting this to a max would handle both negative zeros and NaNs
46886 // incorrectly, but we can swap the operands to fix both.
46887 std::swap(LHS, RHS);
46888 [[fallthrough]];
46889 case ISD::SETOLT:
46890 case ISD::SETLT:
46891 case ISD::SETLE:
46892 Opcode = X86ISD::FMAX;
46893 break;
46894 }
46895 }
46896
46897 if (Opcode)
46898 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
46899 }
46900
46901 // Some mask scalar intrinsics rely on checking if only one bit is set
46902 // and implement it in C code like this:
46903 // A[0] = (U & 1) ? A[0] : W[0];
46904 // This creates some redundant instructions that break pattern matching.
46905 // fold (select (setcc (and (X, 1), 0, seteq), Y, Z)) -> select(and(X, 1),Z,Y)
46906 if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&
46907 Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) {
46908 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
46909 SDValue AndNode = Cond.getOperand(0);
46910 if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ &&
46911 isNullConstant(Cond.getOperand(1)) &&
46912 isOneConstant(AndNode.getOperand(1))) {
46913 // LHS and RHS swapped due to
46914 // setcc outputting 1 when AND resulted in 0 and vice versa.
46915 AndNode = DAG.getZExtOrTrunc(AndNode, DL, MVT::i8);
46916 return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS);
46917 }
46918 }
46919
46920 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
46921 // lowering on KNL. In this case we convert it to
46922 // v16i8 (select v16i8, v16i8, v16i8) and use an AVX instruction.
46923 // The same situation applies to all vectors of i8 and i16 without BWI.
46924 // Make sure we extend these even before type legalization gets a chance to
46925 // split wide vectors.
46926 // Since SKX these selects have a proper lowering.
46927 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&
46928 CondVT.getVectorElementType() == MVT::i1 &&
46929 (VT.getVectorElementType() == MVT::i8 ||
46930 VT.getVectorElementType() == MVT::i16)) {
46931 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
46932 return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
46933 }
46934
46935 // AVX512 - Extend select with zero to merge with target shuffle.
46936 // select(mask, extract_subvector(shuffle(x)), zero) -->
46937 // extract_subvector(select(insert_subvector(mask), shuffle(x), zero))
46938 // TODO - support non target shuffles as well.
46939 if (Subtarget.hasAVX512() && CondVT.isVector() &&
46940 CondVT.getVectorElementType() == MVT::i1) {
46941 auto SelectableOp = [&TLI](SDValue Op) {
46942 return Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
46943 isTargetShuffle(Op.getOperand(0).getOpcode()) &&
46944 isNullConstant(Op.getOperand(1)) &&
46945 TLI.isTypeLegal(Op.getOperand(0).getValueType()) &&
46946 Op.hasOneUse() && Op.getOperand(0).hasOneUse();
46947 };
46948
46949 bool SelectableLHS = SelectableOp(LHS);
46950 bool SelectableRHS = SelectableOp(RHS);
46951 bool ZeroLHS = ISD::isBuildVectorAllZeros(LHS.getNode());
46952 bool ZeroRHS = ISD::isBuildVectorAllZeros(RHS.getNode());
46953
46954 if ((SelectableLHS && ZeroRHS) || (SelectableRHS && ZeroLHS)) {
46955 EVT SrcVT = SelectableLHS ? LHS.getOperand(0).getValueType()
46956 : RHS.getOperand(0).getValueType();
46957 EVT SrcCondVT = SrcVT.changeVectorElementType(MVT::i1);
46958 LHS = insertSubVector(DAG.getUNDEF(SrcVT), LHS, 0, DAG, DL,
46959 VT.getSizeInBits());
46960 RHS = insertSubVector(DAG.getUNDEF(SrcVT), RHS, 0, DAG, DL,
46961 VT.getSizeInBits());
46962 Cond = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SrcCondVT,
46963 DAG.getUNDEF(SrcCondVT), Cond,
46964 DAG.getIntPtrConstant(0, DL));
46965 SDValue Res = DAG.getSelect(DL, SrcVT, Cond, LHS, RHS);
46966 return extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
46967 }
46968 }
46969
46970 if (SDValue V = combineSelectOfTwoConstants(N, DAG))
46971 return V;
46972
46973 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
46974 Cond.hasOneUse()) {
46975 EVT CondVT = Cond.getValueType();
46976 SDValue Cond0 = Cond.getOperand(0);
46977 SDValue Cond1 = Cond.getOperand(1);
46978 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
46979
46980 // Canonicalize min/max:
46981 // (x > 0) ? x : 0 -> (x >= 0) ? x : 0
46982 // (x < -1) ? x : -1 -> (x <= -1) ? x : -1
46983 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
46984 // the need for an extra compare against zero. e.g.
46985 // (a - b) > 0 ? (a - b) : 0 -> (a - b) >= 0 ? (a - b) : 0
46986 // subl %esi, %edi
46987 // testl %edi, %edi
46988 // movl $0, %eax
46989 // cmovgl %edi, %eax
46990 // =>
46991 // xorl %eax, %eax
46992 // subl %esi, %edi
46993 // cmovsl %eax, %edi
46994 //
46995 // We can also canonicalize
46996 // (x s> 1) ? x : 1 -> (x s>= 1) ? x : 1 -> (x s> 0) ? x : 1
46997 // (x u> 1) ? x : 1 -> (x u>= 1) ? x : 1 -> (x != 0) ? x : 1
46998 // This allows the use of a test instruction for the compare.
46999 if (LHS == Cond0 && RHS == Cond1) {
47000 if ((CC == ISD::SETGT && (isNullConstant(RHS) || isOneConstant(RHS))) ||
47001 (CC == ISD::SETLT && isAllOnesConstant(RHS))) {
47002 ISD::CondCode NewCC = CC == ISD::SETGT ? ISD::SETGE : ISD::SETLE;
47003 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
47004 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
47005 }
47006 if (CC == ISD::SETUGT && isOneConstant(RHS)) {
47007 ISD::CondCode NewCC = ISD::SETUGE;
47008 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
47009 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
47010 }
47011 }
47012
47013 // Similar to DAGCombine's select(or(CC0,CC1),X,Y) fold but for legal types.
47014 // fold eq + gt/lt nested selects into ge/le selects
47015 // select (cmpeq Cond0, Cond1), LHS, (select (cmpugt Cond0, Cond1), LHS, Y)
47016 // --> (select (cmpuge Cond0, Cond1), LHS, Y)
47017 // select (cmpslt Cond0, Cond1), LHS, (select (cmpeq Cond0, Cond1), LHS, Y)
47018 // --> (select (cmpsle Cond0, Cond1), LHS, Y)
47019 // .. etc ..
47020 if (RHS.getOpcode() == ISD::SELECT && RHS.getOperand(1) == LHS &&
47021 RHS.getOperand(0).getOpcode() == ISD::SETCC) {
47022 SDValue InnerSetCC = RHS.getOperand(0);
47023 ISD::CondCode InnerCC =
47024 cast<CondCodeSDNode>(InnerSetCC.getOperand(2))->get();
47025 if ((CC == ISD::SETEQ || InnerCC == ISD::SETEQ) &&
47026 Cond0 == InnerSetCC.getOperand(0) &&
47027 Cond1 == InnerSetCC.getOperand(1)) {
47028 ISD::CondCode NewCC;
47029 switch (CC == ISD::SETEQ ? InnerCC : CC) {
47030 case ISD::SETGT: NewCC = ISD::SETGE; break;
47031 case ISD::SETLT: NewCC = ISD::SETLE; break;
47032 case ISD::SETUGT: NewCC = ISD::SETUGE; break;
47033 case ISD::SETULT: NewCC = ISD::SETULE; break;
47034 default: NewCC = ISD::SETCC_INVALID; break;
47035 }
47036 if (NewCC != ISD::SETCC_INVALID) {
47037 Cond = DAG.getSetCC(DL, CondVT, Cond0, Cond1, NewCC);
47038 return DAG.getSelect(DL, VT, Cond, LHS, RHS.getOperand(2));
47039 }
47040 }
47041 }
47042 }
47043
47044 // Check if the first operand is all zeros and Cond type is vXi1.
47045 // If this is an AVX512 target, we can improve the use of zero masking by
47046 // swapping the operands and inverting the condition.
47047 if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() &&
47048 Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 &&
47049 ISD::isBuildVectorAllZeros(LHS.getNode()) &&
47050 !ISD::isBuildVectorAllZeros(RHS.getNode())) {
47051 // Invert the cond to not(cond) : xor(op,allones)=not(op)
47052 SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
47053 // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
47054 return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
47055 }
47056
47057 // Attempt to convert a (vXi1 bitcast(iX Cond)) selection mask before it might
47058 // get split by legalization.
47059 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::BITCAST &&
47060 CondVT.getVectorElementType() == MVT::i1 &&
47061 TLI.isTypeLegal(VT.getScalarType())) {
47062 EVT ExtCondVT = VT.changeVectorElementTypeToInteger();
47063 if (SDValue ExtCond = combineToExtendBoolVectorInReg(
47064 ISD::SIGN_EXTEND, DL, ExtCondVT, Cond, DAG, DCI, Subtarget)) {
47065 ExtCond = DAG.getNode(ISD::TRUNCATE, DL, CondVT, ExtCond);
47066 return DAG.getSelect(DL, VT, ExtCond, LHS, RHS);
47067 }
47068 }
47069
47070 // Early exit check
47071 if (!TLI.isTypeLegal(VT) || isSoftFP16(VT, Subtarget))
47072 return SDValue();
47073
47074 if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
47075 return V;
47076
47077 if (SDValue V = combineVSelectToBLENDV(N, DAG, DCI, Subtarget))
47078 return V;
47079
47080 if (SDValue V = narrowVectorSelect(N, DAG, Subtarget))
47081 return V;
47082
47083 // select(~Cond, X, Y) -> select(Cond, Y, X)
47084 if (CondVT.getScalarType() != MVT::i1) {
47085 if (SDValue CondNot = IsNOT(Cond, DAG))
47086 return DAG.getNode(N->getOpcode(), DL, VT,
47087 DAG.getBitcast(CondVT, CondNot), RHS, LHS);
47088
47089 // pcmpgt(X, -1) -> pcmpgt(0, X) to help select/blendv just use the
47090 // signbit.
47091 if (Cond.getOpcode() == X86ISD::PCMPGT &&
47092 ISD::isBuildVectorAllOnes(Cond.getOperand(1).getNode()) &&
47093 Cond.hasOneUse()) {
47094 Cond = DAG.getNode(X86ISD::PCMPGT, DL, CondVT,
47095 DAG.getConstant(0, DL, CondVT), Cond.getOperand(0));
47096 return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
47097 }
47098 }
47099
47100 // Try to optimize vXi1 selects if both operands are either all constants or
47101 // bitcasts from scalar integer type. In that case we can convert the operands
47102 // to integer and use an integer select which will be converted to a CMOV.
47103 // We need to take a little bit of care to avoid creating an i64 type after
47104 // type legalization.
47105 if (N->getOpcode() == ISD::SELECT && VT.isVector() &&
47106 VT.getVectorElementType() == MVT::i1 &&
47107 (DCI.isBeforeLegalize() || (VT != MVT::v64i1 || Subtarget.is64Bit()))) {
47108 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
47109 if (DCI.isBeforeLegalize() || TLI.isTypeLegal(IntVT)) {
47110 bool LHSIsConst = ISD::isBuildVectorOfConstantSDNodes(LHS.getNode());
47111 bool RHSIsConst = ISD::isBuildVectorOfConstantSDNodes(RHS.getNode());
47112
47113 if ((LHSIsConst || (LHS.getOpcode() == ISD::BITCAST &&
47114 LHS.getOperand(0).getValueType() == IntVT)) &&
47115 (RHSIsConst || (RHS.getOpcode() == ISD::BITCAST &&
47116 RHS.getOperand(0).getValueType() == IntVT))) {
47117 if (LHSIsConst)
47118 LHS = combinevXi1ConstantToInteger(LHS, DAG);
47119 else
47120 LHS = LHS.getOperand(0);
47121
47122 if (RHSIsConst)
47123 RHS = combinevXi1ConstantToInteger(RHS, DAG);
47124 else
47125 RHS = RHS.getOperand(0);
47126
47127 SDValue Select = DAG.getSelect(DL, IntVT, Cond, LHS, RHS);
47128 return DAG.getBitcast(VT, Select);
47129 }
47130 }
47131 }
47132
47133 // If this is "((X & C) == 0) ? Y : Z" and C is a constant mask vector of
47134 // single bits, then invert the predicate and swap the select operands.
47135 // This can lower using a vector shift bit-hack rather than mask and compare.
47136 if (DCI.isBeforeLegalize() && !Subtarget.hasAVX512() &&
47137 N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
47138 Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1 &&
47139 Cond.getOperand(0).getOpcode() == ISD::AND &&
47140 isNullOrNullSplat(Cond.getOperand(1)) &&
47141 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
47142 Cond.getOperand(0).getValueType() == VT) {
47143 // The 'and' mask must be composed of power-of-2 constants.
47144 SDValue And = Cond.getOperand(0);
47145 auto *C = isConstOrConstSplat(And.getOperand(1));
47146 if (C && C->getAPIntValue().isPowerOf2()) {
47147 // vselect (X & C == 0), LHS, RHS --> vselect (X & C != 0), RHS, LHS
47148 SDValue NotCond =
47149 DAG.getSetCC(DL, CondVT, And, Cond.getOperand(1), ISD::SETNE);
47150 return DAG.getSelect(DL, VT, NotCond, RHS, LHS);
47151 }
47152
47153 // If we have a non-splat but still powers-of-2 mask, AVX1 can use pmulld
47154 // and AVX2 can use vpsllv{dq}. 8-bit lacks a proper shift or multiply.
47155 // 16-bit lacks a proper blendv.
47156 unsigned EltBitWidth = VT.getScalarSizeInBits();
47157 bool CanShiftBlend =
47158 TLI.isTypeLegal(VT) && ((Subtarget.hasAVX() && EltBitWidth == 32) ||
47159 (Subtarget.hasAVX2() && EltBitWidth == 64) ||
47160 (Subtarget.hasXOP()));
47161 if (CanShiftBlend &&
47162 ISD::matchUnaryPredicate(And.getOperand(1), [](ConstantSDNode *C) {
47163 return C->getAPIntValue().isPowerOf2();
47164 })) {
47165 // Create a left-shift constant to get the mask bits over to the sign-bit.
47166 SDValue Mask = And.getOperand(1);
47167 SmallVector<int, 32> ShlVals;
47168 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
47169 auto *MaskVal = cast<ConstantSDNode>(Mask.getOperand(i));
47170 ShlVals.push_back(EltBitWidth - 1 -
47171 MaskVal->getAPIntValue().exactLogBase2());
47172 }
47173 // vsel ((X & C) == 0), LHS, RHS --> vsel ((shl X, C') < 0), RHS, LHS
47174 SDValue ShlAmt = getConstVector(ShlVals, VT.getSimpleVT(), DAG, DL);
47175 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And.getOperand(0), ShlAmt);
47176 SDValue NewCond =
47177 DAG.getSetCC(DL, CondVT, Shl, Cond.getOperand(1), ISD::SETLT);
47178 return DAG.getSelect(DL, VT, NewCond, RHS, LHS);
47179 }
47180 }
47181
47182 return SDValue();
47183}
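The final fold in combineSelect replaces a single-bit mask test with a shift into the sign bit. A scalar sketch of the equivalence (illustrative only; the helper names are invented): vselect((X & C) == 0, LHS, RHS) behaves like vselect((X << (BitWidth - 1 - log2(C))) < 0, RHS, LHS) when C is a power of two.

// Illustrative scalar check of the shift bit-hack for a power-of-2 mask C.
#include <cassert>
#include <cstdint>

static int32_t selectOnMaskedBit(int32_t X, uint32_t C, int32_t LHS, int32_t RHS) {
  return ((static_cast<uint32_t>(X) & C) == 0) ? LHS : RHS;
}

static int32_t selectOnShiftedSign(int32_t X, uint32_t C, int32_t LHS, int32_t RHS) {
  unsigned Log2C = 0;
  while ((C >> Log2C) != 1u)
    ++Log2C;                                  // C is a power of 2
  unsigned ShlAmt = 31 - Log2C;               // EltBitWidth - 1 - log2(C)
  uint32_t Shl = static_cast<uint32_t>(X) << ShlAmt;
  return (Shl & 0x80000000u) ? RHS : LHS;     // sign bit set, swapped arms
}

int main() {
  for (int32_t X : {0, 1, 2, 5, -8, 0x40000000})
    for (uint32_t C : {1u, 2u, 8u, 0x80000000u})
      assert(selectOnMaskedBit(X, C, 10, 20) == selectOnShiftedSign(X, C, 10, 20));
  return 0;
}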
47184
47185/// Combine:
47186/// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
47187/// to:
47188/// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
47189/// i.e., reusing the EFLAGS produced by the LOCKed instruction.
47190/// Note that this is only legal for some op/cc combinations.
47191static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
47192 SelectionDAG &DAG,
47193 const X86Subtarget &Subtarget) {
47194 // This combine only operates on CMP-like nodes.
47195 if (!(Cmp.getOpcode() == X86ISD::CMP ||
47196 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
47197 return SDValue();
47198
47199 // Can't replace the cmp if it has more uses than the one we're looking at.
47200 // FIXME: We would like to be able to handle this, but would need to make sure
47201 // all uses were updated.
47202 if (!Cmp.hasOneUse())
47203 return SDValue();
47204
47205 // This only applies to variations of the common case:
47206 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
47207 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
47208 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
47209 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
47210 // Using the proper condcodes (see below), overflow is checked for.
47211
47212 // FIXME: We can generalize both constraints:
47213 // - XOR/OR/AND (if they were made to survive AtomicExpand)
47214 // - LHS != 1
47215 // if the result is compared.
47216
47217 SDValue CmpLHS = Cmp.getOperand(0);
47218 SDValue CmpRHS = Cmp.getOperand(1);
47219 EVT CmpVT = CmpLHS.getValueType();
47220
47221 if (!CmpLHS.hasOneUse())
47222 return SDValue();
47223
47224 unsigned Opc = CmpLHS.getOpcode();
47225 if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
47226 return SDValue();
47227
47228 SDValue OpRHS = CmpLHS.getOperand(2);
47229 auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
47230 if (!OpRHSC)
47231 return SDValue();
47232
47233 APInt Addend = OpRHSC->getAPIntValue();
47234 if (Opc == ISD::ATOMIC_LOAD_SUB)
47235 Addend = -Addend;
47236
47237 auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
47238 if (!CmpRHSC)
47239 return SDValue();
47240
47241 APInt Comparison = CmpRHSC->getAPIntValue();
47242 APInt NegAddend = -Addend;
47243
47244 // See if we can adjust the CC to make the comparison match the negated
47245 // addend.
47246 if (Comparison != NegAddend) {
47247 APInt IncComparison = Comparison + 1;
47248 if (IncComparison == NegAddend) {
47249 if (CC == X86::COND_A && !Comparison.isMaxValue()) {
47250 Comparison = IncComparison;
47251 CC = X86::COND_AE;
47252 } else if (CC == X86::COND_LE && !Comparison.isMaxSignedValue()) {
47253 Comparison = IncComparison;
47254 CC = X86::COND_L;
47255 }
47256 }
47257 APInt DecComparison = Comparison - 1;
47258 if (DecComparison == NegAddend) {
47259 if (CC == X86::COND_AE && !Comparison.isMinValue()) {
47260 Comparison = DecComparison;
47261 CC = X86::COND_A;
47262 } else if (CC == X86::COND_L && !Comparison.isMinSignedValue()) {
47263 Comparison = DecComparison;
47264 CC = X86::COND_LE;
47265 }
47266 }
47267 }
47268
47269 // If the addend is the negation of the comparison value, then we can do
47270 // a full comparison by emitting the atomic arithmetic as a locked sub.
47271 if (Comparison == NegAddend) {
47272 // The CC is fine, but we need to rewrite the LHS of the comparison as an
47273 // atomic sub.
47274 auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
47275 auto AtomicSub = DAG.getAtomic(
47276 ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpVT,
47277 /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
47278 /*RHS*/ DAG.getConstant(NegAddend, SDLoc(CmpRHS), CmpVT),
47279 AN->getMemOperand());
47280 auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget);
47281 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
47282 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
47283 return LockOp;
47284 }
47285
47286 // We can handle comparisons with zero in a number of cases by manipulating
47287 // the CC used.
47288 if (!Comparison.isZero())
47289 return SDValue();
47290
47291 if (CC == X86::COND_S && Addend == 1)
47292 CC = X86::COND_LE;
47293 else if (CC == X86::COND_NS && Addend == 1)
47294 CC = X86::COND_G;
47295 else if (CC == X86::COND_G && Addend == -1)
47296 CC = X86::COND_GE;
47297 else if (CC == X86::COND_LE && Addend == -1)
47298 CC = X86::COND_L;
47299 else
47300 return SDValue();
47301
47302 SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
47303 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
47304 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
47305 return LockOp;
47306}
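A minimal C++ sketch (hypothetical function name, not part of this file) of a
source pattern that produces the DAG shape above: the prior value of an atomic
increment is only compared against zero, so the flags of the locked add can
answer the comparison once the condition code is adjusted (COND_S -> COND_LE).

#include <atomic>
bool was_negative_before_inc(std::atomic<long> &v) {
  // fetch_add returns the old value; "old < 0" is equivalent to testing
  // "new <= 0" on the EFLAGS produced by the lock add itself.
  return v.fetch_add(1) < 0;
}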
47307
47308// Check whether a boolean test is testing a boolean value generated by
47309// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
47310// code.
47311//
47312// Simplify the following patterns:
47313// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
47314// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
47315// to (Op EFLAGS Cond)
47316//
47317// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
47318// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
47319// to (Op EFLAGS !Cond)
47320//
47321// where Op could be BRCOND or CMOV.
47322//
47323static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
47324 // This combine only operates on CMP-like nodes.
47325 if (!(Cmp.getOpcode() == X86ISD::CMP ||
47326 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
47327 return SDValue();
47328
47329 // Quit if not used as a boolean value.
47330 if (CC != X86::COND_E && CC != X86::COND_NE)
47331 return SDValue();
47332
47333 // Check CMP operands. One of them should be 0 or 1 and the other should be
47334 // a SETCC or a value extended from it.
47335 SDValue Op1 = Cmp.getOperand(0);
47336 SDValue Op2 = Cmp.getOperand(1);
47337
47338 SDValue SetCC;
47339 const ConstantSDNode* C = nullptr;
47340 bool needOppositeCond = (CC == X86::COND_E);
47341 bool checkAgainstTrue = false; // Is it a comparison against 1?
47342
47343 if ((C = dyn_cast<ConstantSDNode>(Op1)))
47344 SetCC = Op2;
47345 else if ((C = dyn_cast<ConstantSDNode>(Op2)))
47346 SetCC = Op1;
47347 else // Quit if neither operand is a constant.
47348 return SDValue();
47349
47350 if (C->getZExtValue() == 1) {
47351 needOppositeCond = !needOppositeCond;
47352 checkAgainstTrue = true;
47353 } else if (C->getZExtValue() != 0)
47354 // Quit if the constant is neither 0 nor 1.
47355 return SDValue();
47356
47357 bool truncatedToBoolWithAnd = false;
47358 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
47359 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
47360 SetCC.getOpcode() == ISD::TRUNCATE ||
47361 SetCC.getOpcode() == ISD::AND) {
47362 if (SetCC.getOpcode() == ISD::AND) {
47363 int OpIdx = -1;
47364 if (isOneConstant(SetCC.getOperand(0)))
47365 OpIdx = 1;
47366 if (isOneConstant(SetCC.getOperand(1)))
47367 OpIdx = 0;
47368 if (OpIdx < 0)
47369 break;
47370 SetCC = SetCC.getOperand(OpIdx);
47371 truncatedToBoolWithAnd = true;
47372 } else
47373 SetCC = SetCC.getOperand(0);
47374 }
47375
47376 switch (SetCC.getOpcode()) {
47377 case X86ISD::SETCC_CARRY:
47378 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
47379 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
47380 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
47381 // truncated to i1 using 'and'.
47382 if (checkAgainstTrue && !truncatedToBoolWithAnd)
47383 break;
47384 assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
47385 "Invalid use of SETCC_CARRY!");
47386 [[fallthrough]];
47387 case X86ISD::SETCC:
47388 // Set the condition code or opposite one if necessary.
47389 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
47390 if (needOppositeCond)
47391 CC = X86::GetOppositeBranchCondition(CC);
47392 return SetCC.getOperand(1);
47393 case X86ISD::CMOV: {
47394 // Check whether false/true value has canonical one, i.e. 0 or 1.
47395 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
47396 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
47397 // Quit if true value is not a constant.
47398 if (!TVal)
47399 return SDValue();
47400 // Quit if false value is not a constant.
47401 if (!FVal) {
47402 SDValue Op = SetCC.getOperand(0);
47403 // Skip 'zext' or 'trunc' node.
47404 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
47405 Op.getOpcode() == ISD::TRUNCATE)
47406 Op = Op.getOperand(0);
47407 // A special case for rdrand/rdseed, where 0 is set if false cond is
47408 // found.
47409 if ((Op.getOpcode() != X86ISD::RDRAND &&
47410 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
47411 return SDValue();
47412 }
47413 // Quit if false value is not the constant 0 or 1.
47414 bool FValIsFalse = true;
47415 if (FVal && FVal->getZExtValue() != 0) {
47416 if (FVal->getZExtValue() != 1)
47417 return SDValue();
47418 // If FVal is 1, opposite cond is needed.
47419 needOppositeCond = !needOppositeCond;
47420 FValIsFalse = false;
47421 }
47422 // Quit if TVal is not the constant opposite of FVal.
47423 if (FValIsFalse && TVal->getZExtValue() != 1)
47424 return SDValue();
47425 if (!FValIsFalse && TVal->getZExtValue() != 0)
47426 return SDValue();
47427 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
47428 if (needOppositeCond)
47429 CC = X86::GetOppositeBranchCondition(CC);
47430 return SetCC.getOperand(3);
47431 }
47432 }
47433
47434 return SDValue();
47435}
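A hedged C++ sketch (hypothetical names) of the kind of source that can leave
a materialized boolean being re-compared against 0/1; at -O2 the middle end
often folds this earlier, but equivalent shapes still appear after
legalization, and this combine lets the CMOV/BRCOND read the original EFLAGS.

int pick(int x, int y, int a, int b) {
  bool lt = x < y;    // SETCC materializes an i1/i8 boolean
  return lt ? a : b;  // the select re-tests that boolean against zero
}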
47436
47437/// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
47438/// Match:
47439/// (X86or (X86setcc) (X86setcc))
47440/// (X86cmp (and (X86setcc) (X86setcc)), 0)
47441static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
47442 X86::CondCode &CC1, SDValue &Flags,
47443 bool &isAnd) {
47444 if (Cond->getOpcode() == X86ISD::CMP) {
47445 if (!isNullConstant(Cond->getOperand(1)))
47446 return false;
47447
47448 Cond = Cond->getOperand(0);
47449 }
47450
47451 isAnd = false;
47452
47453 SDValue SetCC0, SetCC1;
47454 switch (Cond->getOpcode()) {
47455 default: return false;
47456 case ISD::AND:
47457 case X86ISD::AND:
47458 isAnd = true;
47459 [[fallthrough]];
47460 case ISD::OR:
47461 case X86ISD::OR:
47462 SetCC0 = Cond->getOperand(0);
47463 SetCC1 = Cond->getOperand(1);
47464 break;
47465 };
47466
47467 // Make sure we have SETCC nodes, using the same flags value.
47468 if (SetCC0.getOpcode() != X86ISD::SETCC ||
47469 SetCC1.getOpcode() != X86ISD::SETCC ||
47470 SetCC0->getOperand(1) != SetCC1->getOperand(1))
47471 return false;
47472
47473 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
47474 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
47475 Flags = SetCC0->getOperand(1);
47476 return true;
47477}
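A small C++ sketch (hypothetical name): an ordered floating-point equality is
typically lowered on x86 as two SETCCs reading the same UCOMISS/COMISS flags
(ZF set and PF clear), and this helper recognizes their AND so the CMOV
combine further below can emit two CMOVs.

int select_if_equal(float a, float b, int t, int f) {
  return (a == b) ? t : f;  // sete && setnp over one flags value
}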
47478
47479// When legalizing carry, we create carries via add X, -1
47480// If that comes from an actual carry, via setcc, we use the
47481// carry directly.
47482static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) {
47483 if (EFLAGS.getOpcode() == X86ISD::ADD) {
47484 if (isAllOnesConstant(EFLAGS.getOperand(1))) {
47485 bool FoundAndLSB = false;
47486 SDValue Carry = EFLAGS.getOperand(0);
47487 while (Carry.getOpcode() == ISD::TRUNCATE ||
47488 Carry.getOpcode() == ISD::ZERO_EXTEND ||
47489 (Carry.getOpcode() == ISD::AND &&
47490 isOneConstant(Carry.getOperand(1)))) {
47491 FoundAndLSB |= Carry.getOpcode() == ISD::AND;
47492 Carry = Carry.getOperand(0);
47493 }
47494 if (Carry.getOpcode() == X86ISD::SETCC ||
47495 Carry.getOpcode() == X86ISD::SETCC_CARRY) {
47496 // TODO: Merge this code with equivalent in combineAddOrSubToADCOrSBB?
47497 uint64_t CarryCC = Carry.getConstantOperandVal(0);
47498 SDValue CarryOp1 = Carry.getOperand(1);
47499 if (CarryCC == X86::COND_B)
47500 return CarryOp1;
47501 if (CarryCC == X86::COND_A) {
47502 // Try to convert COND_A into COND_B in an attempt to facilitate
47503 // materializing "setb reg".
47504 //
47505 // Do not flip "e > c", where "c" is a constant, because Cmp
47506 // instruction cannot take an immediate as its first operand.
47507 //
47508 if (CarryOp1.getOpcode() == X86ISD::SUB &&
47509 CarryOp1.getNode()->hasOneUse() &&
47510 CarryOp1.getValueType().isInteger() &&
47511 !isa<ConstantSDNode>(CarryOp1.getOperand(1))) {
47512 SDValue SubCommute =
47513 DAG.getNode(X86ISD::SUB, SDLoc(CarryOp1), CarryOp1->getVTList(),
47514 CarryOp1.getOperand(1), CarryOp1.getOperand(0));
47515 return SDValue(SubCommute.getNode(), CarryOp1.getResNo());
47516 }
47517 }
47518 // If this is a check of the z flag of an add with 1, switch to the
47519 // C flag.
47520 if (CarryCC == X86::COND_E &&
47521 CarryOp1.getOpcode() == X86ISD::ADD &&
47522 isOneConstant(CarryOp1.getOperand(1)))
47523 return CarryOp1;
47524 } else if (FoundAndLSB) {
47525 SDLoc DL(Carry);
47526 SDValue BitNo = DAG.getConstant(0, DL, Carry.getValueType());
47527 if (Carry.getOpcode() == ISD::SRL) {
47528 BitNo = Carry.getOperand(1);
47529 Carry = Carry.getOperand(0);
47530 }
47531 return getBT(Carry, BitNo, DL, DAG);
47532 }
47533 }
47534 }
47535
47536 return SDValue();
47537}
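A hedged C++ sketch (hypothetical names) of a manual carry chain: the carry
bit is first produced by an unsigned compare (setb) and, during legalization,
may be turned back into CF via "add carry, -1"; the combine above forwards the
original carry so the final add can become an ADC. Whether this exact shape is
reached depends on how the optimizer recognizes the chain.

#include <cstdint>
void add128(uint64_t a_lo, uint64_t a_hi, uint64_t b_lo, uint64_t b_hi,
            uint64_t *lo, uint64_t *hi) {
  uint64_t l = a_lo + b_lo;
  uint64_t carry = l < a_lo;  // carry out of the low-half addition
  *lo = l;
  *hi = a_hi + b_hi + carry;  // ideally lowered as ADC using that carry
}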
47538
47539 /// If we are inverting a PTEST/TESTP operand, attempt to adjust the CC
47540/// to avoid the inversion.
47541static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,
47542 SelectionDAG &DAG,
47543 const X86Subtarget &Subtarget) {
47544 // TODO: Handle X86ISD::KTEST/X86ISD::KORTEST.
47545 if (EFLAGS.getOpcode() != X86ISD::PTEST &&
47546 EFLAGS.getOpcode() != X86ISD::TESTP)
47547 return SDValue();
47548
47549 // PTEST/TESTP sets EFLAGS as:
47550 // TESTZ: ZF = (Op0 & Op1) == 0
47551 // TESTC: CF = (~Op0 & Op1) == 0
47552 // TESTNZC: ZF == 0 && CF == 0
47553 MVT VT = EFLAGS.getSimpleValueType();
47554 SDValue Op0 = EFLAGS.getOperand(0);
47555 SDValue Op1 = EFLAGS.getOperand(1);
47556 MVT OpVT = Op0.getSimpleValueType();
47557
47558 // TEST*(~X,Y) == TEST*(X,Y)
47559 if (SDValue NotOp0 = IsNOT(Op0, DAG)) {
47560 X86::CondCode InvCC;
47561 switch (CC) {
47562 case X86::COND_B:
47563 // testc -> testz.
47564 InvCC = X86::COND_E;
47565 break;
47566 case X86::COND_AE:
47567 // !testc -> !testz.
47568 InvCC = X86::COND_NE;
47569 break;
47570 case X86::COND_E:
47571 // testz -> testc.
47572 InvCC = X86::COND_B;
47573 break;
47574 case X86::COND_NE:
47575 // !testz -> !testc.
47576 InvCC = X86::COND_AE;
47577 break;
47578 case X86::COND_A:
47579 case X86::COND_BE:
47580 // testnzc -> testnzc (no change).
47581 InvCC = CC;
47582 break;
47583 default:
47584 InvCC = X86::COND_INVALID;
47585 break;
47586 }
47587
47588 if (InvCC != X86::COND_INVALID) {
47589 CC = InvCC;
47590 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
47591 DAG.getBitcast(OpVT, NotOp0), Op1);
47592 }
47593 }
47594
47595 if (CC == X86::COND_B || CC == X86::COND_AE) {
47596 // TESTC(X,~X) == TESTC(X,-1)
47597 if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
47598 if (peekThroughBitcasts(NotOp1) == peekThroughBitcasts(Op0)) {
47599 SDLoc DL(EFLAGS);
47600 return DAG.getNode(
47601 EFLAGS.getOpcode(), DL, VT, DAG.getBitcast(OpVT, NotOp1),
47602 DAG.getBitcast(OpVT,
47603 DAG.getAllOnesConstant(DL, NotOp1.getValueType())));
47604 }
47605 }
47606 }
47607
47608 if (CC == X86::COND_E || CC == X86::COND_NE) {
47609 // TESTZ(X,~Y) == TESTC(Y,X)
47610 if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
47611 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
47612 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
47613 DAG.getBitcast(OpVT, NotOp1), Op0);
47614 }
47615
47616 if (Op0 == Op1) {
47617 SDValue BC = peekThroughBitcasts(Op0);
47618 EVT BCVT = BC.getValueType();
47619
47620 // TESTZ(AND(X,Y),AND(X,Y)) == TESTZ(X,Y)
47621 if (BC.getOpcode() == ISD::AND || BC.getOpcode() == X86ISD::FAND) {
47622 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
47623 DAG.getBitcast(OpVT, BC.getOperand(0)),
47624 DAG.getBitcast(OpVT, BC.getOperand(1)));
47625 }
47626
47627 // TESTZ(AND(~X,Y),AND(~X,Y)) == TESTC(X,Y)
47628 if (BC.getOpcode() == X86ISD::ANDNP || BC.getOpcode() == X86ISD::FANDN) {
47629 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
47630 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
47631 DAG.getBitcast(OpVT, BC.getOperand(0)),
47632 DAG.getBitcast(OpVT, BC.getOperand(1)));
47633 }
47634
47635 // If every element is an all-sign value, see if we can use TESTP/MOVMSK
47636 // to more efficiently extract the sign bits and compare that.
47637 // TODO: Handle TESTC with comparison inversion.
47638 // TODO: Can we remove SimplifyMultipleUseDemandedBits and rely on
47639 // TESTP/MOVMSK combines to make sure its never worse than PTEST?
47640 if (BCVT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(BCVT)) {
47641 unsigned EltBits = BCVT.getScalarSizeInBits();
47642 if (DAG.ComputeNumSignBits(BC) == EltBits) {
47643 assert(VT == MVT::i32 && "Expected i32 EFLAGS comparison result");
47644 APInt SignMask = APInt::getSignMask(EltBits);
47645 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47646 if (SDValue Res =
47647 TLI.SimplifyMultipleUseDemandedBits(BC, SignMask, DAG)) {
47649 // For vXi16 cases we need to use pmovmskb and extract every other
47649 // sign bit.
47650 SDLoc DL(EFLAGS);
47651 if ((EltBits == 32 || EltBits == 64) && Subtarget.hasAVX()) {
47652 MVT FloatSVT = MVT::getFloatingPointVT(EltBits);
47653 MVT FloatVT =
47654 MVT::getVectorVT(FloatSVT, OpVT.getSizeInBits() / EltBits);
47655 Res = DAG.getBitcast(FloatVT, Res);
47656 return DAG.getNode(X86ISD::TESTP, SDLoc(EFLAGS), VT, Res, Res);
47657 } else if (EltBits == 16) {
47658 MVT MovmskVT = BCVT.is128BitVector() ? MVT::v16i8 : MVT::v32i8;
47659 Res = DAG.getBitcast(MovmskVT, Res);
47660 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
47661 Res = DAG.getNode(ISD::AND, DL, MVT::i32, Res,
47662 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
47663 } else {
47664 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
47665 }
47666 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Res,
47667 DAG.getConstant(0, DL, MVT::i32));
47668 }
47669 }
47670 }
47671 }
47672
47673 // TESTZ(-1,X) == TESTZ(X,X)
47674 if (ISD::isBuildVectorAllOnes(Op0.getNode()))
47675 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op1, Op1);
47676
47677 // TESTZ(X,-1) == TESTZ(X,X)
47678 if (ISD::isBuildVectorAllOnes(Op1.getNode()))
47679 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0);
47680
47681 // TESTZ(OR(LO(X),HI(X)),OR(LO(Y),HI(Y))) -> TESTZ(X,Y)
47682 // TODO: Add COND_NE handling?
47683 if (CC == X86::COND_E && OpVT.is128BitVector() && Subtarget.hasAVX()) {
47684 SDValue Src0 = peekThroughBitcasts(Op0);
47685 SDValue Src1 = peekThroughBitcasts(Op1);
47686 if (Src0.getOpcode() == ISD::OR && Src1.getOpcode() == ISD::OR) {
47687 Src0 = getSplitVectorSrc(peekThroughBitcasts(Src0.getOperand(0)),
47688 peekThroughBitcasts(Src0.getOperand(1)), true);
47689 Src1 = getSplitVectorSrc(peekThroughBitcasts(Src1.getOperand(0)),
47690 peekThroughBitcasts(Src1.getOperand(1)), true);
47691 if (Src0 && Src1) {
47692 MVT OpVT2 = OpVT.getDoubleNumVectorElementsVT();
47693 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
47694 DAG.getBitcast(OpVT2, Src0),
47695 DAG.getBitcast(OpVT2, Src1));
47696 }
47697 }
47698 }
47699 }
47700
47701 return SDValue();
47702}
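An SSE4.1 intrinsics sketch (hypothetical function names) of the flag
semantics described above: _mm_testz_si128 reports ZF = ((a & b) == 0) and
_mm_testc_si128 reports CF = ((~a & b) == 0), which is why negating one
operand can instead be handled by switching between the ZF- and CF-based
condition codes.

#include <immintrin.h>
bool none_of_mask_set(__m128i v, __m128i mask) {
  return _mm_testz_si128(v, mask) != 0;  // ZF: (v & mask) == 0
}
bool all_of_mask_set(__m128i v, __m128i mask) {
  return _mm_testc_si128(v, mask) != 0;  // CF: (~v & mask) == 0
}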
47703
47704// Attempt to simplify the MOVMSK input based on the comparison type.
47705static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
47706 SelectionDAG &DAG,
47707 const X86Subtarget &Subtarget) {
47708 // Handle eq/ne against zero (any_of).
47709 // Handle eq/ne against -1 (all_of).
47710 if (!(CC == X86::COND_E || CC == X86::COND_NE))
47711 return SDValue();
47712 if (EFLAGS.getValueType() != MVT::i32)
47713 return SDValue();
47714 unsigned CmpOpcode = EFLAGS.getOpcode();
47715 if (CmpOpcode != X86ISD::CMP && CmpOpcode != X86ISD::SUB)
47716 return SDValue();
47717 auto *CmpConstant = dyn_cast<ConstantSDNode>(EFLAGS.getOperand(1));
47718 if (!CmpConstant)
47719 return SDValue();
47720 const APInt &CmpVal = CmpConstant->getAPIntValue();
47721
47722 SDValue CmpOp = EFLAGS.getOperand(0);
47723 unsigned CmpBits = CmpOp.getValueSizeInBits();
47724 assert(CmpBits == CmpVal.getBitWidth() && "Value size mismatch");
47725
47726 // Peek through any truncate.
47727 if (CmpOp.getOpcode() == ISD::TRUNCATE)
47728 CmpOp = CmpOp.getOperand(0);
47729
47730 // Bail if we don't find a MOVMSK.
47731 if (CmpOp.getOpcode() != X86ISD::MOVMSK)
47732 return SDValue();
47733
47734 SDValue Vec = CmpOp.getOperand(0);
47735 MVT VecVT = Vec.getSimpleValueType();
47736 assert((VecVT.is128BitVector() || VecVT.is256BitVector()) &&
47737 "Unexpected MOVMSK operand");
47738 unsigned NumElts = VecVT.getVectorNumElements();
47739 unsigned NumEltBits = VecVT.getScalarSizeInBits();
47740
47741 bool IsAnyOf = CmpOpcode == X86ISD::CMP && CmpVal.isZero();
47742 bool IsAllOf = (CmpOpcode == X86ISD::SUB || CmpOpcode == X86ISD::CMP) &&
47743 NumElts <= CmpBits && CmpVal.isMask(NumElts);
47744 if (!IsAnyOf && !IsAllOf)
47745 return SDValue();
47746
47747 // TODO: Check more combining cases.
47748 // Here we check the cmp's use count to decide whether to do the combine.
47749 // Currently only the two tested patterns, combining "MOVMSK(CONCAT(..))"
47750 // and "MOVMSK(PCMPEQ(..))", are known to benefit from this constraint.
47751 bool IsOneUse = CmpOp.getNode()->hasOneUse();
47752
47753 // See if we can peek through to a vector with a wider element type, if the
47754 // signbits extend down to all the sub-elements as well.
47755 // Calling MOVMSK with the wider type, avoiding the bitcast, helps expose
47756 // potential SimplifyDemandedBits/Elts cases.
47757 // If we looked through a truncate that discards bits, we can't do this
47758 // transform.
47759 // FIXME: We could do this transform for truncates that discarded bits by
47760 // inserting an AND mask between the new MOVMSK and the CMP.
47761 if (Vec.getOpcode() == ISD::BITCAST && NumElts <= CmpBits) {
47762 SDValue BC = peekThroughBitcasts(Vec);
47763 MVT BCVT = BC.getSimpleValueType();
47764 unsigned BCNumElts = BCVT.getVectorNumElements();
47765 unsigned BCNumEltBits = BCVT.getScalarSizeInBits();
47766 if ((BCNumEltBits == 32 || BCNumEltBits == 64) &&
47767 BCNumEltBits > NumEltBits &&
47768 DAG.ComputeNumSignBits(BC) > (BCNumEltBits - NumEltBits)) {
47769 SDLoc DL(EFLAGS);
47770 APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : BCNumElts);
47771 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
47772 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, BC),
47773 DAG.getConstant(CmpMask, DL, MVT::i32));
47774 }
47775 }
47776
47777 // MOVMSK(CONCAT(X,Y)) == 0 -> MOVMSK(OR(X,Y)).
47778 // MOVMSK(CONCAT(X,Y)) != 0 -> MOVMSK(OR(X,Y)).
47779 // MOVMSK(CONCAT(X,Y)) == -1 -> MOVMSK(AND(X,Y)).
47780 // MOVMSK(CONCAT(X,Y)) != -1 -> MOVMSK(AND(X,Y)).
47781 if (VecVT.is256BitVector() && NumElts <= CmpBits && IsOneUse) {
47782 SmallVector<SDValue> Ops;
47783 if (collectConcatOps(peekThroughBitcasts(Vec).getNode(), Ops, DAG) &&
47784 Ops.size() == 2) {
47785 SDLoc DL(EFLAGS);
47786 EVT SubVT = Ops[0].getValueType().changeTypeToInteger();
47787 APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : NumElts / 2);
47788 SDValue V = DAG.getNode(IsAnyOf ? ISD::OR : ISD::AND, DL, SubVT,
47789 DAG.getBitcast(SubVT, Ops[0]),
47790 DAG.getBitcast(SubVT, Ops[1]));
47791 V = DAG.getBitcast(VecVT.getHalfNumVectorElementsVT(), V);
47792 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
47793 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V),
47794 DAG.getConstant(CmpMask, DL, MVT::i32));
47795 }
47796 }
47797
47798 // MOVMSK(PCMPEQ(X,0)) == -1 -> PTESTZ(X,X).
47799 // MOVMSK(PCMPEQ(X,0)) != -1 -> !PTESTZ(X,X).
47800 // MOVMSK(PCMPEQ(X,Y)) == -1 -> PTESTZ(XOR(X,Y),XOR(X,Y)).
47801 // MOVMSK(PCMPEQ(X,Y)) != -1 -> !PTESTZ(XOR(X,Y),XOR(X,Y)).
47802 if (IsAllOf && Subtarget.hasSSE41() && IsOneUse) {
47803 MVT TestVT = VecVT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
47804 SDValue BC = peekThroughBitcasts(Vec);
47805 // Ensure MOVMSK was testing every signbit of BC.
47806 if (BC.getValueType().getVectorNumElements() <= NumElts) {
47807 if (BC.getOpcode() == X86ISD::PCMPEQ) {
47808 SDValue V = DAG.getNode(ISD::XOR, SDLoc(BC), BC.getValueType(),
47809 BC.getOperand(0), BC.getOperand(1));
47810 V = DAG.getBitcast(TestVT, V);
47811 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
47812 }
47813 // Check for 256-bit split vector cases.
47814 if (BC.getOpcode() == ISD::AND &&
47815 BC.getOperand(0).getOpcode() == X86ISD::PCMPEQ &&
47816 BC.getOperand(1).getOpcode() == X86ISD::PCMPEQ) {
47817 SDValue LHS = BC.getOperand(0);
47818 SDValue RHS = BC.getOperand(1);
47819 LHS = DAG.getNode(ISD::XOR, SDLoc(LHS), LHS.getValueType(),
47820 LHS.getOperand(0), LHS.getOperand(1));
47821 RHS = DAG.getNode(ISD::XOR, SDLoc(RHS), RHS.getValueType(),
47822 RHS.getOperand(0), RHS.getOperand(1));
47823 LHS = DAG.getBitcast(TestVT, LHS);
47824 RHS = DAG.getBitcast(TestVT, RHS);
47825 SDValue V = DAG.getNode(ISD::OR, SDLoc(EFLAGS), TestVT, LHS, RHS);
47826 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
47827 }
47828 }
47829 }
47830
47831 // See if we can avoid a PACKSS by calling MOVMSK on the sources.
47832 // For vXi16 cases we can use a v2Xi8 PMOVMSKB. We must mask out
47833 // sign bits prior to the comparison with zero unless we know that
47834 // the vXi16 splats the sign bit down to the lower i8 half.
47835 // TODO: Handle all_of patterns.
47836 if (Vec.getOpcode() == X86ISD::PACKSS && VecVT == MVT::v16i8) {
47837 SDValue VecOp0 = Vec.getOperand(0);
47838 SDValue VecOp1 = Vec.getOperand(1);
47839 bool SignExt0 = DAG.ComputeNumSignBits(VecOp0) > 8;
47840 bool SignExt1 = DAG.ComputeNumSignBits(VecOp1) > 8;
47841 // PMOVMSKB(PACKSSBW(X, undef)) -> PMOVMSKB(BITCAST_v16i8(X)) & 0xAAAA.
47842 if (IsAnyOf && CmpBits == 8 && VecOp1.isUndef()) {
47843 SDLoc DL(EFLAGS);
47844 SDValue Result = DAG.getBitcast(MVT::v16i8, VecOp0);
47845 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
47846 Result = DAG.getZExtOrTrunc(Result, DL, MVT::i16);
47847 if (!SignExt0) {
47848 Result = DAG.getNode(ISD::AND, DL, MVT::i16, Result,
47849 DAG.getConstant(0xAAAA, DL, MVT::i16));
47850 }
47851 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
47852 DAG.getConstant(0, DL, MVT::i16));
47853 }
47854 // PMOVMSKB(PACKSSBW(LO(X), HI(X)))
47855 // -> PMOVMSKB(BITCAST_v32i8(X)) & 0xAAAAAAAA.
47856 if (CmpBits >= 16 && Subtarget.hasInt256() &&
47857 (IsAnyOf || (SignExt0 && SignExt1))) {
47858 if (SDValue Src = getSplitVectorSrc(VecOp0, VecOp1, true)) {
47859 SDLoc DL(EFLAGS);
47860 SDValue Result = peekThroughBitcasts(Src);
47861 if (IsAllOf && Result.getOpcode() == X86ISD::PCMPEQ &&
47862 Result.getValueType().getVectorNumElements() <= NumElts) {
47863 SDValue V = DAG.getNode(ISD::XOR, DL, Result.getValueType(),
47864 Result.getOperand(0), Result.getOperand(1));
47865 V = DAG.getBitcast(MVT::v4i64, V);
47866 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
47867 }
47868 Result = DAG.getBitcast(MVT::v32i8, Result);
47869 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
47870 unsigned CmpMask = IsAnyOf ? 0 : 0xFFFFFFFF;
47871 if (!SignExt0 || !SignExt1) {
47872 assert(IsAnyOf &&
47873 "Only perform v16i16 signmasks for any_of patterns");
47874 Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result,
47875 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
47876 }
47877 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
47878 DAG.getConstant(CmpMask, DL, MVT::i32));
47879 }
47880 }
47881 }
47882
47883 // MOVMSK(SHUFFLE(X,u)) -> MOVMSK(X) iff every element is referenced.
47884 SmallVector<int, 32> ShuffleMask;
47885 SmallVector<SDValue, 2> ShuffleInputs;
47886 if (NumElts <= CmpBits &&
47887 getTargetShuffleInputs(peekThroughBitcasts(Vec), ShuffleInputs,
47888 ShuffleMask, DAG) &&
47889 ShuffleInputs.size() == 1 && !isAnyZeroOrUndef(ShuffleMask) &&
47890 ShuffleInputs[0].getValueSizeInBits() == VecVT.getSizeInBits()) {
47891 unsigned NumShuffleElts = ShuffleMask.size();
47892 APInt DemandedElts = APInt::getZero(NumShuffleElts);
47893 for (int M : ShuffleMask) {
47894 assert(0 <= M && M < (int)NumShuffleElts && "Bad unary shuffle index");
47895 DemandedElts.setBit(M);
47896 }
47897 if (DemandedElts.isAllOnes()) {
47898 SDLoc DL(EFLAGS);
47899 SDValue Result = DAG.getBitcast(VecVT, ShuffleInputs[0]);
47900 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
47901 Result =
47902 DAG.getZExtOrTrunc(Result, DL, EFLAGS.getOperand(0).getValueType());
47903 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
47904 EFLAGS.getOperand(1));
47905 }
47906 }
47907
47908 // MOVMSKPS(V) !=/== 0 -> TESTPS(V,V)
47909 // MOVMSKPD(V) !=/== 0 -> TESTPD(V,V)
47910 // MOVMSKPS(V) !=/== -1 -> TESTPS(V,V)
47911 // MOVMSKPD(V) !=/== -1 -> TESTPD(V,V)
47912 // iff every element is referenced.
47913 if (NumElts <= CmpBits && Subtarget.hasAVX() && IsOneUse &&
47914 (NumEltBits == 32 || NumEltBits == 64)) {
47915 SDLoc DL(EFLAGS);
47916 MVT FloatSVT = MVT::getFloatingPointVT(NumEltBits);
47917 MVT FloatVT = MVT::getVectorVT(FloatSVT, NumElts);
47918 MVT IntVT = FloatVT.changeVectorElementTypeToInteger();
47919 SDValue LHS = Vec;
47920 SDValue RHS = IsAnyOf ? Vec : DAG.getAllOnesConstant(DL, IntVT);
47921 CC = IsAnyOf ? CC : (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
47922 return DAG.getNode(X86ISD::TESTP, DL, MVT::i32,
47923 DAG.getBitcast(FloatVT, LHS),
47924 DAG.getBitcast(FloatVT, RHS));
47925 }
47926
47927 return SDValue();
47928}
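An intrinsics sketch (hypothetical function names) of the any_of/all_of
reductions this combine simplifies: the sign-bit mask from movmskps compared
against 0 or against all-ones is the EQ/NE pattern handled above, which can
then be rewritten into TESTPS/PTEST forms when profitable.

#include <immintrin.h>
bool any_lane_negative(__m128 v) {
  return _mm_movemask_ps(v) != 0;    // any_of: mask != 0
}
bool all_lanes_negative(__m128 v) {
  return _mm_movemask_ps(v) == 0xF;  // all_of: mask == all-ones for 4 lanes
}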
47929
47930/// Optimize an EFLAGS definition used according to the condition code \p CC
47931/// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
47932/// uses of chain values.
47933static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
47934 SelectionDAG &DAG,
47935 const X86Subtarget &Subtarget) {
47936 if (CC == X86::COND_B)
47937 if (SDValue Flags = combineCarryThroughADD(EFLAGS, DAG))
47938 return Flags;
47939
47940 if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
47941 return R;
47942
47943 if (SDValue R = combinePTESTCC(EFLAGS, CC, DAG, Subtarget))
47944 return R;
47945
47946 if (SDValue R = combineSetCCMOVMSK(EFLAGS, CC, DAG, Subtarget))
47947 return R;
47948
47949 return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
47950}
47951
47952/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
47953static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
47954 TargetLowering::DAGCombinerInfo &DCI,
47955 const X86Subtarget &Subtarget) {
47956 SDLoc DL(N);
47957
47958 SDValue FalseOp = N->getOperand(0);
47959 SDValue TrueOp = N->getOperand(1);
47960 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
47961 SDValue Cond = N->getOperand(3);
47962
47963 // cmov X, X, ?, ? --> X
47964 if (TrueOp == FalseOp)
47965 return TrueOp;
47966
47967 // Try to simplify the EFLAGS and condition code operands.
47968 // We can't always do this as FCMOV only supports a subset of X86 cond.
47969 if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
47970 if (!(FalseOp.getValueType() == MVT::f80 ||
47971 (FalseOp.getValueType() == MVT::f64 && !Subtarget.hasSSE2()) ||
47972 (FalseOp.getValueType() == MVT::f32 && !Subtarget.hasSSE1())) ||
47973 !Subtarget.canUseCMOV() || hasFPCMov(CC)) {
47974 SDValue Ops[] = {FalseOp, TrueOp, DAG.getTargetConstant(CC, DL, MVT::i8),
47975 Flags};
47976 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
47977 }
47978 }
47979
47980 // If this is a select between two integer constants, try to do some
47981 // optimizations. Note that the operands are ordered the opposite of SELECT
47982 // operands.
47983 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
47984 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
47985 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
47986 // larger than FalseC (the false value).
47987 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
47988 CC = X86::GetOppositeBranchCondition(CC);
47989 std::swap(TrueC, FalseC);
47990 std::swap(TrueOp, FalseOp);
47991 }
47992
47993 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
47994 // This is efficient for any integer data type (including i8/i16) and
47995 // shift amount.
47996 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
47997 Cond = getSETCC(CC, Cond, DL, DAG);
47998
47999 // Zero extend the condition if needed.
48000 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
48001
48002 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
48003 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
48004 DAG.getConstant(ShAmt, DL, MVT::i8));
48005 return Cond;
48006 }
48007
48008 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst. This is efficient
48009 // for any integer data type, including i8/i16.
48010 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
48011 Cond = getSETCC(CC, Cond, DL, DAG);
48012
48013 // Zero extend the condition if needed.
48014 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
48015 FalseC->getValueType(0), Cond);
48016 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
48017 SDValue(FalseC, 0));
48018 return Cond;
48019 }
48020
48021 // Optimize cases that will turn into an LEA instruction. This requires
48022 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
48023 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
48024 APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
48025 assert(Diff.getBitWidth() == N->getValueType(0).getSizeInBits() &&
48026 "Implicit constant truncation");
48027
48028 bool isFastMultiplier = false;
48029 if (Diff.ult(10)) {
48030 switch (Diff.getZExtValue()) {
48031 default: break;
48032 case 1: // result = add base, cond
48033 case 2: // result = lea base( , cond*2)
48034 case 3: // result = lea base(cond, cond*2)
48035 case 4: // result = lea base( , cond*4)
48036 case 5: // result = lea base(cond, cond*4)
48037 case 8: // result = lea base( , cond*8)
48038 case 9: // result = lea base(cond, cond*8)
48039 isFastMultiplier = true;
48040 break;
48041 }
48042 }
48043
48044 if (isFastMultiplier) {
48045 Cond = getSETCC(CC, Cond, DL ,DAG);
48046 // Zero extend the condition if needed.
48047 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
48048 Cond);
48049 // Scale the condition by the difference.
48050 if (Diff != 1)
48051 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
48052 DAG.getConstant(Diff, DL, Cond.getValueType()));
48053
48054 // Add the base if non-zero.
48055 if (FalseC->getAPIntValue() != 0)
48056 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
48057 SDValue(FalseC, 0));
48058 return Cond;
48059 }
48060 }
48061 }
48062 }
48063
48064 // Handle these cases:
48065 // (select (x != c), e, c) -> select (x != c), e, x),
48066 // (select (x == c), c, e) -> select (x == c), x, e)
48067 // where the c is an integer constant, and the "select" is the combination
48068 // of CMOV and CMP.
48069 //
48070 // The rationale for this change is that the conditional-move from a constant
48071 // needs two instructions, however, conditional-move from a register needs
48072 // only one instruction.
48073 //
48074 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
48075 // some instruction-combining opportunities. This opt needs to be
48076 // postponed as late as possible.
48077 //
48078 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
48079 // the DCI.xxxx conditions are provided to postpone the optimization as
48080 // late as possible.
48081
48082 ConstantSDNode *CmpAgainst = nullptr;
48083 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
48084 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
48085 !isa<ConstantSDNode>(Cond.getOperand(0))) {
48086
48087 if (CC == X86::COND_NE &&
48088 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
48089 CC = X86::GetOppositeBranchCondition(CC);
48090 std::swap(TrueOp, FalseOp);
48091 }
48092
48093 if (CC == X86::COND_E &&
48094 CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
48095 SDValue Ops[] = {FalseOp, Cond.getOperand(0),
48096 DAG.getTargetConstant(CC, DL, MVT::i8), Cond};
48097 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
48098 }
48099 }
48100 }
48101
48102 // Transform:
48103 //
48104 // (cmov 1 T (uge T 2))
48105 //
48106 // to:
48107 //
48108 // (adc T 0 (sub T 1))
48109 if (CC == X86::COND_AE && isOneConstant(FalseOp) &&
48110 Cond.getOpcode() == X86ISD::SUB && Cond->hasOneUse()) {
48111 SDValue Cond0 = Cond.getOperand(0);
48112 if (Cond0.getOpcode() == ISD::TRUNCATE)
48113 Cond0 = Cond0.getOperand(0);
48114 auto *Sub1C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
48115 if (Cond0 == TrueOp && Sub1C && Sub1C->getZExtValue() == 2) {
48116 EVT CondVT = Cond->getValueType(0);
48117 EVT OuterVT = N->getValueType(0);
48118 // Subtract 1 and generate a carry.
48119 SDValue NewSub =
48120 DAG.getNode(X86ISD::SUB, DL, Cond->getVTList(), Cond.getOperand(0),
48121 DAG.getConstant(1, DL, CondVT));
48122 SDValue EFLAGS(NewSub.getNode(), 1);
48123 return DAG.getNode(X86ISD::ADC, DL, DAG.getVTList(OuterVT, MVT::i32),
48124 TrueOp, DAG.getConstant(0, DL, OuterVT), EFLAGS);
48125 }
48126 }
48127
48128 // Fold and/or of setcc's to double CMOV:
48129 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
48130 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
48131 //
48132 // This combine lets us generate:
48133 // cmovcc1 (jcc1 if we don't have CMOV)
48134 // cmovcc2 (same)
48135 // instead of:
48136 // setcc1
48137 // setcc2
48138 // and/or
48139 // cmovne (jne if we don't have CMOV)
48140 // When we can't use the CMOV instruction, it might increase branch
48141 // mispredicts.
48142 // When we can use CMOV, or when there is no mispredict, this improves
48143 // throughput and reduces register pressure.
48144 //
48145 if (CC == X86::COND_NE) {
48146 SDValue Flags;
48147 X86::CondCode CC0, CC1;
48148 bool isAndSetCC;
48149 if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
48150 if (isAndSetCC) {
48151 std::swap(FalseOp, TrueOp);
48152 CC0 = X86::GetOppositeBranchCondition(CC0);
48153 CC1 = X86::GetOppositeBranchCondition(CC1);
48154 }
48155
48156 SDValue LOps[] = {FalseOp, TrueOp,
48157 DAG.getTargetConstant(CC0, DL, MVT::i8), Flags};
48158 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps);
48159 SDValue Ops[] = {LCMOV, TrueOp, DAG.getTargetConstant(CC1, DL, MVT::i8),
48160 Flags};
48161 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
48162 return CMOV;
48163 }
48164 }
48165
48166 // Fold (CMOV C1, (ADD (CTTZ X), C2), (X != 0)) ->
48167 // (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2)
48168 // Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) ->
48169 // (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2)
48170 if ((CC == X86::COND_NE || CC == X86::COND_E) &&
48171 Cond.getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1))) {
48172 SDValue Add = TrueOp;
48173 SDValue Const = FalseOp;
48174 // Canonicalize the condition code for easier matching and output.
48175 if (CC == X86::COND_E)
48176 std::swap(Add, Const);
48177
48178 // We might have replaced the constant in the cmov with the LHS of the
48179 // compare. If so change it to the RHS of the compare.
48180 if (Const == Cond.getOperand(0))
48181 Const = Cond.getOperand(1);
48182
48183 // Ok, now make sure that Add is (add (cttz X), C2) and Const is a constant.
48184 if (isa<ConstantSDNode>(Const) && Add.getOpcode() == ISD::ADD &&
48185 Add.hasOneUse() && isa<ConstantSDNode>(Add.getOperand(1)) &&
48186 (Add.getOperand(0).getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
48187 Add.getOperand(0).getOpcode() == ISD::CTTZ) &&
48188 Add.getOperand(0).getOperand(0) == Cond.getOperand(0)) {
48189 EVT VT = N->getValueType(0);
48190 // This should constant fold.
48191 SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1));
48192 SDValue CMov =
48193 DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0),
48194 DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8), Cond);
48195 return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1));
48196 }
48197 }
48198
48199 return SDValue();
48200}
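Hedged C++ examples (hypothetical names) of selects between integer constants
in the shapes handled above; depending on what the middle end already folded,
these can be lowered with flag arithmetic or an LEA instead of a CMOV.

int pow2_or_zero(int x) { return x < 0 ? 8 : 0; }   // zext(setcc) << 3
int adjacent(int x)     { return x < 0 ? 5 : 4; }   // zext(setcc) + 4
int lea_scaled(int x)   { return x < 0 ? 13 : 4; }  // zext(setcc)*9 + 4 via LEA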
48201
48202/// Different mul shrinking modes.
48203enum class ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
48204
48205static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
48206 EVT VT = N->getOperand(0).getValueType();
48207 if (VT.getScalarSizeInBits() != 32)
48208 return false;
48209
48210 assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
48211 unsigned SignBits[2] = {1, 1};
48212 bool IsPositive[2] = {false, false};
48213 for (unsigned i = 0; i < 2; i++) {
48214 SDValue Opd = N->getOperand(i);
48215
48216 SignBits[i] = DAG.ComputeNumSignBits(Opd);
48217 IsPositive[i] = DAG.SignBitIsZero(Opd);
48218 }
48219
48220 bool AllPositive = IsPositive[0] && IsPositive[1];
48221 unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
48222 // When ranges are from -128 ~ 127, use MULS8 mode.
48223 if (MinSignBits >= 25)
48224 Mode = ShrinkMode::MULS8;
48225 // When ranges are from 0 ~ 255, use MULU8 mode.
48226 else if (AllPositive && MinSignBits >= 24)
48227 Mode = ShrinkMode::MULU8;
48228 // When ranges are from -32768 ~ 32767, use MULS16 mode.
48229 else if (MinSignBits >= 17)
48230 Mode = ShrinkMode::MULS16;
48231 // When ranges are from 0 ~ 65535, use MULU16 mode.
48232 else if (AllPositive && MinSignBits >= 16)
48233 Mode = ShrinkMode::MULU16;
48234 else
48235 return false;
48236 return true;
48237}
48238
48239/// When the operands of vector mul are extended from smaller size values,
48240 /// like i8 and i16, the type of mul may be shrunk to generate more
48241/// efficient code. Two typical patterns are handled:
48242/// Pattern1:
48243/// %2 = sext/zext <N x i8> %1 to <N x i32>
48244/// %4 = sext/zext <N x i8> %3 to <N x i32>
48245 /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
48246/// %5 = mul <N x i32> %2, %4
48247///
48248/// Pattern2:
48249/// %2 = zext/sext <N x i16> %1 to <N x i32>
48250/// %4 = zext/sext <N x i16> %3 to <N x i32>
48251/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
48252/// %5 = mul <N x i32> %2, %4
48253///
48254/// There are four mul shrinking modes:
48255/// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
48256 /// -128 to 127, and the scalar value range of %4 is also -128 to 127,
48257/// generate pmullw+sext32 for it (MULS8 mode).
48258/// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
48259/// 0 to 255, and the scalar value range of %4 is also 0 to 255,
48260/// generate pmullw+zext32 for it (MULU8 mode).
48261/// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
48262/// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
48263/// generate pmullw+pmulhw for it (MULS16 mode).
48264/// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
48265/// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
48266/// generate pmullw+pmulhuw for it (MULU16 mode).
48267static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
48268 const X86Subtarget &Subtarget) {
48269 // Check for legality
48270 // pmullw/pmulhw are not available before SSE2.
48271 if (!Subtarget.hasSSE2())
48272 return SDValue();
48273
48274 // Check for profitability
48275 // pmulld is supported since SSE41. It is better to use pmulld
48276 // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
48277 // the expansion.
48278 bool OptForMinSize = DAG.getMachineFunction().getFunction().hasMinSize();
48279 if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
48280 return SDValue();
48281
48282 ShrinkMode Mode;
48283 if (!canReduceVMulWidth(N, DAG, Mode))
48284 return SDValue();
48285
48286 SDLoc DL(N);
48287 SDValue N0 = N->getOperand(0);
48288 SDValue N1 = N->getOperand(1);
48289 EVT VT = N->getOperand(0).getValueType();
48290 unsigned NumElts = VT.getVectorNumElements();
48291 if ((NumElts % 2) != 0)
48292 return SDValue();
48293
48294 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
48295
48296 // Shrink the operands of mul.
48297 SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
48298 SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
48299
48300 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
48301 // lower part is needed.
48302 SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
48303 if (Mode == ShrinkMode::MULU8 || Mode == ShrinkMode::MULS8)
48304 return DAG.getNode((Mode == ShrinkMode::MULU8) ? ISD::ZERO_EXTEND
48305 : ISD::SIGN_EXTEND,
48306 DL, VT, MulLo);
48307
48308 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts / 2);
48309 // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
48310 // the higher part is also needed.
48311 SDValue MulHi =
48312 DAG.getNode(Mode == ShrinkMode::MULS16 ? ISD::MULHS : ISD::MULHU, DL,
48313 ReducedVT, NewN0, NewN1);
48314
48315 // Repack the lower part and higher part result of mul into a wider
48316 // result.
48317 // Generate shuffle functioning as punpcklwd.
48318 SmallVector<int, 16> ShuffleMask(NumElts);
48319 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
48320 ShuffleMask[2 * i] = i;
48321 ShuffleMask[2 * i + 1] = i + NumElts;
48322 }
48323 SDValue ResLo =
48324 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
48325 ResLo = DAG.getBitcast(ResVT, ResLo);
48326 // Generate shuffle functioning as punpckhwd.
48327 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
48328 ShuffleMask[2 * i] = i + NumElts / 2;
48329 ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
48330 }
48331 SDValue ResHi =
48332 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
48333 ResHi = DAG.getBitcast(ResVT, ResHi);
48334 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
48335}
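A C++ sketch (hypothetical names) of the MULU16 shape described above,
assuming the loop is auto-vectorized: i32 lanes that are zero extensions of
u16 values, so the 32-bit products can be rebuilt from pmullw + pmulhuw
instead of a full 32-bit multiply.

#include <cstdint>
void mul_u16_widened(const uint16_t *a, const uint16_t *b, uint32_t *out,
                     int n) {
  for (int i = 0; i < n; ++i)
    out[i] = uint32_t(a[i]) * uint32_t(b[i]);  // zext to i32, then mul
}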
48336
48337static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
48338 EVT VT, const SDLoc &DL) {
48339
48340 auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
48341 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
48342 DAG.getConstant(Mult, DL, VT));
48343 Result = DAG.getNode(ISD::SHL, DL, VT, Result,
48344 DAG.getConstant(Shift, DL, MVT::i8));
48345 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
48346 N->getOperand(0));
48347 return Result;
48348 };
48349
48350 auto combineMulMulAddOrSub = [&](int Mul1, int Mul2, bool isAdd) {
48351 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
48352 DAG.getConstant(Mul1, DL, VT));
48353 Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, Result,
48354 DAG.getConstant(Mul2, DL, VT));
48355 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
48356 N->getOperand(0));
48357 return Result;
48358 };
48359
48360 switch (MulAmt) {
48361 default:
48362 break;
48363 case 11:
48364 // mul x, 11 => add ((shl (mul x, 5), 1), x)
48365 return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
48366 case 21:
48367 // mul x, 21 => add ((shl (mul x, 5), 2), x)
48368 return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
48369 case 41:
48370 // mul x, 41 => add ((shl (mul x, 5), 3), x)
48371 return combineMulShlAddOrSub(5, 3, /*isAdd*/ true);
48372 case 22:
48373 // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
48374 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
48375 combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
48376 case 19:
48377 // mul x, 19 => add ((shl (mul x, 9), 1), x)
48378 return combineMulShlAddOrSub(9, 1, /*isAdd*/ true);
48379 case 37:
48380 // mul x, 37 => add ((shl (mul x, 9), 2), x)
48381 return combineMulShlAddOrSub(9, 2, /*isAdd*/ true);
48382 case 73:
48383 // mul x, 73 => add ((shl (mul x, 9), 3), x)
48384 return combineMulShlAddOrSub(9, 3, /*isAdd*/ true);
48385 case 13:
48386 // mul x, 13 => add ((shl (mul x, 3), 2), x)
48387 return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
48388 case 23:
48389 // mul x, 23 => sub ((shl (mul x, 3), 3), x)
48390 return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
48391 case 26:
48392 // mul x, 26 => add ((mul (mul x, 5), 5), x)
48393 return combineMulMulAddOrSub(5, 5, /*isAdd*/ true);
48394 case 28:
48395 // mul x, 28 => add ((mul (mul x, 9), 3), x)
48396 return combineMulMulAddOrSub(9, 3, /*isAdd*/ true);
48397 case 29:
48398 // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
48399 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
48400 combineMulMulAddOrSub(9, 3, /*isAdd*/ true));
48401 }
48402
48403 // Another trick. If this is a power of 2 plus 2/4/8, we can use a shift
48404 // followed by a single LEA.
48405 // First check whether this is a sum of two powers of 2, because that's easy.
48406 // Then count the trailing zeros up to the first set bit.
48407 // TODO: We can do this even without LEA at a cost of two shifts and an add.
48408 if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
48409 unsigned ScaleShift = llvm::countr_zero(MulAmt);
48410 if (ScaleShift >= 1 && ScaleShift < 4) {
48411 unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
48412 SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
48413 DAG.getConstant(ShiftAmt, DL, MVT::i8));
48414 SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
48415 DAG.getConstant(ScaleShift, DL, MVT::i8));
48416 return DAG.getNode(ISD::ADD, DL, VT, Shift1, Shift2);
48417 }
48418 }
48419
48420 return SDValue();
48421}
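Hedged scalar examples (hypothetical names) of constant multiplies decomposed
by the helper above rather than emitted as an imul: 11 via its switch case and
34 via the sum-of-two-powers-of-2 trick.

long times11(long x) { return x * 11; }  // ((x*5) << 1) + x, with x*5 an LEA
long times34(long x) { return x * 34; }  // (x << 5) + (x << 1)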
48422
48423 // If the upper 17 bits of one element are zero and the upper bits of the other
48424 // element are all zero/sign bits, then we can use PMADDWD, which is always at least as quick as
48425// PMULLD, except on KNL.
48426static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
48427 const X86Subtarget &Subtarget) {
48428 if (!Subtarget.hasSSE2())
48429 return SDValue();
48430
48431 if (Subtarget.isPMADDWDSlow())
48432 return SDValue();
48433
48434 EVT VT = N->getValueType(0);
48435
48436 // Only support vXi32 vectors.
48437 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
48438 return SDValue();
48439
48440 // Make sure the type is legal or can split/widen to a legal type.
48441 // With AVX512 but without BWI, we would need to split v32i16.
48442 unsigned NumElts = VT.getVectorNumElements();
48443 if (NumElts == 1 || !isPowerOf2_32(NumElts))
48444 return SDValue();
48445
48446 // With AVX512 but without BWI, we would need to split v32i16.
48447 if (32 <= (2 * NumElts) && Subtarget.hasAVX512() && !Subtarget.hasBWI())
48448 return SDValue();
48449
48450 SDValue N0 = N->getOperand(0);
48451 SDValue N1 = N->getOperand(1);
48452
48453 // If we are zero/sign extending two steps without SSE4.1, it's better to
48454 // reduce the vmul width instead.
48455 if (!Subtarget.hasSSE41() &&
48456 (((N0.getOpcode() == ISD::ZERO_EXTEND &&
48457 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
48458 (N1.getOpcode() == ISD::ZERO_EXTEND &&
48459 N1.getOperand(0).getScalarValueSizeInBits() <= 8)) ||
48460 ((N0.getOpcode() == ISD::SIGN_EXTEND &&
48461 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
48462 (N1.getOpcode() == ISD::SIGN_EXTEND &&
48463 N1.getOperand(0).getScalarValueSizeInBits() <= 8))))
48464 return SDValue();
48465
48466 // If we are sign extending a wide vector without SSE4.1, it's better to reduce
48467 // the vmul width instead.
48468 if (!Subtarget.hasSSE41() &&
48469 (N0.getOpcode() == ISD::SIGN_EXTEND &&
48470 N0.getOperand(0).getValueSizeInBits() > 128) &&
48471 (N1.getOpcode() == ISD::SIGN_EXTEND &&
48472 N1.getOperand(0).getValueSizeInBits() > 128))
48473 return SDValue();
48474
48475 // Sign bits must extend down to the lowest i16.
48476 if (DAG.ComputeMaxSignificantBits(N1) > 16 ||
48477 DAG.ComputeMaxSignificantBits(N0) > 16)
48478 return SDValue();
48479
48480 // At least one of the elements must be zero in the upper 17 bits, or can be
48481 // safely made zero without altering the final result.
48482 auto GetZeroableOp = [&](SDValue Op) {
48483 APInt Mask17 = APInt::getHighBitsSet(32, 17);
48484 if (DAG.MaskedValueIsZero(Op, Mask17))
48485 return Op;
48486 // Mask off upper 16-bits of sign-extended constants.
48487 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode()))
48488 return DAG.getNode(ISD::AND, SDLoc(N), VT, Op,
48489 DAG.getConstant(0xFFFF, SDLoc(N), VT));
48490 if (Op.getOpcode() == ISD::SIGN_EXTEND && N->isOnlyUserOf(Op.getNode())) {
48491 SDValue Src = Op.getOperand(0);
48492 // Convert sext(vXi16) to zext(vXi16).
48493 if (Src.getScalarValueSizeInBits() == 16 && VT.getSizeInBits() <= 128)
48494 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, Src);
48495 // Convert sext(vXi8) to zext(vXi16 sext(vXi8)) on pre-SSE41 targets
48496 // which will expand the extension.
48497 if (Src.getScalarValueSizeInBits() < 16 && !Subtarget.hasSSE41()) {
48498 EVT ExtVT = VT.changeVectorElementType(MVT::i16);
48499 Src = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), ExtVT, Src);
48500 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, Src);
48501 }
48502 }
48503 // Convert SIGN_EXTEND_VECTOR_INREG to ZEXT_EXTEND_VECTOR_INREG.
48504 if (Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
48505 N->isOnlyUserOf(Op.getNode())) {
48506 SDValue Src = Op.getOperand(0);
48507 if (Src.getScalarValueSizeInBits() == 16)
48508 return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(N), VT, Src);
48509 }
48510 // Convert VSRAI(Op, 16) to VSRLI(Op, 16).
48511 if (Op.getOpcode() == X86ISD::VSRAI && Op.getConstantOperandVal(1) == 16 &&
48512 N->isOnlyUserOf(Op.getNode())) {
48513 return DAG.getNode(X86ISD::VSRLI, SDLoc(N), VT, Op.getOperand(0),
48514 Op.getOperand(1));
48515 }
48516 return SDValue();
48517 };
48518 SDValue ZeroN0 = GetZeroableOp(N0);
48519 SDValue ZeroN1 = GetZeroableOp(N1);
48520 if (!ZeroN0 && !ZeroN1)
48521 return SDValue();
48522 N0 = ZeroN0 ? ZeroN0 : N0;
48523 N1 = ZeroN1 ? ZeroN1 : N1;
48524
48525 // Use SplitOpsAndApply to handle AVX splitting.
48526 auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
48527 ArrayRef<SDValue> Ops) {
48528 MVT ResVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
48529 MVT OpVT = MVT::getVectorVT(MVT::i16, Ops[0].getValueSizeInBits() / 16);
48530 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT,
48531 DAG.getBitcast(OpVT, Ops[0]),
48532 DAG.getBitcast(OpVT, Ops[1]));
48533 };
48534 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {N0, N1},
48535 PMADDWDBuilder);
48536}
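A C++ sketch (hypothetical names) of a multiply that fits the PMADDWD form,
assuming the loop is auto-vectorized: both i32 operands are sign extensions of
i16 values, so every lane is a 16x16->32 signed product.

#include <cstdint>
void mul_s16_widened(const int16_t *a, const int16_t *b, int32_t *out, int n) {
  for (int i = 0; i < n; ++i)
    out[i] = int32_t(a[i]) * int32_t(b[i]);  // sext to i32, then mul
}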
48537
48538static SDValue combineMulToPMULDQ(SDNode *N, SelectionDAG &DAG,
48539 const X86Subtarget &Subtarget) {
48540 if (!Subtarget.hasSSE2())
48541 return SDValue();
48542
48543 EVT VT = N->getValueType(0);
48544
48545 // Only support vXi64 vectors.
48546 if (!VT.isVector() || VT.getVectorElementType() != MVT::i64 ||
48547 VT.getVectorNumElements() < 2 ||
48548 !isPowerOf2_32(VT.getVectorNumElements()))
48549 return SDValue();
48550
48551 SDValue N0 = N->getOperand(0);
48552 SDValue N1 = N->getOperand(1);
48553
48554 // PMULDQ returns the 64-bit result of the signed multiplication of the lower
48555 // 32 bits. We can lower with this if the sign bits stretch that far.
48556 if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 &&
48557 DAG.ComputeNumSignBits(N1) > 32) {
48558 auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
48559 ArrayRef<SDValue> Ops) {
48560 return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops);
48561 };
48562 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
48563 PMULDQBuilder, /*CheckBWI*/false);
48564 }
48565
48566 // If the upper bits are zero we can use a single pmuludq.
48567 APInt Mask = APInt::getHighBitsSet(64, 32);
48568 if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) {
48569 auto PMULUDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
48570 ArrayRef<SDValue> Ops) {
48571 return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops);
48572 };
48573 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
48574 PMULUDQBuilder, /*CheckBWI*/false);
48575 }
48576
48577 return SDValue();
48578}
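// A minimal scalar sketch of the PMULUDQ case above: PMULUDQ multiplies the
// zero-extended low 32 bits of each 64-bit lane, so once MaskedValueIsZero has
// proved both upper halves are zero it equals the full 64-bit product (PMULDQ is
// the signed analogue, guarded by ComputeNumSignBits > 32). The helper is
// illustrative only and assumes <cstdint> types.
static constexpr uint64_t PmuludqLane(uint64_t A, uint64_t B) {
  return (A & 0xFFFFFFFFu) * (B & 0xFFFFFFFFu);
}
static_assert(PmuludqLane(0x12345678u, 0x9ABCDEF0u) ==
                  uint64_t(0x12345678u) * 0x9ABCDEF0u,
              "zero upper halves: pmuludq lane == full 64-bit multiply");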
48579
48580static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
48581 TargetLowering::DAGCombinerInfo &DCI,
48582 const X86Subtarget &Subtarget) {
48583 EVT VT = N->getValueType(0);
48584
48585 if (SDValue V = combineMulToPMADDWD(N, DAG, Subtarget))
48586 return V;
48587
48588 if (SDValue V = combineMulToPMULDQ(N, DAG, Subtarget))
48589 return V;
48590
48591 if (DCI.isBeforeLegalize() && VT.isVector())
48592 return reduceVMULWidth(N, DAG, Subtarget);
48593
48594 // Optimize a single multiply with constant into two operations in order to
48595 // implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
48596 if (!MulConstantOptimization)
48597 return SDValue();
48598
48599 // An imul is usually smaller than the alternative sequence.
48600 if (DAG.getMachineFunction().getFunction().hasMinSize())
48601 return SDValue();
48602
48603 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
48604 return SDValue();
48605
48606 if (VT != MVT::i64 && VT != MVT::i32)
48607 return SDValue();
48608
48609 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
48610 if (!C)
48611 return SDValue();
48612 if (isPowerOf2_64(C->getZExtValue()))
48613 return SDValue();
48614
48615 int64_t SignMulAmt = C->getSExtValue();
48616 assert(SignMulAmt != INT64_MIN && "Int min should have been handled!");
48617 uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt;
48618
48619 SDLoc DL(N);
48620 if (AbsMulAmt == 3 || AbsMulAmt == 5 || AbsMulAmt == 9) {
48621 SDValue NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
48622 DAG.getConstant(AbsMulAmt, DL, VT));
48623 if (SignMulAmt < 0)
48624 NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
48625 NewMul);
48626
48627 return NewMul;
48628 }
48629
48630 uint64_t MulAmt1 = 0;
48631 uint64_t MulAmt2 = 0;
48632 if ((AbsMulAmt % 9) == 0) {
48633 MulAmt1 = 9;
48634 MulAmt2 = AbsMulAmt / 9;
48635 } else if ((AbsMulAmt % 5) == 0) {
48636 MulAmt1 = 5;
48637 MulAmt2 = AbsMulAmt / 5;
48638 } else if ((AbsMulAmt % 3) == 0) {
48639 MulAmt1 = 3;
48640 MulAmt2 = AbsMulAmt / 3;
48641 }
48642
48643 SDValue NewMul;
48644 // For negative multiply amounts, only allow MulAmt2 to be a power of 2.
48645 if (MulAmt2 &&
48646 (isPowerOf2_64(MulAmt2) ||
48647 (SignMulAmt >= 0 && (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)))) {
48648
48649 if (isPowerOf2_64(MulAmt2) &&
48650 !(SignMulAmt >= 0 && N->hasOneUse() &&
48651 N->use_begin()->getOpcode() == ISD::ADD))
48652 // If the second multiplier is pow2, issue it first. We want the multiply
48653 // by 3, 5, or 9 to be folded into the addressing mode unless the lone use
48654 // is an add. Only do this for positive multiply amounts since the
48655 // negate would prevent it from being used as an addressing mode anyway.
48656 std::swap(MulAmt1, MulAmt2);
48657
48658 if (isPowerOf2_64(MulAmt1))
48659 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
48660 DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
48661 else
48662 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
48663 DAG.getConstant(MulAmt1, DL, VT));
48664
48665 if (isPowerOf2_64(MulAmt2))
48666 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
48667 DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
48668 else
48669 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
48670 DAG.getConstant(MulAmt2, DL, VT));
48671
48672 // Negate the result.
48673 if (SignMulAmt < 0)
48674 NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
48675 NewMul);
48676 } else if (!Subtarget.slowLEA())
48677 NewMul = combineMulSpecial(C->getZExtValue(), N, DAG, VT, DL);
48678
48679 if (!NewMul) {
48680 assert(C->getZExtValue() != 0 &&
48681 C->getZExtValue() != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) &&
48682 "Both cases that could cause potential overflows should have "
48683 "already been handled.");
48684 if (isPowerOf2_64(AbsMulAmt - 1)) {
48685 // (mul x, 2^N + 1) => (add (shl x, N), x)
48686 NewMul = DAG.getNode(
48687 ISD::ADD, DL, VT, N->getOperand(0),
48688 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
48689 DAG.getConstant(Log2_64(AbsMulAmt - 1), DL,
48690 MVT::i8)));
48691 // To negate, subtract the number from zero
48692 if (SignMulAmt < 0)
48693 NewMul = DAG.getNode(ISD::SUB, DL, VT,
48694 DAG.getConstant(0, DL, VT), NewMul);
48695 } else if (isPowerOf2_64(AbsMulAmt + 1)) {
48696 // (mul x, 2^N - 1) => (sub (shl x, N), x)
48697 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
48698 DAG.getConstant(Log2_64(AbsMulAmt + 1),
48699 DL, MVT::i8));
48700 // To negate, reverse the operands of the subtract.
48701 if (SignMulAmt < 0)
48702 NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul);
48703 else
48704 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
48705 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2)) {
48706 // (mul x, 2^N + 2) => (add (shl x, N), (add x, x))
48707 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
48708 DAG.getConstant(Log2_64(AbsMulAmt - 2),
48709 DL, MVT::i8));
48710 NewMul = DAG.getNode(
48711 ISD::ADD, DL, VT, NewMul,
48712 DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
48713 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt + 2)) {
48714 // (mul x, 2^N - 2) => (sub (shl x, N), (add x, x))
48715 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
48716 DAG.getConstant(Log2_64(AbsMulAmt + 2),
48717 DL, MVT::i8));
48718 NewMul = DAG.getNode(
48719 ISD::SUB, DL, VT, NewMul,
48720 DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
48721 }
48722 }
48723
48724 return NewMul;
48725}
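// A minimal compile-time sketch of the scalar identities behind the constant
// multiply decompositions above; the sample value is arbitrary and the checks
// assume only <cstdint> types.
static constexpr uint64_t MulSample = 0x1122334455667788ull;
static_assert(MulSample * 9 == (MulSample << 3) + MulSample,
              "mul by 2^N + 1 -> shl + add");
static_assert(MulSample * 7 == (MulSample << 3) - MulSample,
              "mul by 2^N - 1 -> shl + sub");
static_assert(MulSample * 6 == (MulSample << 2) + (MulSample + MulSample),
              "mul by 2^N + 2 -> shl + (add x, x)");
static_assert(MulSample * 45 == (MulSample * 9) * 5,
              "composite amounts split into two LEA-friendly factors");
static_assert(MulSample * uint64_t(-3) == uint64_t(0) - MulSample * 3,
              "negative amounts: multiply by |amt|, then subtract from zero");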
48726
48727// Try to form a MULHU or MULHS node by looking for
48728// (srl (mul ext, ext), 16)
48729// TODO: This is X86 specific because we want to be able to handle wide types
48730// before type legalization. But we can only do it if the vector will be
48731// legalized via widening/splitting. Type legalization can't handle promotion
48732// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
48733// combiner.
48734static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG,
48735 const X86Subtarget &Subtarget) {
48736 assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
48737 "SRL or SRA node is required here!");
48738 SDLoc DL(N);
48739
48740 if (!Subtarget.hasSSE2())
48741 return SDValue();
48742
48743 // The operation feeding into the shift must be a multiply.
48744 SDValue ShiftOperand = N->getOperand(0);
48745 if (ShiftOperand.getOpcode() != ISD::MUL || !ShiftOperand.hasOneUse())
48746 return SDValue();
48747
48748 // Input type should be at least vXi32.
48749 EVT VT = N->getValueType(0);
48750 if (!VT.isVector() || VT.getVectorElementType().getSizeInBits() < 32)
48751 return SDValue();
48752
48753 // Need a shift by 16.
48754 APInt ShiftAmt;
48755 if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), ShiftAmt) ||
48756 ShiftAmt != 16)
48757 return SDValue();
48758
48759 SDValue LHS = ShiftOperand.getOperand(0);
48760 SDValue RHS = ShiftOperand.getOperand(1);
48761
48762 unsigned ExtOpc = LHS.getOpcode();
48763 if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
48764 RHS.getOpcode() != ExtOpc)
48765 return SDValue();
48766
48767 // Peek through the extends.
48768 LHS = LHS.getOperand(0);
48769 RHS = RHS.getOperand(0);
48770
48771 // Ensure the input types match.
48772 EVT MulVT = LHS.getValueType();
48773 if (MulVT.getVectorElementType() != MVT::i16 || RHS.getValueType() != MulVT)
48774 return SDValue();
48775
48776 unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
48777 SDValue Mulh = DAG.getNode(Opc, DL, MulVT, LHS, RHS);
48778
48779 ExtOpc = N->getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
48780 return DAG.getNode(ExtOpc, DL, VT, Mulh);
48781}
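// A minimal scalar sketch of the lane identity used above: for i16 inputs widened
// to i32, shifting the widened product right by 16 yields the high half of the
// 16x16 product, i.e. MULHU for zero-extended inputs (SRL) and, analogously,
// MULHS for sign-extended inputs (SRA). The helper is illustrative only and
// assumes <cstdint> types.
static constexpr uint32_t MulhuLane16(uint16_t A, uint16_t B) {
  return (uint32_t(A) * uint32_t(B)) >> 16;
}
static_assert(MulhuLane16(0xFFFF, 0xFFFF) == 0xFFFE,
              "srl(mul(zext(x), zext(y)), 16) == mulhu(x, y)");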
48782
48783static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
48784 SDValue N0 = N->getOperand(0);
48785 SDValue N1 = N->getOperand(1);
48786 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
48787 EVT VT = N0.getValueType();
48788
48789 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
48790 // since the result of setcc_c is all zero's or all ones.
48791 if (VT.isInteger() && !VT.isVector() &&
48792 N1C && N0.getOpcode() == ISD::AND &&
48793 N0.getOperand(1).getOpcode() == ISD::Constant) {
48794 SDValue N00 = N0.getOperand(0);
48795 APInt Mask = N0.getConstantOperandAPInt(1);
48796 Mask <<= N1C->getAPIntValue();
48797 bool MaskOK = false;
48798 // We can handle cases concerning bit-widening nodes containing setcc_c if
48799 // we carefully interrogate the mask to make sure the transform is
48800 // semantics-preserving.
48801 // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
48802 // of the underlying setcc_c operation if the setcc_c was zero extended.
48803 // Consider the following example:
48804 // zext(setcc_c) -> i32 0x0000FFFF
48805 // c1 -> i32 0x0000FFFF
48806 // c2 -> i32 0x00000001
48807 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
48808 // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
48809 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
48810 MaskOK = true;
48811 } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
48812 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
48813 MaskOK = true;
48814 } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
48815 N00.getOpcode() == ISD::ANY_EXTEND) &&
48816 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
48817 MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
48818 }
48819 if (MaskOK && Mask != 0) {
48820 SDLoc DL(N);
48821 return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
48822 }
48823 }
48824
48825 return SDValue();
48826}
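// A minimal compile-time sketch of the fold above, replaying the comment's own
// example: for an all-ones (sign-extended) setcc_c the shift can be folded into
// the mask, while a zero-extended 16-bit carry shows why the Mask.isIntN() guard
// is needed. Assumes only <cstdint>-style fixed-width unsigned arithmetic.
static_assert(((0xFFFFFFFFu & 0x0000FFFFu) << 1) ==
                  (0xFFFFFFFFu & (0x0000FFFFu << 1)),
              "all-ones setcc_c: shl(and(x, c1), c2) == and(x, c1 << c2)");
static_assert(((0x0000FFFFu & 0x0000FFFFu) << 1) !=
                  (0x0000FFFFu & (0x0000FFFFu << 1)),
              "zero-extended carry: the fold would change the value");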
48827
48828static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG,
48829 const X86Subtarget &Subtarget) {
48830 SDValue N0 = N->getOperand(0);
48831 SDValue N1 = N->getOperand(1);
48832 EVT VT = N0.getValueType();
48833 unsigned Size = VT.getSizeInBits();
48834
48835 if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
48836 return V;
48837
48838 // fold (ashr (shl, a, [56,48,32,24,16]), SarConst)
48839 // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or
48840 // into (lshr, (sext (a), SarConst - [56,48,32,24,16]))
48841 // depending on sign of (SarConst - [56,48,32,24,16])
48842
48843 // sexts on X86 are MOVs. The MOVs have the same code size as the SHIFTs
48844 // above (only a SHIFT by 1 has a smaller encoding).
48845 // However, the MOVs have two advantages over a SHIFT:
48846 // 1. MOVs can write to a register that differs from the source.
48847 // 2. MOVs accept memory operands.
48848
48849 if (VT.isVector() || N1.getOpcode() != ISD::Constant ||
48850 N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
48851 N0.getOperand(1).getOpcode() != ISD::Constant)
48852 return SDValue();
48853
48854 SDValue N00 = N0.getOperand(0);
48855 SDValue N01 = N0.getOperand(1);
48856 APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
48857 APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
48858 EVT CVT = N1.getValueType();
48859
48860 if (SarConst.isNegative())
48861 return SDValue();
48862
48863 for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) {
48864 unsigned ShiftSize = SVT.getSizeInBits();
48865 // Skip types without a corresponding sext/zext and ShlConst values
48866 // that are not one of [56,48,32,24,16].
48867 if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
48868 continue;
48869 SDLoc DL(N);
48870 SDValue NN =
48871 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
48872 SarConst = SarConst - (Size - ShiftSize);
48873 if (SarConst == 0)
48874 return NN;
48875 if (SarConst.isNegative())
48876 return DAG.getNode(ISD::SHL, DL, VT, NN,
48877 DAG.getConstant(-SarConst, DL, CVT));
48878 return DAG.getNode(ISD::SRA, DL, VT, NN,
48879 DAG.getConstant(SarConst, DL, CVT));
48880 }
48881 return SDValue();
48882}
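// A minimal scalar sketch of the fold above for the 32-bit/i8 case: once the
// shifted-in bits are known copies of the sign bit, ashr(shl(x, 24), 24 + K)
// equals ashr(sext_inreg_i8(x), K). The helper writes the in-register sign
// extension with a portable xor/sub idiom; it is illustrative only, assumes
// <cstdint> types and a two's-complement arithmetic right shift.
static constexpr int32_t SextInRegI8(uint32_t X) {
  return int32_t((X & 0xFFu) ^ 0x80u) - 0x80;
}
static_assert((int32_t(0xABu << 24) >> (24 + 3)) == (SextInRegI8(0xABu) >> 3),
              "ashr(shl(x, 24), 27) == ashr(sext_inreg_i8(x), 3)");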
48883
48884static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
48885 TargetLowering::DAGCombinerInfo &DCI,
48886 const X86Subtarget &Subtarget) {
48887 SDValue N0 = N->getOperand(0);
48888 SDValue N1 = N->getOperand(1);
48889 EVT VT = N0.getValueType();
48890
48891 if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
48892 return V;
48893
48894 // Only do this on the last DAG combine as it can interfere with other
48895 // combines.
48896 if (!DCI.isAfterLegalizeDAG())
48897 return SDValue();
48898
48899 // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
48900 // TODO: This is a generic DAG combine that became an x86-only combine to
48901 // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
48902 // and-not ('andn').
48903 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
48904 return SDValue();
48905
48906 auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
48907 auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
48908 if (!ShiftC || !AndC)
48909 return SDValue();
48910
48911 // If we can shrink the constant mask below 8-bits or 32-bits, then this
48912 // transform should reduce code size. It may also enable secondary transforms
48913 // from improved known-bits analysis or instruction selection.
48914 APInt MaskVal = AndC->getAPIntValue();
48915
48916 // If this can be matched by a zero extend, don't optimize.
48917 if (MaskVal.isMask()) {
48918 unsigned TO = MaskVal.countr_one();
48919 if (TO >= 8 && isPowerOf2_32(TO))
48920 return SDValue();
48921 }
48922
48923 APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
48924 unsigned OldMaskSize = MaskVal.getSignificantBits();
48925 unsigned NewMaskSize = NewMaskVal.getSignificantBits();
48926 if ((OldMaskSize > 8 && NewMaskSize <= 8) ||
48927 (OldMaskSize > 32 && NewMaskSize <= 32)) {
48928 // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
48929 SDLoc DL(N);
48930 SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
48931 SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
48932 return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
48933 }
48934 return SDValue();
48935}
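// A minimal compile-time sketch of the reassociation above: shifting a masked
// value right equals masking the shifted value with the shifted mask; the
// combine only applies it when the new mask fits in 8 or 32 bits. The helper is
// illustrative only and assumes <cstdint> types.
static constexpr bool SrlOfAndCommutes(uint32_t X, uint32_t C1, uint32_t C2) {
  return ((X & C1) >> C2) == ((X >> C2) & (C1 >> C2));
}
static_assert(SrlOfAndCommutes(0xDEADBEEFu, 0x00FFFF00u, 8),
              "srl(and(X, C1), C2) == and(srl(X, C2), C1 >> C2)");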
48936
48937static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG,
48938 const X86Subtarget &Subtarget) {
48939 unsigned Opcode = N->getOpcode();
48940 assert(isHorizOp(Opcode) && "Unexpected hadd/hsub/pack opcode");
48941
48942 SDLoc DL(N);
48943 EVT VT = N->getValueType(0);
48944 SDValue N0 = N->getOperand(0);
48945 SDValue N1 = N->getOperand(1);
48946 EVT SrcVT = N0.getValueType();
48947
48948 SDValue BC0 =
48949 N->isOnlyUserOf(N0.getNode()) ? peekThroughOneUseBitcasts(N0) : N0;
48950 SDValue BC1 =
48951 N->isOnlyUserOf(N1.getNode()) ? peekThroughOneUseBitcasts(N1) : N1;
48952
48953 // Attempt to fold HOP(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X)))
48954 // to SHUFFLE(HOP(LOSUBVECTOR(X),HISUBVECTOR(X))), this is mainly for
48955 // truncation trees that help us avoid lane crossing shuffles.
48956 // TODO: There's a lot more we can do for PACK/HADD style shuffle combines.
48957 // TODO: We don't handle vXf64 shuffles yet.
48958 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
48959 if (SDValue BCSrc = getSplitVectorSrc(BC0, BC1, false)) {
48960 SmallVector<SDValue> ShuffleOps;
48961 SmallVector<int> ShuffleMask, ScaledMask;
48962 SDValue Vec = peekThroughBitcasts(BCSrc);
48963 if (getTargetShuffleInputs(Vec, ShuffleOps, ShuffleMask, DAG)) {
48964 resolveTargetShuffleInputsAndMask(ShuffleOps, ShuffleMask);
48965 // To keep the HOP LHS/RHS coherency, we must be able to scale the unary
48966 // shuffle to a v4X64 width - we can probably relax this in the future.
48967 if (!isAnyZero(ShuffleMask) && ShuffleOps.size() == 1 &&
48968 ShuffleOps[0].getValueType().is256BitVector() &&
48969 scaleShuffleElements(ShuffleMask, 4, ScaledMask)) {
48970 SDValue Lo, Hi;
48971 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
48972 std::tie(Lo, Hi) = DAG.SplitVector(ShuffleOps[0], DL);
48973 Lo = DAG.getBitcast(SrcVT, Lo);
48974 Hi = DAG.getBitcast(SrcVT, Hi);
48975 SDValue Res = DAG.getNode(Opcode, DL, VT, Lo, Hi);
48976 Res = DAG.getBitcast(ShufVT, Res);
48977 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ScaledMask);
48978 return DAG.getBitcast(VT, Res);
48979 }
48980 }
48981 }
48982 }
48983
48984 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(Z,W)) -> SHUFFLE(HOP()).
48985 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
48986 // If either/both ops are a shuffle that can scale to v2x64,
48987 // then see if we can perform this as a v4x32 post shuffle.
48988 SmallVector<SDValue> Ops0, Ops1;
48989 SmallVector<int> Mask0, Mask1, ScaledMask0, ScaledMask1;
48990 bool IsShuf0 =
48991 getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
48992 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
48993 all_of(Ops0, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
48994 bool IsShuf1 =
48995 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
48996 scaleShuffleElements(Mask1, 2, ScaledMask1) &&
48997 all_of(Ops1, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
48998 if (IsShuf0 || IsShuf1) {
48999 if (!IsShuf0) {
49000 Ops0.assign({BC0});
49001 ScaledMask0.assign({0, 1});
49002 }
49003 if (!IsShuf1) {
49004 Ops1.assign({BC1});
49005 ScaledMask1.assign({0, 1});
49006 }
49007
49008 SDValue LHS, RHS;
49009 int PostShuffle[4] = {-1, -1, -1, -1};
49010 auto FindShuffleOpAndIdx = [&](int M, int &Idx, ArrayRef<SDValue> Ops) {
49011 if (M < 0)
49012 return true;
49013 Idx = M % 2;
49014 SDValue Src = Ops[M / 2];
49015 if (!LHS || LHS == Src) {
49016 LHS = Src;
49017 return true;
49018 }
49019 if (!RHS || RHS == Src) {
49020 Idx += 2;
49021 RHS = Src;
49022 return true;
49023 }
49024 return false;
49025 };
49026 if (FindShuffleOpAndIdx(ScaledMask0[0], PostShuffle[0], Ops0) &&
49027 FindShuffleOpAndIdx(ScaledMask0[1], PostShuffle[1], Ops0) &&
49028 FindShuffleOpAndIdx(ScaledMask1[0], PostShuffle[2], Ops1) &&
49029 FindShuffleOpAndIdx(ScaledMask1[1], PostShuffle[3], Ops1)) {
49030 LHS = DAG.getBitcast(SrcVT, LHS);
49031 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
49032 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
49033 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
49034 Res = DAG.getBitcast(ShufVT, Res);
49035 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, PostShuffle);
49036 return DAG.getBitcast(VT, Res);
49037 }
49038 }
49039 }
49040
49041 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y)).
49042 if (VT.is256BitVector() && Subtarget.hasInt256()) {
49043 SmallVector<int> Mask0, Mask1;
49044 SmallVector<SDValue> Ops0, Ops1;
49045 SmallVector<int, 2> ScaledMask0, ScaledMask1;
49046 if (getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
49047 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
49048 !Ops0.empty() && !Ops1.empty() &&
49049 all_of(Ops0,
49050 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
49051 all_of(Ops1,
49052 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
49053 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
49054 scaleShuffleElements(Mask1, 2, ScaledMask1)) {
49055 SDValue Op00 = peekThroughBitcasts(Ops0.front());
49056 SDValue Op10 = peekThroughBitcasts(Ops1.front());
49057 SDValue Op01 = peekThroughBitcasts(Ops0.back());
49058 SDValue Op11 = peekThroughBitcasts(Ops1.back());
49059 if ((Op00 == Op11) && (Op01 == Op10)) {
49060 std::swap(Op10, Op11);
49061 ShuffleVectorSDNode::commuteMask(ScaledMask1);
49062 }
49063 if ((Op00 == Op10) && (Op01 == Op11)) {
49064 const int Map[4] = {0, 2, 1, 3};
49065 SmallVector<int, 4> ShuffleMask(
49066 {Map[ScaledMask0[0]], Map[ScaledMask1[0]], Map[ScaledMask0[1]],
49067 Map[ScaledMask1[1]]});
49068 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
49069 SDValue Res = DAG.getNode(Opcode, DL, VT, DAG.getBitcast(SrcVT, Op00),
49070 DAG.getBitcast(SrcVT, Op01));
49071 Res = DAG.getBitcast(ShufVT, Res);
49072 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ShuffleMask);
49073 return DAG.getBitcast(VT, Res);
49074 }
49075 }
49076 }
49077
49078 return SDValue();
49079}
49080
49081static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
49082 TargetLowering::DAGCombinerInfo &DCI,
49083 const X86Subtarget &Subtarget) {
49084 unsigned Opcode = N->getOpcode();
49085 assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
49086 "Unexpected pack opcode");
49087
49088 EVT VT = N->getValueType(0);
49089 SDValue N0 = N->getOperand(0);
49090 SDValue N1 = N->getOperand(1);
49091 unsigned NumDstElts = VT.getVectorNumElements();
49092 unsigned DstBitsPerElt = VT.getScalarSizeInBits();
49093 unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
49094 assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
49095 N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
49096 "Unexpected PACKSS/PACKUS input type");
49097
49098 bool IsSigned = (X86ISD::PACKSS == Opcode);
49099
49100 // Constant Folding.
49101 APInt UndefElts0, UndefElts1;
49102 SmallVector<APInt, 32> EltBits0, EltBits1;
49103 if ((N0.isUndef() || N->isOnlyUserOf(N0.getNode())) &&
49104 (N1.isUndef() || N->isOnlyUserOf(N1.getNode())) &&
49105 getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) &&
49106 getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) {
49107 unsigned NumLanes = VT.getSizeInBits() / 128;
49108 unsigned NumSrcElts = NumDstElts / 2;
49109 unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
49110 unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
49111
49112 APInt Undefs(NumDstElts, 0);
49113 SmallVector<APInt, 32> Bits(NumDstElts, APInt::getZero(DstBitsPerElt));
49114 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
49115 for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
49116 unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
49117 auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
49118 auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);
49119
49120 if (UndefElts[SrcIdx]) {
49121 Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
49122 continue;
49123 }
49124
49125 APInt &Val = EltBits[SrcIdx];
49126 if (IsSigned) {
49127 // PACKSS: Truncate signed value with signed saturation.
49128 // Source values less than dst minint are saturated to minint.
49129 // Source values greater than dst maxint are saturated to maxint.
49130 if (Val.isSignedIntN(DstBitsPerElt))
49131 Val = Val.trunc(DstBitsPerElt);
49132 else if (Val.isNegative())
49133 Val = APInt::getSignedMinValue(DstBitsPerElt);
49134 else
49135 Val = APInt::getSignedMaxValue(DstBitsPerElt);
49136 } else {
49137 // PACKUS: Truncate signed value with unsigned saturation.
49138 // Source values less than zero are saturated to zero.
49139 // Source values greater than dst maxuint are saturated to maxuint.
49140 if (Val.isIntN(DstBitsPerElt))
49141 Val = Val.trunc(DstBitsPerElt);
49142 else if (Val.isNegative())
49143 Val = APInt::getZero(DstBitsPerElt);
49144 else
49145 Val = APInt::getAllOnes(DstBitsPerElt);
49146 }
49147 Bits[Lane * NumDstEltsPerLane + Elt] = Val;
49148 }
49149 }
49150
49151 return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
49152 }
49153
49154 // Try to fold PACK(SHUFFLE(),SHUFFLE()) -> SHUFFLE(PACK()).
49155 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
49156 return V;
49157
49158 // Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular
49159 // truncate to create a larger truncate.
49160 if (Subtarget.hasAVX512() &&
49161 N0.getOpcode() == ISD::TRUNCATE && N1.isUndef() && VT == MVT::v16i8 &&
49162 N0.getOperand(0).getValueType() == MVT::v8i32) {
49163 if ((IsSigned && DAG.ComputeNumSignBits(N0) > 8) ||
49164 (!IsSigned &&
49165 DAG.MaskedValueIsZero(N0, APInt::getHighBitsSet(16, 8)))) {
49166 if (Subtarget.hasVLX())
49167 return DAG.getNode(X86ISD::VTRUNC, SDLoc(N), VT, N0.getOperand(0));
49168
49169 // Widen input to v16i32 so we can truncate that.
49170 SDLoc dl(N);
49171 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i32,
49172 N0.getOperand(0), DAG.getUNDEF(MVT::v8i32));
49173 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Concat);
49174 }
49175 }
49176
49177 // Try to fold PACK(EXTEND(X),EXTEND(Y)) -> CONCAT(X,Y) subvectors.
49178 if (VT.is128BitVector()) {
49179 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
49180 SDValue Src0, Src1;
49181 if (N0.getOpcode() == ExtOpc &&
49182 N0.getOperand(0).getValueType().is64BitVector() &&
49183 N0.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
49184 Src0 = N0.getOperand(0);
49185 }
49186 if (N1.getOpcode() == ExtOpc &&
49187 N1.getOperand(0).getValueType().is64BitVector() &&
49188 N1.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
49189 Src1 = N1.getOperand(0);
49190 }
49191 if ((Src0 || N0.isUndef()) && (Src1 || N1.isUndef())) {
49192 assert((Src0 || Src1) && "Found PACK(UNDEF,UNDEF)");
49193 Src0 = Src0 ? Src0 : DAG.getUNDEF(Src1.getValueType());
49194 Src1 = Src1 ? Src1 : DAG.getUNDEF(Src0.getValueType());
49195 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Src0, Src1);
49196 }
49197
49198 // Try again with pack(*_extend_vector_inreg, undef).
49199 unsigned VecInRegOpc = IsSigned ? ISD::SIGN_EXTEND_VECTOR_INREG
49200 : ISD::ZERO_EXTEND_VECTOR_INREG;
49201 if (N0.getOpcode() == VecInRegOpc && N1.isUndef() &&
49202 N0.getOperand(0).getScalarValueSizeInBits() < DstBitsPerElt)
49203 return getEXTEND_VECTOR_INREG(ExtOpc, SDLoc(N), VT, N0.getOperand(0),
49204 DAG);
49205 }
49206
49207 // Attempt to combine as shuffle.
49208 SDValue Op(N, 0);
49209 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
49210 return Res;
49211
49212 return SDValue();
49213}
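// A minimal scalar sketch of the saturation rules implemented by the constant
// folding above, for the i16 -> i8 case: PACKSS clamps to [-128, 127] and PACKUS
// clamps to [0, 255] before truncating. The helpers are illustrative only and
// assume <cstdint> types.
static constexpr uint8_t PackssWBLane(int16_t V) {
  return uint8_t(V < -128 ? -128 : (V > 127 ? 127 : V));
}
static constexpr uint8_t PackusWBLane(int16_t V) {
  return uint8_t(V < 0 ? 0 : (V > 255 ? 255 : V));
}
static_assert(PackssWBLane(300) == 127 && PackssWBLane(-300) == 0x80 &&
                  PackusWBLane(300) == 255 && PackusWBLane(-300) == 0,
              "signed vs. unsigned saturation when packing i16 to i8");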
49214
49215static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG,
49216 TargetLowering::DAGCombinerInfo &DCI,
49217 const X86Subtarget &Subtarget) {
49218 assert((X86ISD::HADD == N->getOpcode() || X86ISD::FHADD == N->getOpcode() ||
49219 X86ISD::HSUB == N->getOpcode() || X86ISD::FHSUB == N->getOpcode()) &&
49220 "Unexpected horizontal add/sub opcode");
49221
49222 if (!shouldUseHorizontalOp(true, DAG, Subtarget)) {
49223 MVT VT = N->getSimpleValueType(0);
49224 SDValue LHS = N->getOperand(0);
49225 SDValue RHS = N->getOperand(1);
49226
49227 // HOP(HOP'(X,X),HOP'(Y,Y)) -> HOP(PERMUTE(HOP'(X,Y)),PERMUTE(HOP'(X,Y)).
49228 if (LHS != RHS && LHS.getOpcode() == N->getOpcode() &&
49229 LHS.getOpcode() == RHS.getOpcode() &&
49230 LHS.getValueType() == RHS.getValueType() &&
49231 N->isOnlyUserOf(LHS.getNode()) && N->isOnlyUserOf(RHS.getNode())) {
49232 SDValue LHS0 = LHS.getOperand(0);
49233 SDValue LHS1 = LHS.getOperand(1);
49234 SDValue RHS0 = RHS.getOperand(0);
49235 SDValue RHS1 = RHS.getOperand(1);
49236 if ((LHS0 == LHS1 || LHS0.isUndef() || LHS1.isUndef()) &&
49237 (RHS0 == RHS1 || RHS0.isUndef() || RHS1.isUndef())) {
49238 SDLoc DL(N);
49239 SDValue Res = DAG.getNode(LHS.getOpcode(), DL, LHS.getValueType(),
49240 LHS0.isUndef() ? LHS1 : LHS0,
49241 RHS0.isUndef() ? RHS1 : RHS0);
49242 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
49243 Res = DAG.getBitcast(ShufVT, Res);
49244 SDValue NewLHS =
49245 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
49246 getV4X86ShuffleImm8ForMask({0, 1, 0, 1}, DL, DAG));
49247 SDValue NewRHS =
49248 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
49249 getV4X86ShuffleImm8ForMask({2, 3, 2, 3}, DL, DAG));
49250 return DAG.getNode(N->getOpcode(), DL, VT, DAG.getBitcast(VT, NewLHS),
49251 DAG.getBitcast(VT, NewRHS));
49252 }
49253 }
49254 }
49255
49256 // Try to fold HOP(SHUFFLE(),SHUFFLE()) -> SHUFFLE(HOP()).
49257 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
49258 return V;
49259
49260 return SDValue();
49261}
49262
49263static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG,
49264 TargetLowering::DAGCombinerInfo &DCI,
49265 const X86Subtarget &Subtarget) {
49266 assert((X86ISD::VSHL == N->getOpcode() || X86ISD::VSRA == N->getOpcode() ||
49267 X86ISD::VSRL == N->getOpcode()) &&
49268 "Unexpected shift opcode");
49269 EVT VT = N->getValueType(0);
49270 SDValue N0 = N->getOperand(0);
49271 SDValue N1 = N->getOperand(1);
49272
49273 // Shift zero -> zero.
49274 if (ISD::isBuildVectorAllZeros(N0.getNode()))
49275 return DAG.getConstant(0, SDLoc(N), VT);
49276
49277 // Detect constant shift amounts.
49278 APInt UndefElts;
49279 SmallVector<APInt, 32> EltBits;
49280 if (getTargetConstantBitsFromNode(N1, 64, UndefElts, EltBits, true, false)) {
49281 unsigned X86Opc = getTargetVShiftUniformOpcode(N->getOpcode(), false);
49282 return getTargetVShiftByConstNode(X86Opc, SDLoc(N), VT.getSimpleVT(), N0,
49283 EltBits[0].getZExtValue(), DAG);
49284 }
49285
49286 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49287 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
49288 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
49289 return SDValue(N, 0);
49290
49291 return SDValue();
49292}
49293
49294static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
49295 TargetLowering::DAGCombinerInfo &DCI,
49296 const X86Subtarget &Subtarget) {
49297 unsigned Opcode = N->getOpcode();
49298 assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
49299 X86ISD::VSRLI == Opcode) &&
49300 "Unexpected shift opcode");
49301 bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
49302 EVT VT = N->getValueType(0);
49303 SDValue N0 = N->getOperand(0);
49304 SDValue N1 = N->getOperand(1);
49305 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
49306 assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
49307 "Unexpected value type");
49308 assert(N1.getValueType() == MVT::i8 && "Unexpected shift amount type");
49309
49310 // (shift undef, X) -> 0
49311 if (N0.isUndef())
49312 return DAG.getConstant(0, SDLoc(N), VT);
49313
49314 // Out of range logical bit shifts are guaranteed to be zero.
49315 // Out of range arithmetic bit shifts splat the sign bit.
49316 unsigned ShiftVal = N->getConstantOperandVal(1);
49317 if (ShiftVal >= NumBitsPerElt) {
49318 if (LogicalShift)
49319 return DAG.getConstant(0, SDLoc(N), VT);
49320 ShiftVal = NumBitsPerElt - 1;
49321 }
49322
49323 // (shift X, 0) -> X
49324 if (!ShiftVal)
49325 return N0;
49326
49327 // (shift 0, C) -> 0
49328 if (ISD::isBuildVectorAllZeros(N0.getNode()))
49329 // N0 is all zeros or undef. We guarantee that the bits shifted into the
49330 // result are all zeros, not undef.
49331 return DAG.getConstant(0, SDLoc(N), VT);
49332
49333 // (VSRAI -1, C) -> -1
49334 if (!LogicalShift && ISD::isBuildVectorAllOnes(N0.getNode()))
49335 // N0 is all ones or undef. We guarantee that the bits shifted into the
49336 // result are all ones, not undef.
49337 return DAG.getConstant(-1, SDLoc(N), VT);
49338
49339 auto MergeShifts = [&](SDValue X, uint64_t Amt0, uint64_t Amt1) {
49340 unsigned NewShiftVal = Amt0 + Amt1;
49341 if (NewShiftVal >= NumBitsPerElt) {
49342 // Out of range logical bit shifts are guaranteed to be zero.
49343 // Out of range arithmetic bit shifts splat the sign bit.
49344 if (LogicalShift)
49345 return DAG.getConstant(0, SDLoc(N), VT);
49346 NewShiftVal = NumBitsPerElt - 1;
49347 }
49348 return DAG.getNode(Opcode, SDLoc(N), VT, N0.getOperand(0),
49349 DAG.getTargetConstant(NewShiftVal, SDLoc(N), MVT::i8));
49350 };
49351
49352 // (shift (shift X, C2), C1) -> (shift X, (C1 + C2))
49353 if (Opcode == N0.getOpcode())
49354 return MergeShifts(N0.getOperand(0), ShiftVal, N0.getConstantOperandVal(1));
49355
49356 // (shl (add X, X), C) -> (shl X, (C + 1))
49357 if (Opcode == X86ISD::VSHLI && N0.getOpcode() == ISD::ADD &&
49358 N0.getOperand(0) == N0.getOperand(1))
49359 return MergeShifts(N0.getOperand(0), ShiftVal, 1);
49360
49361 // We can decode 'whole byte' logical bit shifts as shuffles.
49362 if (LogicalShift && (ShiftVal % 8) == 0) {
49363 SDValue Op(N, 0);
49364 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
49365 return Res;
49366 }
49367
49368 auto TryConstantFold = [&](SDValue V) {
49369 APInt UndefElts;
49370 SmallVector<APInt, 32> EltBits;
49371 if (!getTargetConstantBitsFromNode(V, NumBitsPerElt, UndefElts, EltBits))
49372 return SDValue();
49373 assert(EltBits.size() == VT.getVectorNumElements() &&
49374 "Unexpected shift value type");
49375 // Undef elements need to fold to 0. It's possible SimplifyDemandedBits
49376 // created an undef input due to no input bits being demanded, but the
49377 // user still expects 0 in the other bits.
49378 for (unsigned i = 0, e = EltBits.size(); i != e; ++i) {
49379 APInt &Elt = EltBits[i];
49380 if (UndefElts[i])
49381 Elt = 0;
49382 else if (X86ISD::VSHLI == Opcode)
49383 Elt <<= ShiftVal;
49384 else if (X86ISD::VSRAI == Opcode)
49385 Elt.ashrInPlace(ShiftVal);
49386 else
49387 Elt.lshrInPlace(ShiftVal);
49388 }
49389 // Reset undef elements since they were zeroed above.
49390 UndefElts = 0;
49391 return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
49392 };
49393
49394 // Constant Folding.
49395 if (N->isOnlyUserOf(N0.getNode())) {
49396 if (SDValue C = TryConstantFold(N0))
49397 return C;
49398
49399 // Fold (shift (logic X, C2), C1) -> (logic (shift X, C1), (shift C2, C1))
49400 // Don't break NOT patterns.
49401 SDValue BC = peekThroughOneUseBitcasts(N0);
49402 if (ISD::isBitwiseLogicOp(BC.getOpcode()) &&
49403 BC->isOnlyUserOf(BC.getOperand(1).getNode()) &&
49404 !ISD::isBuildVectorAllOnes(BC.getOperand(1).getNode())) {
49405 if (SDValue RHS = TryConstantFold(BC.getOperand(1))) {
49406 SDLoc DL(N);
49407 SDValue LHS = DAG.getNode(Opcode, DL, VT,
49408 DAG.getBitcast(VT, BC.getOperand(0)), N1);
49409 return DAG.getNode(BC.getOpcode(), DL, VT, LHS, RHS);
49410 }
49411 }
49412 }
49413
49414 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49415 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBitsPerElt),
49416 DCI))
49417 return SDValue(N, 0);
49418
49419 return SDValue();
49420}
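// A minimal compile-time sketch of the MergeShifts behaviour above for logical
// right shifts on i32 elements: consecutive immediate shifts add their amounts,
// and once the total reaches the element width the result is zero (arithmetic
// shifts instead splat the sign bit). The helper is illustrative only and
// assumes <cstdint> types.
static constexpr uint32_t Vsrli32Lane(uint32_t X, unsigned Amt) {
  return Amt >= 32 ? 0u : X >> Amt; // out-of-range logical shifts give zero
}
static_assert(Vsrli32Lane(Vsrli32Lane(0xF000000Fu, 12), 8) ==
                  Vsrli32Lane(0xF000000Fu, 20),
              "(srl (srl X, 12), 8) == (srl X, 20)");
static_assert(Vsrli32Lane(0xF000000Fu, 40) == 0u,
              "out-of-range logical shifts fold to zero");
static_assert(((0x01020304u + 0x01020304u) << 3) == (0x01020304u << 4),
              "(shl (add X, X), C) == (shl X, C + 1)");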
49421
49422static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
49423 TargetLowering::DAGCombinerInfo &DCI,
49424 const X86Subtarget &Subtarget) {
49425 EVT VT = N->getValueType(0);
49426 unsigned Opcode = N->getOpcode();
49427 assert(((Opcode == X86ISD::PINSRB && VT == MVT::v16i8) ||
49428 (Opcode == X86ISD::PINSRW && VT == MVT::v8i16) ||
49429 Opcode == ISD::INSERT_VECTOR_ELT) &&
49430 "Unexpected vector insertion");
49431
49432 SDValue Vec = N->getOperand(0);
49433 SDValue Scl = N->getOperand(1);
49434 SDValue Idx = N->getOperand(2);
49435
49436 // Fold insert_vector_elt(undef, elt, 0) --> scalar_to_vector(elt).
49437 if (Opcode == ISD::INSERT_VECTOR_ELT && Vec.isUndef() && isNullConstant(Idx))
49438 return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Scl);
49439
49440 if (Opcode == X86ISD::PINSRB || Opcode == X86ISD::PINSRW) {
49441 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
49442 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49443 if (TLI.SimplifyDemandedBits(SDValue(N, 0),
49444 APInt::getAllOnes(NumBitsPerElt), DCI))
49445 return SDValue(N, 0);
49446 }
49447
49448 // Attempt to combine insertion patterns to a shuffle.
49449 if (VT.isSimple() && DCI.isAfterLegalizeDAG()) {
49450 SDValue Op(N, 0);
49451 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
49452 return Res;
49453 }
49454
49455 return SDValue();
49456}
49457
49458/// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
49459/// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
49460/// OR -> CMPNEQSS.
49461static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
49462 TargetLowering::DAGCombinerInfo &DCI,
49463 const X86Subtarget &Subtarget) {
49464 unsigned opcode;
49465
49466 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
49467 // we're requiring SSE2 for both.
49468 if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
49469 SDValue N0 = N->getOperand(0);
49470 SDValue N1 = N->getOperand(1);
49471 SDValue CMP0 = N0.getOperand(1);
49472 SDValue CMP1 = N1.getOperand(1);
49473 SDLoc DL(N);
49474
49475 // The SETCCs should both refer to the same CMP.
49476 if (CMP0.getOpcode() != X86ISD::FCMP || CMP0 != CMP1)
49477 return SDValue();
49478
49479 SDValue CMP00 = CMP0->getOperand(0);
49480 SDValue CMP01 = CMP0->getOperand(1);
49481 EVT VT = CMP00.getValueType();
49482
49483 if (VT == MVT::f32 || VT == MVT::f64 ||
49484 (VT == MVT::f16 && Subtarget.hasFP16())) {
49485 bool ExpectingFlags = false;
49486 // Check for any users that want flags:
49487 for (const SDNode *U : N->uses()) {
49488 if (ExpectingFlags)
49489 break;
49490
49491 switch (U->getOpcode()) {
49492 default:
49493 case ISD::BR_CC:
49494 case ISD::BRCOND:
49495 case ISD::SELECT:
49496 ExpectingFlags = true;
49497 break;
49498 case ISD::CopyToReg:
49499 case ISD::SIGN_EXTEND:
49500 case ISD::ZERO_EXTEND:
49501 case ISD::ANY_EXTEND:
49502 break;
49503 }
49504 }
49505
49506 if (!ExpectingFlags) {
49507 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
49508 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
49509
49510 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
49511 X86::CondCode tmp = cc0;
49512 cc0 = cc1;
49513 cc1 = tmp;
49514 }
49515
49516 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
49517 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
49518 // FIXME: need symbolic constants for these magic numbers.
49519 // See X86ATTInstPrinter.cpp:printSSECC().
49520 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
49521 if (Subtarget.hasAVX512()) {
49522 SDValue FSetCC =
49523 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
49524 DAG.getTargetConstant(x86cc, DL, MVT::i8));
49525 // Need to fill with zeros to ensure the bitcast will produce zeroes
49526 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
49527 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
49528 DAG.getConstant(0, DL, MVT::v16i1),
49529 FSetCC, DAG.getIntPtrConstant(0, DL));
49530 return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL,
49531 N->getSimpleValueType(0));
49532 }
49533 SDValue OnesOrZeroesF =
49534 DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00,
49535 CMP01, DAG.getTargetConstant(x86cc, DL, MVT::i8));
49536
49537 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
49538 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
49539
49540 if (is64BitFP && !Subtarget.is64Bit()) {
49541 // On a 32-bit target, we cannot bitcast the 64-bit float to a
49542 // 64-bit integer, since that's not a legal type. Since
49543 // OnesOrZeroesF is all ones or all zeroes, we don't need all the
49544 // bits, but can do this little dance to extract the lowest 32 bits
49545 // and work with those going forward.
49546 SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
49547 OnesOrZeroesF);
49548 SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
49549 OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
49550 Vector32, DAG.getIntPtrConstant(0, DL));
49551 IntVT = MVT::i32;
49552 }
49553
49554 SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
49555 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
49556 DAG.getConstant(1, DL, IntVT));
49557 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
49558 ANDed);
49559 return OneBitOfTruth;
49560 }
49561 }
49562 }
49563 }
49564 return SDValue();
49565}
49566
49567/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
49568static SDValue combineAndNotIntoANDNP(SDNode *N, SelectionDAG &DAG) {
49569 assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
49570
49571 MVT VT = N->getSimpleValueType(0);
49572 if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
49573 return SDValue();
49574
49575 SDValue X, Y;
49576 SDValue N0 = N->getOperand(0);
49577 SDValue N1 = N->getOperand(1);
49578
49579 if (SDValue Not = IsNOT(N0, DAG)) {
49580 X = Not;
49581 Y = N1;
49582 } else if (SDValue Not = IsNOT(N1, DAG)) {
49583 X = Not;
49584 Y = N0;
49585 } else
49586 return SDValue();
49587
49588 X = DAG.getBitcast(VT, X);
49589 Y = DAG.getBitcast(VT, Y);
49590 return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, Y);
49591}
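// A minimal compile-time sketch of why the fold above is sound: xor with all-ones
// is bitwise NOT, so (and (xor X, -1), Y) is exactly the ~X & Y that ANDNP
// computes per lane. Assumes only <cstdint>-style unsigned arithmetic.
static_assert(((0x12345678u ^ 0xFFFFFFFFu) & 0x0FF00FF0u) ==
                  (~0x12345678u & 0x0FF00FF0u),
              "and(xor(X, -1), Y) == andnp(X, Y)");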
49592
49593/// Try to fold:
49594/// and (vector_shuffle<Z,...,Z>
49595/// (insert_vector_elt undef, (xor X, -1), Z), undef), Y
49596/// ->
49597/// andnp (vector_shuffle<Z,...,Z>
49598/// (insert_vector_elt undef, X, Z), undef), Y
49599static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG,
49600 const X86Subtarget &Subtarget) {
49601 assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
49602
49603 EVT VT = N->getValueType(0);
49604 // Do not split 256 and 512 bit vectors with SSE2 as they overwrite original
49605 // value and require extra moves.
49606 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
49607 ((VT.is256BitVector() || VT.is512BitVector()) && Subtarget.hasAVX())))
49608 return SDValue();
49609
49610 auto GetNot = [&DAG](SDValue V) {
49611 auto *SVN = dyn_cast<ShuffleVectorSDNode>(peekThroughOneUseBitcasts(V));
49612 // TODO: SVN->hasOneUse() is a strong condition. It can be relaxed if all
49613 // end-users are ISD::AND including cases
49614 // (and(extract_vector_element(SVN), Y)).
49615 if (!SVN || !SVN->hasOneUse() || !SVN->isSplat() ||
49616 !SVN->getOperand(1).isUndef()) {
49617 return SDValue();
49618 }
49619 SDValue IVEN = SVN->getOperand(0);
49620 if (IVEN.getOpcode() != ISD::INSERT_VECTOR_ELT ||
49621 !IVEN.getOperand(0).isUndef() || !IVEN.hasOneUse())
49622 return SDValue();
49623 if (!isa<ConstantSDNode>(IVEN.getOperand(2)) ||
49624 IVEN.getConstantOperandAPInt(2) != SVN->getSplatIndex())
49625 return SDValue();
49626 SDValue Src = IVEN.getOperand(1);
49627 if (SDValue Not = IsNOT(Src, DAG)) {
49628 SDValue NotSrc = DAG.getBitcast(Src.getValueType(), Not);
49629 SDValue NotIVEN =
49630 DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(IVEN), IVEN.getValueType(),
49631 IVEN.getOperand(0), NotSrc, IVEN.getOperand(2));
49632 return DAG.getVectorShuffle(SVN->getValueType(0), SDLoc(SVN), NotIVEN,
49633 SVN->getOperand(1), SVN->getMask());
49634 }
49635 return SDValue();
49636 };
49637
49638 SDValue X, Y;
49639 SDValue N0 = N->getOperand(0);
49640 SDValue N1 = N->getOperand(1);
49641
49642 if (SDValue Not = GetNot(N0)) {
49643 X = Not;
49644 Y = N1;
49645 } else if (SDValue Not = GetNot(N1)) {
49646 X = Not;
49647 Y = N0;
49648 } else
49649 return SDValue();
49650
49651 X = DAG.getBitcast(VT, X);
49652 Y = DAG.getBitcast(VT, Y);
49653 SDLoc DL(N);
49654 // We do not split for SSE at all, but we need to split vectors for AVX1 and
49655 // AVX2.
49656 if (!Subtarget.useAVX512Regs() && VT.is512BitVector()) {
49657 SDValue LoX, HiX;
49658 std::tie(LoX, HiX) = splitVector(X, DAG, DL);
49659 SDValue LoY, HiY;
49660 std::tie(LoY, HiY) = splitVector(Y, DAG, DL);
49661 EVT SplitVT = LoX.getValueType();
49662 SDValue LoV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {LoX, LoY});
49663 SDValue HiV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {HiX, HiY});
49664 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, {LoV, HiV});
49665 }
49666 return DAG.getNode(X86ISD::ANDNP, DL, VT, {X, Y});
49667}
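// Illustrative note (not part of the original source): in the 512-bit case
// without useAVX512Regs(), the ANDNP above is emitted as two 256-bit halves,
//   concat_vectors (andnp LoX, LoY), (andnp HiX, HiY)
// rather than a single 512-bit node, per the splitting comment for AVX1/AVX2.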
49668
49669// Try to widen AND, OR and XOR nodes to VT in order to remove casts around
49670// logical operations, like in the example below.
49671// or (and (truncate x, truncate y)),
49672// (xor (truncate z, build_vector (constants)))
49673// Given a target type \p VT, we generate
49674// or (and x, y), (xor z, zext(build_vector (constants)))
49675// given that x, y and z are of type \p VT. We can do so if each operand is
49676// either a truncate from VT, a vector of constants (second operand only), or
49677// can be recursively promoted.
49678static SDValue PromoteMaskArithmetic(SDNode *N, EVT VT, SelectionDAG &DAG,
49679 unsigned Depth) {
49680 // Limit recursion to avoid excessive compile times.
49681 if (Depth >= SelectionDAG::MaxRecursionDepth)
49682 return SDValue();
49683
49684 if (N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND &&
49685 N->getOpcode() != ISD::OR)
49686 return SDValue();
49687
49688 SDValue N0 = N->getOperand(0);
49689 SDValue N1 = N->getOperand(1);
49690 SDLoc DL(N);
49691
49692 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49693 if (!TLI.isOperationLegalOrPromote(N->getOpcode(), VT))
49694 return SDValue();
49695
49696 if (SDValue NN0 = PromoteMaskArithmetic(N0.getNode(), VT, DAG, Depth + 1))
49697 N0 = NN0;
49698 else {
49699 // The Left side has to be a trunc.
49700 if (N0.getOpcode() != ISD::TRUNCATE)
49701 return SDValue();
49702
49703 // The type of the truncated inputs.
49704 if (N0.getOperand(0).getValueType() != VT)
49705 return SDValue();
49706
49707 N0 = N0.getOperand(0);
49708 }
49709
49710 if (SDValue NN1 = PromoteMaskArithmetic(N1.getNode(), VT, DAG, Depth + 1))
49711 N1 = NN1;
49712 else {
49713 // The right side has to be a 'trunc' or a constant vector.
49714 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
49715 N1.getOperand(0).getValueType() == VT;
49716 if (!RHSTrunc && !ISD::isBuildVectorOfConstantSDNodes(N1.getNode()))
49717 return SDValue();
49718
49719 if (RHSTrunc)
49720 N1 = N1.getOperand(0);
49721 else
49722 N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1);
49723 }
49724
49725 return DAG.getNode(N->getOpcode(), DL, VT, N0, N1);
49726}
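// Illustrative example (not part of the original source) of the promotion
// above, with VT = v8i32:
//   and (truncate x), (truncate y)          --> and x, y
//   xor (truncate z), build_vector(C, ...)  --> xor z, zext(build_vector(C, ...))
// Truncated operands are replaced by their wide sources and constant vectors
// are zero-extended to VT.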
49727
49728// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
49729// register. In most cases we actually compare or select YMM-sized registers
49730// and mixing the two types creates horrible code. This method optimizes
49731// some of the transition sequences.
49732// Even with AVX-512 this is still useful for removing casts around logical
49733// operations on vXi1 mask types.
49734static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG,
49735 const X86Subtarget &Subtarget) {
49736 EVT VT = N->getValueType(0);
49737 assert(VT.isVector() && "Expected vector type");
49738
49739 SDLoc DL(N);
49740 assert((N->getOpcode() == ISD::ANY_EXTEND ||
49741         N->getOpcode() == ISD::ZERO_EXTEND ||
49742         N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
49743
49744 SDValue Narrow = N->getOperand(0);
49745 EVT NarrowVT = Narrow.getValueType();
49746
49747 // Generate the wide operation.
49748 SDValue Op = PromoteMaskArithmetic(Narrow.getNode(), VT, DAG, 0);
49749 if (!Op)
49750 return SDValue();
49751 switch (N->getOpcode()) {
49752 default: llvm_unreachable("Unexpected opcode");
49753 case ISD::ANY_EXTEND:
49754 return Op;
49755 case ISD::ZERO_EXTEND:
49756 return DAG.getZeroExtendInReg(Op, DL, NarrowVT);
49757 case ISD::SIGN_EXTEND:
49758 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
49759 Op, DAG.getValueType(NarrowVT));
49760 }
49761}
49762
49763static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode) {
49764 unsigned FPOpcode;
49765 switch (Opcode) {
49766 default: llvm_unreachable("Unexpected input node for FP logic conversion");
49767 case ISD::AND: FPOpcode = X86ISD::FAND; break;
49768 case ISD::OR: FPOpcode = X86ISD::FOR; break;
49769 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
49770 }
49771 return FPOpcode;
49772}
49773
49774/// If both input operands of a logic op are being cast from floating-point
49775/// types or FP compares, try to convert this into a floating-point logic node
49776/// to avoid unnecessary moves from SSE to integer registers.
49777static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
49778 TargetLowering::DAGCombinerInfo &DCI,
49779 const X86Subtarget &Subtarget) {
49780 EVT VT = N->getValueType(0);
49781 SDValue N0 = N->getOperand(0);
49782 SDValue N1 = N->getOperand(1);
49783 SDLoc DL(N);
49784
49785 if (!((N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) ||
49786 (N0.getOpcode() == ISD::SETCC && N1.getOpcode() == ISD::SETCC)))
49787 return SDValue();
49788
49789 SDValue N00 = N0.getOperand(0);
49790 SDValue N10 = N1.getOperand(0);
49791 EVT N00Type = N00.getValueType();
49792 EVT N10Type = N10.getValueType();
49793
49794 // Ensure that both types are the same and are legal scalar fp types.
49795 if (N00Type != N10Type || !((Subtarget.hasSSE1() && N00Type == MVT::f32) ||
49796 (Subtarget.hasSSE2() && N00Type == MVT::f64) ||
49797 (Subtarget.hasFP16() && N00Type == MVT::f16)))
49798 return SDValue();
49799
49800 if (N0.getOpcode() == ISD::BITCAST && !DCI.isBeforeLegalizeOps()) {
49801 unsigned FPOpcode = convertIntLogicToFPLogicOpcode(N->getOpcode());
49802 SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
49803 return DAG.getBitcast(VT, FPLogic);
49804 }
49805
49806 if (VT != MVT::i1 || N0.getOpcode() != ISD::SETCC || !N0.hasOneUse() ||
49807 !N1.hasOneUse())
49808 return SDValue();
49809
49810 ISD::CondCode CC0 = cast<CondCodeSDNode>(N0.getOperand(2))->get();
49811 ISD::CondCode CC1 = cast<CondCodeSDNode>(N1.getOperand(2))->get();
49812
49813 // The vector ISA for FP predicates is incomplete before AVX, so converting
49814 // COMIS* to CMPS* may not be a win before AVX.
49815 if (!Subtarget.hasAVX() &&
49816 !(cheapX86FSETCC_SSE(CC0) && cheapX86FSETCC_SSE(CC1)))
49817 return SDValue();
49818
49819 // Convert scalar FP compares and logic to vector compares (COMIS* to CMPS*)
49820 // and vector logic:
49821 // logic (setcc N00, N01), (setcc N10, N11) -->
49822 // extelt (logic (setcc (s2v N00), (s2v N01)), (setcc (s2v N10), (s2v N11))), 0
49823 unsigned NumElts = 128 / N00Type.getSizeInBits();
49824 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), N00Type, NumElts);
49825 EVT BoolVecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
49826 SDValue ZeroIndex = DAG.getVectorIdxConstant(0, DL);
49827 SDValue N01 = N0.getOperand(1);
49828 SDValue N11 = N1.getOperand(1);
49829 SDValue Vec00 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N00);
49830 SDValue Vec01 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N01);
49831 SDValue Vec10 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N10);
49832 SDValue Vec11 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N11);
49833 SDValue Setcc0 = DAG.getSetCC(DL, BoolVecVT, Vec00, Vec01, CC0);
49834 SDValue Setcc1 = DAG.getSetCC(DL, BoolVecVT, Vec10, Vec11, CC1);
49835 SDValue Logic = DAG.getNode(N->getOpcode(), DL, BoolVecVT, Setcc0, Setcc1);
49836 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Logic, ZeroIndex);
49837}
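// Illustrative example (not part of the original source) of the setcc path
// above, for f32 with AVX:
//   and (setcc a, b, olt), (setcc c, d, olt)
// becomes
//   extract_vector_elt (and (setcc (s2v a), (s2v b), olt),
//                           (setcc (s2v c), (s2v d), olt)), 0
// using v4f32 operands (128 / 32 == 4 elements), trading scalar COMIS*-style
// compares for vector CMPS*-style compares plus vector logic.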
49838
49839// Attempt to fold BITOP(MOVMSK(X),MOVMSK(Y)) -> MOVMSK(BITOP(X,Y))
49840// to reduce XMM->GPR traffic.
49841static SDValue combineBitOpWithMOVMSK(SDNode *N, SelectionDAG &DAG) {
49842 unsigned Opc = N->getOpcode();
49843 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
49844        "Unexpected bit opcode");
49845
49846 SDValue N0 = N->getOperand(0);
49847 SDValue N1 = N->getOperand(1);
49848
49849 // Both operands must be single use MOVMSK.
49850 if (N0.getOpcode() != X86ISD::MOVMSK || !N0.hasOneUse() ||
49851 N1.getOpcode() != X86ISD::MOVMSK || !N1.hasOneUse())
49852 return SDValue();
49853
49854 SDValue Vec0 = N0.getOperand(0);
49855 SDValue Vec1 = N1.getOperand(0);
49856 EVT VecVT0 = Vec0.getValueType();
49857 EVT VecVT1 = Vec1.getValueType();
49858
49859 // Both MOVMSK operands must be from vectors of the same size and same element
49860 // size, but it's OK for an fp/int difference.
49861 if (VecVT0.getSizeInBits() != VecVT1.getSizeInBits() ||
49862 VecVT0.getScalarSizeInBits() != VecVT1.getScalarSizeInBits())
49863 return SDValue();
49864
49865 SDLoc DL(N);
49866 unsigned VecOpc =
49867 VecVT0.isFloatingPoint() ? convertIntLogicToFPLogicOpcode(Opc) : Opc;
49868 SDValue Result =
49869 DAG.getNode(VecOpc, DL, VecVT0, Vec0, DAG.getBitcast(VecVT0, Vec1));
49870 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
49871}
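// Illustrative example (not part of the original source): with X : v4f32 and
// Y : v4i32,
//   and (movmsk X), (movmsk Y)   -->   movmsk (fand X, (bitcast Y to v4f32))
// The first operand's vector type decides whether the FP or integer form of
// the bit op is used.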
49872
49873// Attempt to fold BITOP(SHIFT(X,Z),SHIFT(Y,Z)) -> SHIFT(BITOP(X,Y),Z).
49874// NOTE: This is a very limited case of what SimplifyUsingDistributiveLaws
49875// handles in InstCombine.
49876static SDValue combineBitOpWithShift(SDNode *N, SelectionDAG &DAG) {
49877 unsigned Opc = N->getOpcode();
49878 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
49879        "Unexpected bit opcode");
49880
49881 SDValue N0 = N->getOperand(0);
49882 SDValue N1 = N->getOperand(1);
49883 EVT VT = N->getValueType(0);
49884
49885 // Both operands must be single use.
49886 if (!N0.hasOneUse() || !N1.hasOneUse())
49887 return SDValue();
49888
49889 // Search for matching shifts.
49890 SDValue BC0 = peekThroughOneUseBitcasts(N0);
49891 SDValue BC1 = peekThroughOneUseBitcasts(N1);
49892
49893 unsigned BCOpc = BC0.getOpcode();
49894 EVT BCVT = BC0.getValueType();
49895 if (BCOpc != BC1->getOpcode() || BCVT != BC1.getValueType())
49896 return SDValue();
49897
49898 switch (BCOpc) {
49899 case X86ISD::VSHLI:
49900 case X86ISD::VSRLI:
49901 case X86ISD::VSRAI: {
49902 if (BC0.getOperand(1) != BC1.getOperand(1))
49903 return SDValue();
49904
49905 SDLoc DL(N);
49906 SDValue BitOp =
49907 DAG.getNode(Opc, DL, BCVT, BC0.getOperand(0), BC1.getOperand(0));
49908 SDValue Shift = DAG.getNode(BCOpc, DL, BCVT, BitOp, BC0.getOperand(1));
49909 return DAG.getBitcast(VT, Shift);
49910 }
49911 }
49912
49913 return SDValue();
49914}
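// Illustrative example (not part of the original source) of the fold above:
//   xor (vsrli X, 5), (vsrli Y, 5)   -->   vsrli (xor X, Y), 5
// which only fires when both shifts use the same opcode, type and amount.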
49915
49916/// If this is a zero/all-bits result that is bitwise-anded with a low-bits
49917/// mask (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
49918/// with a shift-right to eliminate loading the vector constant mask value.
49919static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
49920 const X86Subtarget &Subtarget) {
49921 SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
49922 SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
49923 EVT VT = Op0.getValueType();
49924 if (VT != Op1.getValueType() || !VT.isSimple() || !VT.isInteger())
49925 return SDValue();
49926
49927 // Try to convert an "is positive" signbit masking operation into arithmetic
49928 // shift and "andn". This saves a materialization of a -1 vector constant.
49929 // The "is negative" variant should be handled more generally because it only
49930 // requires "and" rather than "andn":
49931 // and (pcmpgt X, -1), Y --> pandn (vsrai X, BitWidth - 1), Y
49932 //
49933 // This is limited to the original type to avoid producing even more bitcasts.
49934 // If the bitcasts can't be eliminated, then it is unlikely that this fold
49935 // will be profitable.
49936 if (N->getValueType(0) == VT &&
49937 supportedVectorShiftWithImm(VT, Subtarget, ISD::SRA)) {
49938 SDValue X, Y;
49939 if (Op1.getOpcode() == X86ISD::PCMPGT &&
49940 isAllOnesOrAllOnesSplat(Op1.getOperand(1)) && Op1.hasOneUse()) {
49941 X = Op1.getOperand(0);
49942 Y = Op0;
49943 } else if (Op0.getOpcode() == X86ISD::PCMPGT &&
49944 isAllOnesOrAllOnesSplat(Op0.getOperand(1)) && Op0.hasOneUse()) {
49945 X = Op0.getOperand(0);
49946 Y = Op1;
49947 }
49948 if (X && Y) {
49949 SDLoc DL(N);
49950 SDValue Sra =
49951 getTargetVShiftByConstNode(X86ISD::VSRAI, DL, VT.getSimpleVT(), X,
49952 VT.getScalarSizeInBits() - 1, DAG);
49953 return DAG.getNode(X86ISD::ANDNP, DL, VT, Sra, Y);
49954 }
49955 }
49956
49957 APInt SplatVal;
49958 if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) ||
49959 !SplatVal.isMask())
49960 return SDValue();
49961
49962 // Don't prevent creation of ANDN.
49963 if (isBitwiseNot(Op0))
49964 return SDValue();
49965
49966 if (!supportedVectorShiftWithImm(VT, Subtarget, ISD::SRL))
49967 return SDValue();
49968
49969 unsigned EltBitWidth = VT.getScalarSizeInBits();
49970 if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
49971 return SDValue();
49972
49973 SDLoc DL(N);
49974 unsigned ShiftVal = SplatVal.countr_one();
49975 SDValue ShAmt = DAG.getTargetConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
49976 SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT, Op0, ShAmt);
49977 return DAG.getBitcast(N->getValueType(0), Shift);
49978}
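// Illustrative example (not part of the original source): for v4i32 where Op0
// is known to be all-sign-bits (ComputeNumSignBits == 32) and the mask is
// splat(0x1),
//   and Op0, splat(0x1)   -->   vsrli Op0, 31
// Each lane of Op0 is 0 or -1, so a logical shift right by 31 yields the same
// 0/1 result without materializing the constant mask.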
49979
49980// Get the index node from the lowered DAG of a GEP IR instruction with one
49981// indexing dimension.
49982static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
49983 if (Ld->isIndexed())
49984 return SDValue();
49985
49986 SDValue Base = Ld->getBasePtr();
49987
49988 if (Base.getOpcode() != ISD::ADD)
49989 return SDValue();
49990
49991 SDValue ShiftedIndex = Base.getOperand(0);
49992
49993 if (ShiftedIndex.getOpcode() != ISD::SHL)
49994 return SDValue();
49995
49996 return ShiftedIndex.getOperand(0);
49997
49998}
49999
50000static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
50001 if (Subtarget.hasBMI2() && VT.isScalarInteger()) {
50002 switch (VT.getSizeInBits()) {
50003 default: return false;
50004 case 64: return Subtarget.is64Bit();
50005 case 32: return true;
50006 }
50007 }
50008 return false;
50009}
50010
50011// This function recognizes cases where the X86 bzhi instruction can replace an
50012// 'and-load' sequence.
50013// When an integer value is loaded from an array of constants that is defined
50014// as follows:
50015//
50016// int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
50017//
50018// and the result is then bitwise-anded with another input, this is equivalent
50019// to performing bzhi (zero high bits) on that input, using the same index as
50020// the load.
50021static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
50022 const X86Subtarget &Subtarget) {
50023 MVT VT = Node->getSimpleValueType(0);
50024 SDLoc dl(Node);
50025
50026 // Check if subtarget has BZHI instruction for the node's type
50027 if (!hasBZHI(Subtarget, VT))
50028 return SDValue();
50029
50030 // Try matching the pattern for both operands.
50031 for (unsigned i = 0; i < 2; i++) {
50032 SDValue N = Node->getOperand(i);
50033 LoadSDNode *Ld = dyn_cast<LoadSDNode>(N.getNode());
50034
50035 // Bail out if the operand is not a load instruction.
50036 if (!Ld)
50037 return SDValue();
50038
50039 const Value *MemOp = Ld->getMemOperand()->getValue();
50040
50041 if (!MemOp)
50042 return SDValue();
50043
50044 if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
50045 if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
50046 if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
50047
50048 Constant *Init = GV->getInitializer();
50049 Type *Ty = Init->getType();
50050 if (!isa<ConstantDataArray>(Init) ||
50051 !Ty->getArrayElementType()->isIntegerTy() ||
50052 Ty->getArrayElementType()->getScalarSizeInBits() !=
50053 VT.getSizeInBits() ||
50054 Ty->getArrayNumElements() >
50055 Ty->getArrayElementType()->getScalarSizeInBits())
50056 continue;
50057
50058 // Check if the array's constant elements are suitable to our case.
50059 uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
50060 bool ConstantsMatch = true;
50061 for (uint64_t j = 0; j < ArrayElementCount; j++) {
50062 auto *Elem = cast<ConstantInt>(Init->getAggregateElement(j));
50063 if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
50064 ConstantsMatch = false;
50065 break;
50066 }
50067 }
50068 if (!ConstantsMatch)
50069 continue;
50070
50071 // Do the transformation (For 32-bit type):
50072 // -> (and (load arr[idx]), inp)
50073 // <- (and (srl 0xFFFFFFFF, (sub 32, idx)))
50074 // that will be replaced with one bzhi instruction.
50075 SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0);
50076 SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, MVT::i32);
50077
50078 // Get the Node which indexes into the array.
50079 SDValue Index = getIndexFromUnindexedLoad(Ld);
50080 if (!Index)
50081 return SDValue();
50082 Index = DAG.getZExtOrTrunc(Index, dl, MVT::i32);
50083
50084 SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, SizeC, Index);
50085 Sub = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Sub);
50086
50087 SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
50088 SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);
50089
50090 return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
50091 }
50092 }
50093 }
50094 }
50095 return SDValue();
50096}
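// Illustrative C-level sketch (not part of the original source) of the
// pattern the combine above targets, assuming BMI2 and 32-bit elements:
//   static const uint32_t Tbl[32] = {0x0, 0x1, 0x3, 0x7, /* 2^k - 1, ... */};
//   uint32_t f(uint32_t In, unsigned Idx) { return In & Tbl[Idx]; }
// The table load plus AND is rewritten here as
//   and In, (srl all-ones, (32 - Idx))
// which is later selected as a single BZHI.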
50097
50098// Look for (and (bitcast (vXi1 (concat_vectors (vYi1 setcc), undef,))), C)
50099// where C is a mask containing the same number of bits as the setcc and
50100// where the setcc will freely zero the upper bits of the k-register. We can
50101// replace the undef in the concat with 0s and remove the AND. This mainly
50102// helps with v2i1/v4i1 setccs being cast to scalar.
50103static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG,
50104 const X86Subtarget &Subtarget) {
50105 assert(N->getOpcode() == ISD::AND && "Unexpected opcode!");
50106
50107 EVT VT = N->getValueType(0);
50108
50109 // Make sure this is an AND with constant. We will check the value of the
50110 // constant later.
50111 auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(1));
50112 if (!C1)
50113 return SDValue();
50114
50115 // This is implied by the ConstantSDNode.
50116 assert(!VT.isVector() && "Expected scalar VT!");
50117
50118 SDValue Src = N->getOperand(0);
50119 if (!Src.hasOneUse())
50120 return SDValue();
50121
50122 // (Optionally) peek through any_extend().
50123 if (Src.getOpcode() == ISD::ANY_EXTEND) {
50124 if (!Src.getOperand(0).hasOneUse())
50125 return SDValue();
50126 Src = Src.getOperand(0);
50127 }
50128
50129 if (Src.getOpcode() != ISD::BITCAST || !Src.getOperand(0).hasOneUse())
50130 return SDValue();
50131
50132 Src = Src.getOperand(0);
50133 EVT SrcVT = Src.getValueType();
50134
50135 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50136 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::i1 ||
50137 !TLI.isTypeLegal(SrcVT))
50138 return SDValue();
50139
50140 if (Src.getOpcode() != ISD::CONCAT_VECTORS)
50141 return SDValue();
50142
50143 // We only care about the first subvector of the concat; we expect the
50144 // other subvectors to be ignored due to the AND if we make the change.
50145 SDValue SubVec = Src.getOperand(0);
50146 EVT SubVecVT = SubVec.getValueType();
50147
50148 // The RHS of the AND should be a mask with as many bits as SubVec.
50149 if (!TLI.isTypeLegal(SubVecVT) ||
50150 !C1->getAPIntValue().isMask(SubVecVT.getVectorNumElements()))
50151 return SDValue();
50152
50153 // The first subvector should be a setcc with a legal result type or an
50154 // AND containing at least one setcc with a legal result type.
50155 auto IsLegalSetCC = [&](SDValue V) {
50156 if (V.getOpcode() != ISD::SETCC)
50157 return false;
50158 EVT SetccVT = V.getOperand(0).getValueType();
50159 if (!TLI.isTypeLegal(SetccVT) ||
50160 !(Subtarget.hasVLX() || SetccVT.is512BitVector()))
50161 return false;
50162 if (!(Subtarget.hasBWI() || SetccVT.getScalarSizeInBits() >= 32))
50163 return false;
50164 return true;
50165 };
50166 if (!(IsLegalSetCC(SubVec) || (SubVec.getOpcode() == ISD::AND &&
50167 (IsLegalSetCC(SubVec.getOperand(0)) ||
50168 IsLegalSetCC(SubVec.getOperand(1))))))
50169 return SDValue();
50170
50171 // We passed all the checks. Rebuild the concat_vectors with zeroes
50172 // and cast it back to VT.
50173 SDLoc dl(N);
50174 SmallVector<SDValue, 4> Ops(Src.getNumOperands(),
50175 DAG.getConstant(0, dl, SubVecVT));
50176 Ops[0] = SubVec;
50177 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT,
50178 Ops);
50179 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), SrcVT.getSizeInBits());
50180 return DAG.getZExtOrTrunc(DAG.getBitcast(IntVT, Concat), dl, VT);
50181}
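// Illustrative example (not part of the original source): for a v4i1 setcc
// with AVX-512,
//   and (bitcast (concat_vectors (v4i1 setcc), undef, ...) to iN), 0xF
// becomes
//   bitcast (concat_vectors (v4i1 setcc), zero, ...) to iN
// i.e. the undef subvectors are replaced by zeroes so the masking AND with the
// low-bits constant can be dropped.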
50182
50183static SDValue getBMIMatchingOp(unsigned Opc, SelectionDAG &DAG,
50184 SDValue OpMustEq, SDValue Op, unsigned Depth) {
50185 // We don't want to go crazy with the recursion here. This isn't a super
50186 // important optimization.
50187 static constexpr unsigned kMaxDepth = 2;
50188
50189 // Only do this re-ordering if op has one use.
50190 if (!Op.hasOneUse())
50191 return SDValue();
50192
50193 SDLoc DL(Op);
50194 // If we hit another associative op, recurse further.
50195 if (Op.getOpcode() == Opc) {
50196 // Done recursing.
50197 if (Depth++ >= kMaxDepth)
50198 return SDValue();
50199
50200 for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx)
50201 if (SDValue R =
50202 getBMIMatchingOp(Opc, DAG, OpMustEq, Op.getOperand(OpIdx), Depth))
50203 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(), R,
50204 Op.getOperand(1 - OpIdx));
50205
50206 } else if (Op.getOpcode() == ISD::SUB) {
50207 if (Opc == ISD::AND) {
50208 // BLSI: (and x, (sub 0, x))
50209 if (isNullConstant(Op.getOperand(0)) && Op.getOperand(1) == OpMustEq)
50210 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
50211 }
50212 // Opc must be ISD::AND or ISD::XOR
50213 // BLSR: (and x, (sub x, 1))
50214 // BLSMSK: (xor x, (sub x, 1))
50215 if (isOneConstant(Op.getOperand(1)) && Op.getOperand(0) == OpMustEq)
50216 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
50217
50218 } else if (Op.getOpcode() == ISD::ADD) {
50219 // Opc must be ISD::AND or ISD::XOR
50220 // BLSR: (and x, (add x, -1))
50221 // BLSMSK: (xor x, (add x, -1))
50222 if (isAllOnesConstant(Op.getOperand(1)) && Op.getOperand(0) == OpMustEq)
50223 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
50224 }
50225 return SDValue();
50226}
50227
50228static SDValue combineBMILogicOp(SDNode *N, SelectionDAG &DAG,
50229 const X86Subtarget &Subtarget) {
50230 EVT VT = N->getValueType(0);
50231 // Make sure this node is a candidate for BMI instructions.
50232 if (!Subtarget.hasBMI() || !VT.isScalarInteger() ||
50233 (VT != MVT::i32 && VT != MVT::i64))
50234 return SDValue();
50235
50236 assert(N->getOpcode() == ISD::AND || N->getOpcode() == ISD::XOR);
50237
50238 // Try and match LHS and RHS.
50239 for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx)
50240 if (SDValue OpMatch =
50241 getBMIMatchingOp(N->getOpcode(), DAG, N->getOperand(OpIdx),
50242 N->getOperand(1 - OpIdx), 0))
50243 return OpMatch;
50244 return SDValue();
50245}
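// Illustrative note (not part of the original source) on the BMI patterns
// matched above:
//   BLSI:   x & (0 - x)   isolates the lowest set bit of x
//   BLSR:   x & (x - 1)   clears the lowest set bit of x
//   BLSMSK: x ^ (x - 1)   masks up to and including the lowest set bit of x
// getBMIMatchingOp() also re-associates through up to two extra levels of the
// same AND/XOR, so e.g. (and x, (and y, (sub x, 1))) is rewritten as
// (and (and x, (sub x, 1)), y), exposing the BLSR.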
50246
50247static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
50248 TargetLowering::DAGCombinerInfo &DCI,
50249 const X86Subtarget &Subtarget) {
50250 SDValue N0 = N->getOperand(0);
50251 SDValue N1 = N->getOperand(1);
50252 EVT VT = N->getValueType(0);
50253 SDLoc dl(N);
50254 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50255
50256 // If this is SSE1 only convert to FAND to avoid scalarization.
50257 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
50258 return DAG.getBitcast(MVT::v4i32,
50259 DAG.getNode(X86ISD::FAND, dl, MVT::v4f32,
50260 DAG.getBitcast(MVT::v4f32, N0),
50261 DAG.getBitcast(MVT::v4f32, N1)));
50262 }
50263
50264 // Use a 32-bit and+zext if upper bits known zero.
50265 if (VT == MVT::i64 && Subtarget.is64Bit() && !isa<ConstantSDNode>(N1)) {
50266 APInt HiMask = APInt::getHighBitsSet(64, 32);
50267 if (DAG.MaskedValueIsZero(N1, HiMask) ||
50268 DAG.MaskedValueIsZero(N0, HiMask)) {
50269 SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N0);
50270 SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N1);
50271 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64,
50272 DAG.getNode(ISD::AND, dl, MVT::i32, LHS, RHS));
50273 }
50274 }
50275
50276 // Match all-of bool scalar reductions into a bitcast/movmsk + cmp.
50277 // TODO: Support multiple SrcOps.
50278 if (VT == MVT::i1) {
50279 SmallVector<SDValue, 2> SrcOps;
50280 SmallVector<APInt, 2> SrcPartials;
50281 if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps, &SrcPartials) &&
50282 SrcOps.size() == 1) {
50283 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
50284 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
50285 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
50286 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
50287 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
50288 if (Mask) {
50289 assert(SrcPartials[0].getBitWidth() == NumElts &&
50290        "Unexpected partial reduction mask");
50291 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
50292 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
50293 return DAG.getSetCC(dl, MVT::i1, Mask, PartialBits, ISD::SETEQ);
50294 }
50295 }
50296 }
50297
50298 if (SDValue V = combineScalarAndWithMaskSetcc(N, DAG, Subtarget))
50299 return V;
50300
50301 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
50302 return R;
50303
50304 if (SDValue R = combineBitOpWithShift(N, DAG))
50305 return R;
50306
50307 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
50308 return FPLogic;
50309
50310 if (SDValue R = combineAndShuffleNot(N, DAG, Subtarget))
50311 return R;
50312
50313 if (DCI.isBeforeLegalizeOps())
50314 return SDValue();
50315
50316 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
50317 return R;
50318
50319 if (SDValue R = combineAndNotIntoANDNP(N, DAG))
50320 return R;
50321
50322 if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
50323 return ShiftRight;
50324
50325 if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
50326 return R;
50327
50328 // fold (and (mul x, c1), c2) -> (mul x, (and c1, c2))
50329 // iff c2 is all/no bits mask - i.e. a select-with-zero mask.
50330 // TODO: Handle PMULDQ/PMULUDQ/VPMADDWD/VPMADDUBSW?
50331 if (VT.isVector() && getTargetConstantFromNode(N1)) {
50332 unsigned Opc0 = N0.getOpcode();
50333 if ((Opc0 == ISD::MUL || Opc0 == ISD::MULHU || Opc0 == ISD::MULHS) &&
50334 getTargetConstantFromNode(N0.getOperand(1)) &&
50335 DAG.ComputeNumSignBits(N1) == VT.getScalarSizeInBits() &&
50336 N0->hasOneUse() && N0.getOperand(1)->hasOneUse()) {
50337 SDValue MaskMul = DAG.getNode(ISD::AND, dl, VT, N0.getOperand(1), N1);
50338 return DAG.getNode(Opc0, dl, VT, N0.getOperand(0), MaskMul);
50339 }
50340 }
50341
50342 // Fold AND(SRL(X,Y),1) -> SETCC(BT(X,Y), COND_B) iff Y is not a constant
50343 // avoids slow variable shift (moving shift amount to ECX etc.)
50344 if (isOneConstant(N1) && N0->hasOneUse()) {
50345 SDValue Src = N0;
50346 while ((Src.getOpcode() == ISD::ZERO_EXTEND ||
50347 Src.getOpcode() == ISD::TRUNCATE) &&
50348 Src.getOperand(0)->hasOneUse())
50349 Src = Src.getOperand(0);
50350 bool ContainsNOT = false;
50351 X86::CondCode X86CC = X86::COND_B;
50352 // Peek through AND(NOT(SRL(X,Y)),1).
50353 if (isBitwiseNot(Src)) {
50354 Src = Src.getOperand(0);
50355 X86CC = X86::COND_AE;
50356 ContainsNOT = true;
50357 }
50358 if (Src.getOpcode() == ISD::SRL &&
50359 !isa<ConstantSDNode>(Src.getOperand(1))) {
50360 SDValue BitNo = Src.getOperand(1);
50361 Src = Src.getOperand(0);
50362 // Peek through AND(SRL(NOT(X),Y),1).
50363 if (isBitwiseNot(Src)) {
50364 Src = Src.getOperand(0);
50365 X86CC = X86CC == X86::COND_AE ? X86::COND_B : X86::COND_AE;
50366 ContainsNOT = true;
50367 }
50368 // If we have BMI2 then SHRX should be faster for i32/i64 cases.
50369 if (!(Subtarget.hasBMI2() && !ContainsNOT && VT.getSizeInBits() >= 32))
50370 if (SDValue BT = getBT(Src, BitNo, dl, DAG))
50371 return DAG.getZExtOrTrunc(getSETCC(X86CC, BT, dl, DAG), dl, VT);
50372 }
50373 }
50374
50375 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
50376 // Attempt to recursively combine a bitmask AND with shuffles.
50377 SDValue Op(N, 0);
50378 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
50379 return Res;
50380
50381 // If either operand is a constant mask, then only the elements that aren't
50382 // zero are actually demanded by the other operand.
50383 auto GetDemandedMasks = [&](SDValue Op) {
50384 APInt UndefElts;
50385 SmallVector<APInt> EltBits;
50386 int NumElts = VT.getVectorNumElements();
50387 int EltSizeInBits = VT.getScalarSizeInBits();
50388 APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
50389 APInt DemandedElts = APInt::getAllOnes(NumElts);
50390 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
50391 EltBits)) {
50392 DemandedBits.clearAllBits();
50393 DemandedElts.clearAllBits();
50394 for (int I = 0; I != NumElts; ++I) {
50395 if (UndefElts[I]) {
50396 // We can't assume an undef src element gives an undef dst - the
50397 // other src might be zero.
50398 DemandedBits.setAllBits();
50399 DemandedElts.setBit(I);
50400 } else if (!EltBits[I].isZero()) {
50401 DemandedBits |= EltBits[I];
50402 DemandedElts.setBit(I);
50403 }
50404 }
50405 }
50406 return std::make_pair(DemandedBits, DemandedElts);
50407 };
50408 APInt Bits0, Elts0;
50409 APInt Bits1, Elts1;
50410 std::tie(Bits0, Elts0) = GetDemandedMasks(N1);
50411 std::tie(Bits1, Elts1) = GetDemandedMasks(N0);
50412
50413 if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) ||
50414 TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) ||
50415 TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) ||
50416 TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) {
50417 if (N->getOpcode() != ISD::DELETED_NODE)
50418 DCI.AddToWorklist(N);
50419 return SDValue(N, 0);
50420 }
50421
50422 SDValue NewN0 = TLI.SimplifyMultipleUseDemandedBits(N0, Bits0, Elts0, DAG);
50423 SDValue NewN1 = TLI.SimplifyMultipleUseDemandedBits(N1, Bits1, Elts1, DAG);
50424 if (NewN0 || NewN1)
50425 return DAG.getNode(ISD::AND, dl, VT, NewN0 ? NewN0 : N0,
50426 NewN1 ? NewN1 : N1);
50427 }
50428
50429 // Attempt to combine a scalar bitmask AND with an extracted shuffle.
50430 if ((VT.getScalarSizeInBits() % 8) == 0 &&
50431 N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
50432 isa<ConstantSDNode>(N0.getOperand(1))) {
50433 SDValue BitMask = N1;
50434 SDValue SrcVec = N0.getOperand(0);
50435 EVT SrcVecVT = SrcVec.getValueType();
50436
50437 // Check that the constant bitmask masks whole bytes.
50438 APInt UndefElts;
50439 SmallVector<APInt, 64> EltBits;
50440 if (VT == SrcVecVT.getScalarType() && N0->isOnlyUserOf(SrcVec.getNode()) &&
50441 getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
50442 llvm::all_of(EltBits, [](const APInt &M) {
50443 return M.isZero() || M.isAllOnes();
50444 })) {
50445 unsigned NumElts = SrcVecVT.getVectorNumElements();
50446 unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
50447 unsigned Idx = N0.getConstantOperandVal(1);
50448
50449 // Create a root shuffle mask from the byte mask and the extracted index.
50450 SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
50451 for (unsigned i = 0; i != Scale; ++i) {
50452 if (UndefElts[i])
50453 continue;
50454 int VecIdx = Scale * Idx + i;
50455 ShuffleMask[VecIdx] = EltBits[i].isZero() ? SM_SentinelZero : VecIdx;
50456 }
50457
50458 if (SDValue Shuffle = combineX86ShufflesRecursively(
50459 {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 1,
50460 X86::MaxShuffleCombineDepth,
50461 /*HasVarMask*/ false, /*AllowVarCrossLaneMask*/ true,
50462 /*AllowVarPerLaneMask*/ true, DAG, Subtarget))
50463 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Shuffle,
50464 N0.getOperand(1));
50465 }
50466 }
50467
50468 if (SDValue R = combineBMILogicOp(N, DAG, Subtarget))
50469 return R;
50470
50471 return SDValue();
50472}
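// Illustrative note (not part of the original source) on the AND(SRL(X,Y),1)
// fold in combineAnd above: for a non-constant shift amount Y,
//   and (srl X, Y), 1   -->   zext/trunc (setcc (bt X, Y), COND_B)
// A NOT wrapped around X or around the SRL flips the condition between COND_B
// and COND_AE; with BMI2 and no NOT, the i32/i64 cases are left alone because
// SHRX is cheap.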
50473
50474// Canonicalize OR(AND(X,C),AND(Y,~C)) -> OR(AND(X,C),ANDNP(C,Y))
50475static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG,
50476 const X86Subtarget &Subtarget) {
50477 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
50478
50479 MVT VT = N->getSimpleValueType(0);
50480 unsigned EltSizeInBits = VT.getScalarSizeInBits();
50481 if (!VT.isVector() || (EltSizeInBits % 8) != 0)
50482 return SDValue();
50483
50484 SDValue N0 = peekThroughBitcasts(N->getOperand(0));
50485 SDValue N1 = peekThroughBitcasts(N->getOperand(1));
50486 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND)
50487 return SDValue();
50488
50489 // On XOP we'll lower to PCMOV so accept one use. With AVX512, we can use
50490 // VPTERNLOG. Otherwise only do this if either mask has multiple uses already.
50491 if (!(Subtarget.hasXOP() || useVPTERNLOG(Subtarget, VT) ||
50492 !N0.getOperand(1).hasOneUse() || !N1.getOperand(1).hasOneUse()))
50493 return SDValue();
50494
50495 // Attempt to extract constant byte masks.
50496 APInt UndefElts0, UndefElts1;
50497 SmallVector<APInt, 32> EltBits0, EltBits1;
50498 if (!getTargetConstantBitsFromNode(N0.getOperand(1), 8, UndefElts0, EltBits0,
50499 false, false))
50500 return SDValue();
50501 if (!getTargetConstantBitsFromNode(N1.getOperand(1), 8, UndefElts1, EltBits1,
50502 false, false))
50503 return SDValue();
50504
50505 for (unsigned i = 0, e = EltBits0.size(); i != e; ++i) {
50506 // TODO - add UNDEF elts support.
50507 if (UndefElts0[i] || UndefElts1[i])
50508 return SDValue();
50509 if (EltBits0[i] != ~EltBits1[i])
50510 return SDValue();
50511 }
50512
50513 SDLoc DL(N);
50514
50515 if (useVPTERNLOG(Subtarget, VT)) {
50516 // Emit a VPTERNLOG node directly - 0xCA is the imm code for A?B:C.
50517 // VPTERNLOG is only available as vXi32/64-bit types.
50518 MVT OpSVT = EltSizeInBits == 32 ? MVT::i32 : MVT::i64;
50519 MVT OpVT =
50520 MVT::getVectorVT(OpSVT, VT.getSizeInBits() / OpSVT.getSizeInBits());
50521 SDValue A = DAG.getBitcast(OpVT, N0.getOperand(1));
50522 SDValue B = DAG.getBitcast(OpVT, N0.getOperand(0));
50523 SDValue C = DAG.getBitcast(OpVT, N1.getOperand(0));
50524 SDValue Imm = DAG.getTargetConstant(0xCA, DL, MVT::i8);
50525 SDValue Res = getAVX512Node(X86ISD::VPTERNLOG, DL, OpVT, {A, B, C, Imm},
50526 DAG, Subtarget);
50527 return DAG.getBitcast(VT, Res);
50528 }
50529
50530 SDValue X = N->getOperand(0);
50531 SDValue Y =
50532 DAG.getNode(X86ISD::ANDNP, DL, VT, DAG.getBitcast(VT, N0.getOperand(1)),
50533 DAG.getBitcast(VT, N1.getOperand(0)));
50534 return DAG.getNode(ISD::OR, DL, VT, X, Y);
50535}
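// Illustrative example (not part of the original source) of the
// canonicalization above, with byte-wise constant masks C and ~C:
//   or (and X, C), (and Y, ~C)
// becomes, when VPTERNLOG is usable, a single
//   X86ISD::VPTERNLOG C, X, Y, 0xCA   (imm 0xCA encodes A ? B : C, so X is
//                                      chosen where C is set, Y elsewhere)
// and otherwise
//   or (and X, C), (andnp C, Y)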
50536
50537// Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern.
50538static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
50539 if (N->getOpcode() != ISD::OR)
50540 return false;
50541
50542 SDValue N0 = N->getOperand(0);
50543 SDValue N1 = N->getOperand(1);
50544
50545 // Canonicalize AND to LHS.
50546 if (N1.getOpcode() == ISD::AND)
50547 std::swap(N0, N1);
50548
50549 // Attempt to match OR(AND(M,Y),ANDNP(M,X)).
50550 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
50551 return false;
50552
50553 Mask = N1.getOperand(0);
50554 X = N1.getOperand(1);
50555
50556 // Check to see if the mask appeared in both the AND and ANDNP.
50557 if (N0.getOperand(0) == Mask)
50558 Y = N0.getOperand(1);
50559 else if (N0.getOperand(1) == Mask)
50560 Y = N0.getOperand(0);
50561 else
50562 return false;
50563
50564 // TODO: Attempt to match against AND(XOR(-1,M),Y) as well; waiting for the
50565 // ANDNP combine allows other combines to happen that prevent matching.
50566 return true;
50567}
50568
50569// Try to fold:
50570// (or (and (m, y), (pandn m, x)))
50571// into:
50572// (vselect m, x, y)
50573// As a special case, try to fold:
50574// (or (and (m, (sub 0, x)), (pandn m, x)))
50575// into:
50576// (sub (xor X, M), M)
50577static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
50578 const X86Subtarget &Subtarget) {
50579 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
50580
50581 EVT VT = N->getValueType(0);
50582 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
50583 (VT.is256BitVector() && Subtarget.hasInt256())))
50584 return SDValue();
50585
50586 SDValue X, Y, Mask;
50587 if (!matchLogicBlend(N, X, Y, Mask))
50588 return SDValue();
50589
50590 // Validate that X, Y, and Mask are bitcasts, and see through them.
50591 Mask = peekThroughBitcasts(Mask);
50592 X = peekThroughBitcasts(X);
50593 Y = peekThroughBitcasts(Y);
50594
50595 EVT MaskVT = Mask.getValueType();
50596 unsigned EltBits = MaskVT.getScalarSizeInBits();
50597
50598 // TODO: Attempt to handle floating point cases as well?
50599 if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
50600 return SDValue();
50601
50602 SDLoc DL(N);
50603
50604 // Attempt to combine to conditional negate: (sub (xor X, M), M)
50605 if (SDValue Res = combineLogicBlendIntoConditionalNegate(VT, Mask, X, Y, DL,
50606 DAG, Subtarget))
50607 return Res;
50608
50609 // PBLENDVB is only available on SSE 4.1.
50610 if (!Subtarget.hasSSE41())
50611 return SDValue();
50612
50613 // If we have VPTERNLOG we should prefer that since PBLENDVB is multiple uops.
50614 if (Subtarget.hasVLX())
50615 return SDValue();
50616
50617 MVT BlendVT = VT.is256BitVector() ? MVT::v32i8 : MVT::v16i8;
50618
50619 X = DAG.getBitcast(BlendVT, X);
50620 Y = DAG.getBitcast(BlendVT, Y);
50621 Mask = DAG.getBitcast(BlendVT, Mask);
50622 Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
50623 return DAG.getBitcast(VT, Mask);
50624}
50625
50626// Helper function for combineOrCmpEqZeroToCtlzSrl
50627// Transforms:
50628// seteq(cmp x, 0)
50629// into:
50630// srl(ctlz x), log2(bitsize(x))
50631// Input pattern is checked by caller.
50632static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG) {
50633 SDValue Cmp = Op.getOperand(1);
50634 EVT VT = Cmp.getOperand(0).getValueType();
50635 unsigned Log2b = Log2_32(VT.getSizeInBits());
50636 SDLoc dl(Op);
50637 SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
50638 // The result of the shift is true or false, and on X86, the 32-bit
50639 // encoding of shr and lzcnt is more desirable.
50640 SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
50641 SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
50642 DAG.getConstant(Log2b, dl, MVT::i8));
50643 return Scc;
50644}
50645
50646// Try to transform:
50647// zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
50648// into:
50649// srl(or(ctlz(x), ctlz(y)), log2(bitsize(x)))
50650// Will also attempt to match more generic cases, eg:
50651// zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
50652// Only applies if the target supports the FastLZCNT feature.
50653static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
50654 TargetLowering::DAGCombinerInfo &DCI,
50655 const X86Subtarget &Subtarget) {
50656 if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
50657 return SDValue();
50658
50659 auto isORCandidate = [](SDValue N) {
50660 return (N->getOpcode() == ISD::OR && N->hasOneUse());
50661 };
50662
50663 // Check that the zero extend is extending to 32 bits or more. The code
50664 // generated by srl(ctlz) for 16-bit or narrower variants of the pattern would
50665 // require extra instructions to clear the upper bits.
50666 if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
50667 !isORCandidate(N->getOperand(0)))
50668 return SDValue();
50669
50670 // Check the node matches: setcc(eq, cmp 0)
50671 auto isSetCCCandidate = [](SDValue N) {
50672 return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
50673 X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
50674 N->getOperand(1).getOpcode() == X86ISD::CMP &&
50675 isNullConstant(N->getOperand(1).getOperand(1)) &&
50676 N->getOperand(1).getValueType().bitsGE(MVT::i32);
50677 };
50678
50679 SDNode *OR = N->getOperand(0).getNode();
50680 SDValue LHS = OR->getOperand(0);
50681 SDValue RHS = OR->getOperand(1);
50682
50683 // Save nodes matching or(or, setcc(eq, cmp 0)).
50684 SmallVector<SDNode *, 2> ORNodes;
50685 while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
50686 (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
50687 ORNodes.push_back(OR);
50688 OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
50689 LHS = OR->getOperand(0);
50690 RHS = OR->getOperand(1);
50691 }
50692
50693 // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
50694 if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
50695 !isORCandidate(SDValue(OR, 0)))
50696 return SDValue();
50697
50698 // We have an or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern; try to lower it
50699 // to
50700 // or(srl(ctlz),srl(ctlz)).
50701 // The dag combiner can then fold it into:
50702 // srl(or(ctlz, ctlz)).
50703 SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, DAG);
50704 SDValue Ret, NewRHS;
50705 if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG)))
50706 Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, NewLHS, NewRHS);
50707
50708 if (!Ret)
50709 return SDValue();
50710
50711 // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
50712 while (!ORNodes.empty()) {
50713 OR = ORNodes.pop_back_val();
50714 LHS = OR->getOperand(0);
50715 RHS = OR->getOperand(1);
50716 // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
50717 if (RHS->getOpcode() == ISD::OR)
50718 std::swap(LHS, RHS);
50719 NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG);
50720 if (!NewRHS)
50721 return SDValue();
50722 Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, Ret, NewRHS);
50723 }
50724
50725 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
50726}
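// Illustrative example (not part of the original source), assuming FastLZCNT
// and 32-bit x, y:
//   zext (or (seteq x, 0), (seteq y, 0))
// becomes
//   srl (or (ctlz x), (ctlz y)), 5
// because ctlz of an i32 is 32 (bit 5 set) exactly when the value is zero, and
// strictly less than 32 (bit 5 clear) otherwise.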
50727
50728static SDValue foldMaskedMergeImpl(SDValue And0_L, SDValue And0_R,
50729 SDValue And1_L, SDValue And1_R,
50730 const SDLoc &DL, SelectionDAG &DAG) {
50731 if (!isBitwiseNot(And0_L, true) || !And0_L->hasOneUse())
50732 return SDValue();
50733 SDValue NotOp = And0_L->getOperand(0);
50734 if (NotOp == And1_R)
50735 std::swap(And1_R, And1_L);
50736 if (NotOp != And1_L)
50737 return SDValue();
50738
50739 // (~(NotOp) & And0_R) | (NotOp & And1_R)
50740 // --> ((And0_R ^ And1_R) & NotOp) ^ And1_R
50741 EVT VT = And1_L->getValueType(0);
50742 SDValue Freeze_And0_R = DAG.getNode(ISD::FREEZE, SDLoc(), VT, And0_R);
50743 SDValue Xor0 = DAG.getNode(ISD::XOR, DL, VT, And1_R, Freeze_And0_R);
50744 SDValue And = DAG.getNode(ISD::AND, DL, VT, Xor0, NotOp);
50745 SDValue Xor1 = DAG.getNode(ISD::XOR, DL, VT, And, Freeze_And0_R);
50746 return Xor1;
50747}
50748
50749/// Fold "masked merge" expressions like `(m & x) | (~m & y)` into the
50750/// equivalent `((x ^ y) & m) ^ y` pattern.
50751/// This is typically a better representation for targets without a fused
50752/// "and-not" operation. This function is intended to be called from a
50753/// `TargetLowering::PerformDAGCombine` callback on `ISD::OR` nodes.
50754static SDValue foldMaskedMerge(SDNode *Node, SelectionDAG &DAG) {
50755 // Note that masked-merge variants using XOR or ADD expressions are
50756 // normalized to OR by InstCombine so we only check for OR.
50757 assert(Node->getOpcode() == ISD::OR && "Must be called with ISD::OR node");
50758 SDValue N0 = Node->getOperand(0);
50759 if (N0->getOpcode() != ISD::AND || !N0->hasOneUse())
50760 return SDValue();
50761 SDValue N1 = Node->getOperand(1);
50762 if (N1->getOpcode() != ISD::AND || !N1->hasOneUse())
50763 return SDValue();
50764
50765 SDLoc DL(Node);
50766 SDValue N00 = N0->getOperand(0);
50767 SDValue N01 = N0->getOperand(1);
50768 SDValue N10 = N1->getOperand(0);
50769 SDValue N11 = N1->getOperand(1);
50770 if (SDValue Result = foldMaskedMergeImpl(N00, N01, N10, N11, DL, DAG))
50771 return Result;
50772 if (SDValue Result = foldMaskedMergeImpl(N01, N00, N10, N11, DL, DAG))
50773 return Result;
50774 if (SDValue Result = foldMaskedMergeImpl(N10, N11, N00, N01, DL, DAG))
50775 return Result;
50776 if (SDValue Result = foldMaskedMergeImpl(N11, N10, N00, N01, DL, DAG))
50777 return Result;
50778 return SDValue();
50779}
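// The rewrite above relies on the masked-merge identity
//   (m & x) | (~m & y) == ((x ^ y) & m) ^ y
// Standalone sanity check (illustrative only, not part of this file):
//   constexpr uint32_t merge1(uint32_t m, uint32_t x, uint32_t y) {
//     return (m & x) | (~m & y);
//   }
//   constexpr uint32_t merge2(uint32_t m, uint32_t x, uint32_t y) {
//     return ((x ^ y) & m) ^ y;
//   }
//   static_assert(merge1(0xF0F0F0F0u, 0x12345678u, 0x9ABCDEF0u) ==
//                 merge2(0xF0F0F0F0u, 0x12345678u, 0x9ABCDEF0u), "");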
50780
50781/// If this is an add or subtract where one operand is produced by a cmp+setcc,
50782/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
50783/// with CMP+{ADC, SBB}.
50784/// Also try (ADD/SUB)+(AND(SRL,1)) bit extraction pattern with BT+{ADC, SBB}.
50785static SDValue combineAddOrSubToADCOrSBB(bool IsSub, const SDLoc &DL, EVT VT,
50786 SDValue X, SDValue Y,
50787 SelectionDAG &DAG,
50788 bool ZeroSecondOpOnly = false) {
50789 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
50790 return SDValue();
50791
50792 // Look through a one-use zext.
50793 if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse())
50794 Y = Y.getOperand(0);
50795
50796 X86::CondCode CC;
50797 SDValue EFLAGS;
50798 if (Y.getOpcode() == X86ISD::SETCC && Y.hasOneUse()) {
50799 CC = (X86::CondCode)Y.getConstantOperandVal(0);
50800 EFLAGS = Y.getOperand(1);
50801 } else if (Y.getOpcode() == ISD::AND && isOneConstant(Y.getOperand(1)) &&
50802 Y.hasOneUse()) {
50803 EFLAGS = LowerAndToBT(Y, ISD::SETNE, DL, DAG, CC);
50804 }
50805
50806 if (!EFLAGS)
50807 return SDValue();
50808
50809 // If X is -1 or 0, then we have an opportunity to avoid constants required in
50810 // the general case below.
50811 auto *ConstantX = dyn_cast<ConstantSDNode>(X);
50812 if (ConstantX && !ZeroSecondOpOnly) {
50813 if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnes()) ||
50814 (IsSub && CC == X86::COND_B && ConstantX->isZero())) {
50815 // This is a complicated way to get -1 or 0 from the carry flag:
50816 // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
50817 // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
50818 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
50819 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
50820 EFLAGS);
50821 }
50822
50823 if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnes()) ||
50824 (IsSub && CC == X86::COND_A && ConstantX->isZero())) {
50825 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
50826 EFLAGS.getValueType().isInteger() &&
50827 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
50828 // Swap the operands of a SUB, and we have the same pattern as above.
50829 // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
50830 // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
50831 SDValue NewSub = DAG.getNode(
50832 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
50833 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
50834 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
50835 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
50836 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
50837 NewEFLAGS);
50838 }
50839 }
50840 }
50841
50842 if (CC == X86::COND_B) {
50843 // X + SETB Z --> adc X, 0
50844 // X - SETB Z --> sbb X, 0
50845 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
50846 DAG.getVTList(VT, MVT::i32), X,
50847 DAG.getConstant(0, DL, VT), EFLAGS);
50848 }
50849
50850 if (ZeroSecondOpOnly)
50851 return SDValue();
50852
50853 if (CC == X86::COND_A) {
50854 // Try to convert COND_A into COND_B in an attempt to facilitate
50855 // materializing "setb reg".
50856 //
50857 // Do not flip "e > c", where "c" is a constant, because Cmp instruction
50858 // cannot take an immediate as its first operand.
50859 //
50860 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
50861 EFLAGS.getValueType().isInteger() &&
50862 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
50863 SDValue NewSub =
50864 DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
50865 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
50866 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
50867 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
50868 DAG.getVTList(VT, MVT::i32), X,
50869 DAG.getConstant(0, DL, VT), NewEFLAGS);
50870 }
50871 }
50872
50873 if (CC == X86::COND_AE) {
50874 // X + SETAE --> sbb X, -1
50875 // X - SETAE --> adc X, -1
50876 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
50877 DAG.getVTList(VT, MVT::i32), X,
50878 DAG.getConstant(-1, DL, VT), EFLAGS);
50879 }
50880
50881 if (CC == X86::COND_BE) {
50882 // X + SETBE --> sbb X, -1
50883 // X - SETBE --> adc X, -1
50884 // Try to convert COND_BE into COND_AE in an attempt to facilitate
50885 // materializing "setae reg".
50886 //
50887 // Do not flip "e <= c", where "c" is a constant, because Cmp instruction
50888 // cannot take an immediate as its first operand.
50889 //
50890 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
50891 EFLAGS.getValueType().isInteger() &&
50892 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
50893 SDValue NewSub =
50894 DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
50895 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
50896 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
50897 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
50898 DAG.getVTList(VT, MVT::i32), X,
50899 DAG.getConstant(-1, DL, VT), NewEFLAGS);
50900 }
50901 }
50902
50903 if (CC != X86::COND_E && CC != X86::COND_NE)
50904 return SDValue();
50905
50906 if (EFLAGS.getOpcode() != X86ISD::CMP || !EFLAGS.hasOneUse() ||
50907 !X86::isZeroNode(EFLAGS.getOperand(1)) ||
50908 !EFLAGS.getOperand(0).getValueType().isInteger())
50909 return SDValue();
50910
50911 SDValue Z = EFLAGS.getOperand(0);
50912 EVT ZVT = Z.getValueType();
50913
50914 // If X is -1 or 0, then we have an opportunity to avoid constants required in
50915 // the general case below.
50916 if (ConstantX) {
50917 // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
50918 // fake operands:
50919 // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
50920 // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
50921 if ((IsSub && CC == X86::COND_NE && ConstantX->isZero()) ||
50922 (!IsSub && CC == X86::COND_E && ConstantX->isAllOnes())) {
50923 SDValue Zero = DAG.getConstant(0, DL, ZVT);
50924 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
50925 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
50926 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
50927 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
50928 SDValue(Neg.getNode(), 1));
50929 }
50930
50931 // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
50932 // with fake operands:
50933 // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
50934 // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
50935 if ((IsSub && CC == X86::COND_E && ConstantX->isZero()) ||
50936 (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnes())) {
50937 SDValue One = DAG.getConstant(1, DL, ZVT);
50938 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
50939 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
50940 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
50941 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
50942 Cmp1.getValue(1));
50943 }
50944 }
50945
50946 // (cmp Z, 1) sets the carry flag if Z is 0.
50947 SDValue One = DAG.getConstant(1, DL, ZVT);
50948 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
50949 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
50950
50951 // Add the flags type for ADC/SBB nodes.
50952 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
50953
50954 // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
50955 // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
50956 if (CC == X86::COND_NE)
50957 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
50958 DAG.getConstant(-1ULL, DL, VT), Cmp1.getValue(1));
50959
50960 // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
50961 // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
50962 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
50963 DAG.getConstant(0, DL, VT), Cmp1.getValue(1));
50964}
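// Illustrative example (not part of the original source) of the general case
// above, for i32 X:
//   X + (Z != 0)   -->   sbb X, -1, (cmp Z, 1)
// (cmp Z, 1) sets CF exactly when Z == 0, and sbb computes X - (-1) - CF,
// i.e. X + 1 - CF, which equals X plus (Z != 0).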
50965
50966/// If this is an add or subtract where one operand is produced by a cmp+setcc,
50967/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
50968/// with CMP+{ADC, SBB}.
50969static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
50970 bool IsSub = N->getOpcode() == ISD::SUB;
50971 SDValue X = N->getOperand(0);
50972 SDValue Y = N->getOperand(1);
50973 EVT VT = N->getValueType(0);
50974 SDLoc DL(N);
50975
50976 if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, X, Y, DAG))
50977 return ADCOrSBB;
50978
50979 // Commute and try again (negate the result for subtracts).
50980 if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, Y, X, DAG)) {
50981 if (IsSub)
50982 ADCOrSBB =
50983 DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), ADCOrSBB);
50984 return ADCOrSBB;
50985 }
50986
50987 return SDValue();
50988}
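The commuted case relies on nothing more than X - Y == 0 - (Y - X) in wrapping unsigned arithmetic, which is why the subtract result is negated after commuting. A tiny standalone check (plain C++, illustrative only):

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint32_t Xs[] = {0, 3, 0xFFFFFFF0u};
      const uint32_t Ys[] = {1, 3, 250};
      for (uint32_t X : Xs)
        for (uint32_t Y : Ys)
          assert(X - Y == 0u - (Y - X));  // wrapping unsigned arithmetic
      return 0;
    }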
50989
50990static SDValue combineOrXorWithSETCC(SDNode *N, SDValue N0, SDValue N1,
50991 SelectionDAG &DAG) {
50992   assert((N->getOpcode() == ISD::XOR || N->getOpcode() == ISD::OR) &&
50993          "Unexpected opcode");
50994
50995 // Delegate to combineAddOrSubToADCOrSBB if we have:
50996 //
50997 // (xor/or (zero_extend (setcc)) imm)
50998 //
50999 // where imm is odd if and only if we have xor, in which case the XOR/OR are
51000 // equivalent to a SUB/ADD, respectively.
51001 if (N0.getOpcode() == ISD::ZERO_EXTEND &&
51002 N0.getOperand(0).getOpcode() == X86ISD::SETCC && N0.hasOneUse()) {
51003 if (auto *N1C = dyn_cast<ConstantSDNode>(N1)) {
51004 bool IsSub = N->getOpcode() == ISD::XOR;
51005 bool N1COdd = N1C->getZExtValue() & 1;
51006 if (IsSub ? N1COdd : !N1COdd) {
51007 SDLoc DL(N);
51008 EVT VT = N->getValueType(0);
51009 if (SDValue R = combineAddOrSubToADCOrSBB(IsSub, DL, VT, N1, N0, DAG))
51010 return R;
51011 }
51012 }
51013 }
51014
51015 return SDValue();
51016}
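The odd/even test above works because the zero-extended SETCC value is 0 or 1: XOR by an odd immediate can never borrow past bit 0 and so acts as a subtract, while OR by an even immediate can never carry and so acts as an add. A standalone sketch of that identity (plain C++, not LLVM code):

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint32_t OddImms[] = {1, 7, 0x12345};
      const uint32_t EvenImms[] = {0, 8, 0x12344};
      for (uint32_t B = 0; B <= 1; ++B) {      // B models zext(setcc): 0 or 1
        for (uint32_t Imm : OddImms)
          assert((B ^ Imm) == Imm - B);        // xor by an odd imm == sub
        for (uint32_t Imm : EvenImms)
          assert((B | Imm) == Imm + B);        // or by an even imm == add
      }
      return 0;
    }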
51017
51018static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
51019 TargetLowering::DAGCombinerInfo &DCI,
51020 const X86Subtarget &Subtarget) {
51021 SDValue N0 = N->getOperand(0);
51022 SDValue N1 = N->getOperand(1);
51023 EVT VT = N->getValueType(0);
51024 SDLoc dl(N);
51025 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51026
51027 // If this is SSE1 only convert to FOR to avoid scalarization.
51028 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
51029 return DAG.getBitcast(MVT::v4i32,
51030 DAG.getNode(X86ISD::FOR, dl, MVT::v4f32,
51031 DAG.getBitcast(MVT::v4f32, N0),
51032 DAG.getBitcast(MVT::v4f32, N1)));
51033 }
51034
51035 // Match any-of bool scalar reductions into a bitcast/movmsk + cmp.
51036 // TODO: Support multiple SrcOps.
51037 if (VT == MVT::i1) {
51038 SmallVector<SDValue, 2> SrcOps;
51039 SmallVector<APInt, 2> SrcPartials;
51040 if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps, &SrcPartials) &&
51041 SrcOps.size() == 1) {
51042 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
51043 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
51044 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
51045 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
51046 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
51047 if (Mask) {
51048       assert(SrcPartials[0].getBitWidth() == NumElts &&
51049              "Unexpected partial reduction mask");
51050 SDValue ZeroBits = DAG.getConstant(0, dl, MaskVT);
51051 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
51052 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
51053 return DAG.getSetCC(dl, MVT::i1, Mask, ZeroBits, ISD::SETNE);
51054 }
51055 }
51056 }
51057
51058 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
51059 return R;
51060
51061 if (SDValue R = combineBitOpWithShift(N, DAG))
51062 return R;
51063
51064 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
51065 return FPLogic;
51066
51067 if (DCI.isBeforeLegalizeOps())
51068 return SDValue();
51069
51070 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
51071 return R;
51072
51073 if (SDValue R = canonicalizeBitSelect(N, DAG, Subtarget))
51074 return R;
51075
51076 if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
51077 return R;
51078
51079 // (0 - SetCC) | C -> (zext (not SetCC)) * (C + 1) - 1 if we can get a LEA out of it.
51080 if ((VT == MVT::i32 || VT == MVT::i64) &&
51081 N0.getOpcode() == ISD::SUB && N0.hasOneUse() &&
51082 isNullConstant(N0.getOperand(0))) {
51083 SDValue Cond = N0.getOperand(1);
51084 if (Cond.getOpcode() == ISD::ZERO_EXTEND && Cond.hasOneUse())
51085 Cond = Cond.getOperand(0);
51086
51087 if (Cond.getOpcode() == X86ISD::SETCC && Cond.hasOneUse()) {
51088 if (auto *CN = dyn_cast<ConstantSDNode>(N1)) {
51089 uint64_t Val = CN->getZExtValue();
51090 if (Val == 1 || Val == 2 || Val == 3 || Val == 4 || Val == 7 || Val == 8) {
51091 X86::CondCode CCode = (X86::CondCode)Cond.getConstantOperandVal(0);
51092 CCode = X86::GetOppositeBranchCondition(CCode);
51093 SDValue NotCond = getSETCC(CCode, Cond.getOperand(1), SDLoc(Cond), DAG);
51094
51095 SDValue R = DAG.getZExtOrTrunc(NotCond, dl, VT);
51096 R = DAG.getNode(ISD::MUL, dl, VT, R, DAG.getConstant(Val + 1, dl, VT));
51097 R = DAG.getNode(ISD::SUB, dl, VT, R, DAG.getConstant(1, dl, VT));
51098 return R;
51099 }
51100 }
51101 }
51102 }
51103
51104 // Combine OR(X,KSHIFTL(Y,Elts/2)) -> CONCAT_VECTORS(X,Y) == KUNPCK(X,Y).
51105 // Combine OR(KSHIFTL(X,Elts/2),Y) -> CONCAT_VECTORS(Y,X) == KUNPCK(Y,X).
51106 // iff the upper elements of the non-shifted arg are zero.
51107   // KUNPCK requires 16+ bool vector elements.
51108 if (N0.getOpcode() == X86ISD::KSHIFTL || N1.getOpcode() == X86ISD::KSHIFTL) {
51109 unsigned NumElts = VT.getVectorNumElements();
51110 unsigned HalfElts = NumElts / 2;
51111 APInt UpperElts = APInt::getHighBitsSet(NumElts, HalfElts);
51112 if (NumElts >= 16 && N1.getOpcode() == X86ISD::KSHIFTL &&
51113 N1.getConstantOperandAPInt(1) == HalfElts &&
51114 DAG.MaskedVectorIsZero(N0, UpperElts)) {
51115 return DAG.getNode(
51116 ISD::CONCAT_VECTORS, dl, VT,
51117 extractSubVector(N0, 0, DAG, dl, HalfElts),
51118 extractSubVector(N1.getOperand(0), 0, DAG, dl, HalfElts));
51119 }
51120 if (NumElts >= 16 && N0.getOpcode() == X86ISD::KSHIFTL &&
51121 N0.getConstantOperandAPInt(1) == HalfElts &&
51122 DAG.MaskedVectorIsZero(N1, UpperElts)) {
51123 return DAG.getNode(
51124 ISD::CONCAT_VECTORS, dl, VT,
51125 extractSubVector(N1, 0, DAG, dl, HalfElts),
51126 extractSubVector(N0.getOperand(0), 0, DAG, dl, HalfElts));
51127 }
51128 }
51129
51130 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
51131 // Attempt to recursively combine an OR of shuffles.
51132 SDValue Op(N, 0);
51133 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
51134 return Res;
51135
51136 // If either operand is a constant mask, then only the elements that aren't
51137 // allones are actually demanded by the other operand.
51138 auto SimplifyUndemandedElts = [&](SDValue Op, SDValue OtherOp) {
51139 APInt UndefElts;
51140 SmallVector<APInt> EltBits;
51141 int NumElts = VT.getVectorNumElements();
51142 int EltSizeInBits = VT.getScalarSizeInBits();
51143 if (!getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts, EltBits))
51144 return false;
51145
51146 APInt DemandedElts = APInt::getZero(NumElts);
51147 for (int I = 0; I != NumElts; ++I)
51148 if (!EltBits[I].isAllOnes())
51149 DemandedElts.setBit(I);
51150
51151 return TLI.SimplifyDemandedVectorElts(OtherOp, DemandedElts, DCI);
51152 };
51153 if (SimplifyUndemandedElts(N0, N1) || SimplifyUndemandedElts(N1, N0)) {
51154 if (N->getOpcode() != ISD::DELETED_NODE)
51155 DCI.AddToWorklist(N);
51156 return SDValue(N, 0);
51157 }
51158 }
51159
51160 // We should fold "masked merge" patterns when `andn` is not available.
51161 if (!Subtarget.hasBMI() && VT.isScalarInteger() && VT != MVT::i1)
51162 if (SDValue R = foldMaskedMerge(N, DAG))
51163 return R;
51164
51165 if (SDValue R = combineOrXorWithSETCC(N, N0, N1, DAG))
51166 return R;
51167
51168 return SDValue();
51169}
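The LEA-oriented rewrite inside combineOr above is a pure value identity: with b the zero-extended SETCC result (0 or 1), (0 - b) | C equals (!b) * (C + 1) - 1. A minimal scalar check (plain C++, illustrative only; the constants tested match the Val list accepted above):

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint32_t Cs[] = {1, 2, 3, 4, 7, 8};  // the constants accepted above
      for (uint32_t B = 0; B <= 1; ++B)          // B models the zero-extended SETCC
        for (uint32_t C : Cs)
          assert(((0u - B) | C) == (uint32_t)(B == 0) * (C + 1) - 1);
      return 0;
    }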
51170
51171/// Try to turn tests against the signbit in the form of:
51172/// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
51173/// into:
51174/// SETGT(X, -1)
51175static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
51176 // This is only worth doing if the output type is i8 or i1.
51177 EVT ResultType = N->getValueType(0);
51178 if (ResultType != MVT::i8 && ResultType != MVT::i1)
51179 return SDValue();
51180
51181 SDValue N0 = N->getOperand(0);
51182 SDValue N1 = N->getOperand(1);
51183
51184 // We should be performing an xor against a truncated shift.
51185 if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
51186 return SDValue();
51187
51188 // Make sure we are performing an xor against one.
51189 if (!isOneConstant(N1))
51190 return SDValue();
51191
51192 // SetCC on x86 zero extends so only act on this if it's a logical shift.
51193 SDValue Shift = N0.getOperand(0);
51194 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
51195 return SDValue();
51196
51197 // Make sure we are truncating from one of i16, i32 or i64.
51198 EVT ShiftTy = Shift.getValueType();
51199 if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
51200 return SDValue();
51201
51202 // Make sure the shift amount extracts the sign bit.
51203 if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
51204 Shift.getConstantOperandAPInt(1) != (ShiftTy.getSizeInBits() - 1))
51205 return SDValue();
51206
51207 // Create a greater-than comparison against -1.
51208   // N.B. Using SETGE against 0 works but we want a canonical-looking
51209   // comparison; using SETGT matches up with what TranslateX86CC produces.
51210 SDLoc DL(N);
51211 SDValue ShiftOp = Shift.getOperand(0);
51212 EVT ShiftOpTy = ShiftOp.getValueType();
51213 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51214 EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
51215 *DAG.getContext(), ResultType);
51216 SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
51217 DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
51218 if (SetCCResultType != ResultType)
51219 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
51220 return Cond;
51221}
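For the i32 case, the fold above is the scalar identity that XOR-ing the logically shifted-out sign bit with 1 yields exactly the boolean value of X > -1. A small standalone check (plain C++, not LLVM code):

    #include <cassert>
    #include <cstdint>

    int main() {
      const int32_t Xs[] = {INT32_MIN, -5, -1, 0, 1, INT32_MAX};
      for (int32_t X : Xs) {
        uint8_t Folded = (uint8_t)(((uint32_t)X >> 31) ^ 1u);  // xor(trunc(srl(X, 31)), 1)
        assert(Folded == (uint8_t)(X > -1));                   // setgt X, -1
      }
      return 0;
    }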
51222
51223/// Turn vector tests of the signbit in the form of:
51224/// xor (sra X, elt_size(X)-1), -1
51225/// into:
51226/// pcmpgt X, -1
51227///
51228/// This should be called before type legalization because the pattern may not
51229/// persist after that.
51230static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
51231 const X86Subtarget &Subtarget) {
51232 EVT VT = N->getValueType(0);
51233 if (!VT.isSimple())
51234 return SDValue();
51235
51236 switch (VT.getSimpleVT().SimpleTy) {
51237 default: return SDValue();
51238 case MVT::v16i8:
51239 case MVT::v8i16:
51240 case MVT::v4i32:
51241 case MVT::v2i64: if (!Subtarget.hasSSE2()) return SDValue(); break;
51242 case MVT::v32i8:
51243 case MVT::v16i16:
51244 case MVT::v8i32:
51245 case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
51246 }
51247
51248 // There must be a shift right algebraic before the xor, and the xor must be a
51249 // 'not' operation.
51250 SDValue Shift = N->getOperand(0);
51251 SDValue Ones = N->getOperand(1);
51252 if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
51253 !ISD::isBuildVectorAllOnes(Ones.getNode()))
51254 return SDValue();
51255
51256 // The shift should be smearing the sign bit across each vector element.
51257 auto *ShiftAmt =
51258 isConstOrConstSplat(Shift.getOperand(1), /*AllowUndefs*/ true);
51259 if (!ShiftAmt ||
51260 ShiftAmt->getAPIntValue() != (Shift.getScalarValueSizeInBits() - 1))
51261 return SDValue();
51262
51263 // Create a greater-than comparison against -1. We don't use the more obvious
51264 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
51265 return DAG.getSetCC(SDLoc(N), VT, Shift.getOperand(0), Ones, ISD::SETGT);
51266}
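Per lane, the vector fold above is the same sign-bit trick in mask form: NOT of the sign-smearing arithmetic shift equals the all-ones/all-zeros lane that PCMPGT against -1 produces. A scalar model of one lane (plain C++, illustrative only; it assumes arithmetic right shift of negative values, as x86 compilers provide):

    #include <cassert>
    #include <cstdint>

    int main() {
      const int32_t Xs[] = {INT32_MIN, -2, -1, 0, 3, INT32_MAX};
      for (int32_t X : Xs) {
        int32_t NotSra = ~(X >> 31);        // xor (sra X, 31), -1
        int32_t Lane = (X > -1) ? -1 : 0;   // one pcmpgt lane against -1
        assert(NotSra == Lane);
      }
      return 0;
    }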
51267
51268/// Detect patterns of truncation with unsigned saturation:
51269///
51270/// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
51271/// Return the source value x to be truncated or SDValue() if the pattern was
51272/// not matched.
51273///
51274/// 2. (truncate (smin (smax (x, C1), C2)) to dest_type),
51275/// where C1 >= 0 and C2 is unsigned max of destination type.
51276///
51277/// (truncate (smax (smin (x, C2), C1)) to dest_type)
51278/// where C1 >= 0, C2 is unsigned max of destination type and C1 <= C2.
51279///
51280/// These two patterns are equivalent to:
51281/// (truncate (umin (smax(x, C1), unsigned_max_of_dest_type)) to dest_type)
51282/// So return the smax(x, C1) value to be truncated or SDValue() if the
51283/// pattern was not matched.
51284static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
51285 const SDLoc &DL) {
51286 EVT InVT = In.getValueType();
51287
51288 // Saturation with truncation. We truncate from InVT to VT.
51289   assert(InVT.getScalarSizeInBits() > VT.getScalarSizeInBits() &&
51290          "Unexpected types for truncate operation");
51291
51292 // Match min/max and return limit value as a parameter.
51293 auto MatchMinMax = [](SDValue V, unsigned Opcode, APInt &Limit) -> SDValue {
51294 if (V.getOpcode() == Opcode &&
51295 ISD::isConstantSplatVector(V.getOperand(1).getNode(), Limit))
51296 return V.getOperand(0);
51297 return SDValue();
51298 };
51299
51300 APInt C1, C2;
51301 if (SDValue UMin = MatchMinMax(In, ISD::UMIN, C2))
51302     // C2 should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according
51303     // to the element size of the destination type.
51304 if (C2.isMask(VT.getScalarSizeInBits()))
51305 return UMin;
51306
51307 if (SDValue SMin = MatchMinMax(In, ISD::SMIN, C2))
51308 if (MatchMinMax(SMin, ISD::SMAX, C1))
51309 if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()))
51310 return SMin;
51311
51312 if (SDValue SMax = MatchMinMax(In, ISD::SMAX, C1))
51313 if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, C2))
51314 if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()) &&
51315 C2.uge(C1)) {
51316 return DAG.getNode(ISD::SMAX, DL, InVT, SMin, In.getOperand(1));
51317 }
51318
51319 return SDValue();
51320}
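Pattern 1 above is easiest to see on scalars: umin against the destination's unsigned max followed by a plain truncate behaves exactly like an unsigned-saturating truncate, whereas a bare truncate would wrap. A standalone sketch for a u32 -> u8 truncation (plain C++, not LLVM code):

    #include <cassert>
    #include <cstdint>

    static uint8_t USatTrunc(uint32_t X) { return X > 0xFF ? 0xFF : (uint8_t)X; }

    int main() {
      const uint32_t Tests[] = {0, 7, 255, 256, 100000};
      for (uint32_t X : Tests) {
        uint32_t UMin = X < 0xFF ? X : 0xFF;    // umin(x, unsigned_max_of_dest_type)
        assert((uint8_t)UMin == USatTrunc(X));  // trunc(umin(...)) == saturating trunc
      }
      assert((uint8_t)256u == 0);               // without the umin, 256 would wrap to 0
      return 0;
    }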
51321
51322/// Detect patterns of truncation with signed saturation:
51323/// (truncate (smin ((smax (x, signed_min_of_dest_type)),
51324/// signed_max_of_dest_type)) to dest_type)
51325/// or:
51326/// (truncate (smax ((smin (x, signed_max_of_dest_type)),
51327/// signed_min_of_dest_type)) to dest_type).
51328/// With MatchPackUS, the smax/smin range is [0, unsigned_max_of_dest_type].
51329/// Return the source value to be truncated or SDValue() if the pattern was not
51330/// matched.
51331static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {
51332 unsigned NumDstBits = VT.getScalarSizeInBits();
51333 unsigned NumSrcBits = In.getScalarValueSizeInBits();
51334   assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation");
51335
51336 auto MatchMinMax = [](SDValue V, unsigned Opcode,
51337 const APInt &Limit) -> SDValue {
51338 APInt C;
51339 if (V.getOpcode() == Opcode &&
51340 ISD::isConstantSplatVector(V.getOperand(1).getNode(), C) && C == Limit)
51341 return V.getOperand(0);
51342 return SDValue();
51343 };
51344
51345 APInt SignedMax, SignedMin;
51346 if (MatchPackUS) {
51347 SignedMax = APInt::getAllOnes(NumDstBits).zext(NumSrcBits);
51348 SignedMin = APInt(NumSrcBits, 0);
51349 } else {
51350 SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);
51351 SignedMin = APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits);
51352 }
51353
51354 if (SDValue SMin = MatchMinMax(In, ISD::SMIN, SignedMax))
51355 if (SDValue SMax = MatchMinMax(SMin, ISD::SMAX, SignedMin))
51356 return SMax;
51357
51358 if (SDValue SMax = MatchMinMax(In, ISD::SMAX, SignedMin))
51359 if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, SignedMax))
51360 return SMin;
51361
51362 return SDValue();
51363}
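The same idea holds for the signed and PACKUS-style clamps matched here: smin/smax against the destination's limits followed by a plain truncate equals a saturating truncate. An illustrative scalar check for an i32 -> 8-bit truncation (plain C++, not LLVM code):

    #include <cassert>
    #include <cstdint>

    // Reference signed- and PACKUS-style saturating i32 -> 8-bit conversions.
    static int8_t  SSatRef(int32_t X) { return (int8_t)(X < -128 ? -128 : X > 127 ? 127 : X); }
    static uint8_t USatRef(int32_t X) { return (uint8_t)(X < 0 ? 0 : X > 255 ? 255 : X); }

    static int32_t SMin(int32_t A, int32_t B) { return A < B ? A : B; }
    static int32_t SMax(int32_t A, int32_t B) { return A > B ? A : B; }

    int main() {
      const int32_t Tests[] = {-100000, -129, -128, -1, 0, 1, 127, 128, 255, 256, 100000};
      for (int32_t X : Tests) {
        // smin(smax(x, INT8_MIN), INT8_MAX) + plain truncate == signed saturating trunc.
        assert((int8_t)SMin(SMax(X, -128), 127) == SSatRef(X));
        // MatchPackUS form: the clamp range becomes [0, UINT8_MAX].
        assert((uint8_t)SMin(SMax(X, 0), 255) == USatRef(X));
      }
      return 0;
    }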
51364
51365static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
51366 SelectionDAG &DAG,
51367 const X86Subtarget &Subtarget) {
51368 if (!Subtarget.hasSSE2() || !VT.isVector())
51369 return SDValue();
51370
51371 EVT SVT = VT.getVectorElementType();
51372 EVT InVT = In.getValueType();
51373 EVT InSVT = InVT.getVectorElementType();
51374
51375   // If we're clamping a signed 32-bit vector to 0-255 and the 32-bit vector is
51376   // split across two registers, we can use a packusdw+perm to clamp to 0-65535
51377   // and concatenate at the same time. Then we can use a final vpmovuswb to
51378   // clip to 0-255.
51379 if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
51380 InVT == MVT::v16i32 && VT == MVT::v16i8) {
51381 if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
51382 // Emit a VPACKUSDW+VPERMQ followed by a VPMOVUSWB.
51383 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKUS, MVT::v16i16, USatVal,
51384 DL, DAG, Subtarget);
51385       assert(Mid && "Failed to pack!");
51386 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Mid);
51387 }
51388 }
51389
51390 // vXi32 truncate instructions are available with AVX512F.
51391 // vXi16 truncate instructions are only available with AVX512BW.
51392 // For 256-bit or smaller vectors, we require VLX.
51393 // FIXME: We could widen truncates to 512 to remove the VLX restriction.
51394   // If the result type is 256 bits or larger and 512-bit registers are
51395   // disabled, we should go ahead and use the pack instructions if possible.
51396 bool PreferAVX512 = ((Subtarget.hasAVX512() && InSVT == MVT::i32) ||
51397 (Subtarget.hasBWI() && InSVT == MVT::i16)) &&
51398 (InVT.getSizeInBits() > 128) &&
51399 (Subtarget.hasVLX() || InVT.getSizeInBits() > 256) &&
51400 !(!Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256);
51401
51402 if (isPowerOf2_32(VT.getVectorNumElements()) && !PreferAVX512 &&
51403 VT.getSizeInBits() >= 64 &&
51404 (SVT == MVT::i8 || SVT == MVT::i16) &&
51405 (InSVT == MVT::i16 || InSVT == MVT::i32)) {
51406 if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
51407 // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
51408       // Only do this when the result is at least 64 bits or we'll be leaving
51409 // dangling PACKSSDW nodes.
51410 if (SVT == MVT::i8 && InSVT == MVT::i32) {
51411 EVT MidVT = VT.changeVectorElementType(MVT::i16);
51412 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
51413 DAG, Subtarget);
51414         assert(Mid && "Failed to pack!");
51415 SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG,
51416 Subtarget);
51417         assert(V && "Failed to pack!");
51418 return V;
51419 } else if (SVT == MVT::i8 || Subtarget.hasSSE41())
51420 return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
51421 Subtarget);
51422 }
51423 if (SDValue SSatVal = detectSSatPattern(In, VT))
51424 return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
51425 Subtarget);
51426 }
51427
51428 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51429 if (TLI.isTypeLegal(InVT) && InVT.isVector() && SVT != MVT::i1 &&
51430 Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI()) &&
51431 (SVT == MVT::i32 || SVT == MVT::i16 || SVT == MVT::i8)) {
51432 unsigned TruncOpc = 0;
51433 SDValue SatVal;
51434 if (SDValue SSatVal = detectSSatPattern(In, VT)) {
51435 SatVal = SSatVal;
51436 TruncOpc = X86ISD::VTRUNCS;
51437 } else if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL)) {
51438 SatVal = USatVal;
51439 TruncOpc = X86ISD::VTRUNCUS;
51440 }
51441 if (SatVal) {
51442 unsigned ResElts = VT.getVectorNumElements();
51443 // If the input type is less than 512 bits and we don't have VLX, we need
51444 // to widen to 512 bits.
51445 if (!Subtarget.hasVLX() && !InVT.is512BitVector()) {
51446 unsigned NumConcats = 512 / InVT.getSizeInBits();
51447 ResElts *= NumConcats;
51448 SmallVector<SDValue, 4> ConcatOps(NumConcats, DAG.getUNDEF(InVT));
51449 ConcatOps[0] = SatVal;
51450 InVT = EVT::getVectorVT(*DAG.getContext(), InSVT,
51451 NumConcats * InVT.getVectorNumElements());
51452 SatVal = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, ConcatOps);
51453 }
51454       // Widen the result if it's narrower than 128 bits.
51455 if (ResElts * SVT.getSizeInBits() < 128)
51456 ResElts = 128 / SVT.getSizeInBits();
51457 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), SVT, ResElts);
51458 SDValue Res = DAG.getNode(TruncOpc, DL, TruncVT, SatVal);
51459 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
51460 DAG.getIntPtrConstant(0, DL));
51461 }
51462 }
51463
51464 return SDValue();
51465}
51466
51467/// This function detects the AVG pattern between vectors of unsigned i8/i16,
51468 /// which is c = (a + b + 1) / 2, and replaces this operation with the efficient
51469/// ISD::AVGCEILU (AVG) instruction.
51470static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
51471 const X86Subtarget &Subtarget,
51472 const SDLoc &DL) {
51473 if (!VT.isVector())
51474 return SDValue();
51475 EVT InVT = In.getValueType();
51476 unsigned NumElems = VT.getVectorNumElements();
51477
51478 EVT ScalarVT = VT.getVectorElementType();
51479 if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) && NumElems >= 2))
51480 return SDValue();
51481
51482   // InScalarVT is the intermediate type in the AVG pattern and it should be
51483   // wider than the original input type (i8/i16).
51484 EVT InScalarVT = InVT.getVectorElementType();
51485 if (InScalarVT.getFixedSizeInBits() <= ScalarVT.getFixedSizeInBits())
51486 return SDValue();
51487
51488 if (!Subtarget.hasSSE2())
51489 return SDValue();
51490
51491 // Detect the following pattern:
51492 //
51493 // %1 = zext <N x i8> %a to <N x i32>
51494 // %2 = zext <N x i8> %b to <N x i32>
51495 // %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
51496 // %4 = add nuw nsw <N x i32> %3, %2
51497   // %5 = lshr <N x i32> %4, <i32 1 x N>
51498 // %6 = trunc <N x i32> %5 to <N x i8>
51499 //
51500 // In AVX512, the last instruction can also be a trunc store.
51501 if (In.getOpcode() != ISD::SRL)
51502 return SDValue();
51503
51504 // A lambda checking the given SDValue is a constant vector and each element
51505 // is in the range [Min, Max].
51506 auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
51507 return ISD::matchUnaryPredicate(V, [Min, Max](ConstantSDNode *C) {
51508 return !(C->getAPIntValue().ult(Min) || C->getAPIntValue().ugt(Max));
51509 });
51510 };
51511
51512 auto IsZExtLike = [DAG = &DAG, ScalarVT](SDValue V) {
51513 unsigned MaxActiveBits = DAG->computeKnownBits(V).countMaxActiveBits();
51514 return MaxActiveBits <= ScalarVT.getSizeInBits();
51515 };
51516
51517 // Check if each element of the vector is right-shifted by one.
51518 SDValue LHS = In.getOperand(0);
51519 SDValue RHS = In.getOperand(1);
51520 if (!IsConstVectorInRange(RHS, 1, 1))
51521 return SDValue();
51522 if (LHS.getOpcode() != ISD::ADD)
51523 return SDValue();
51524
51525 // Detect a pattern of a + b + 1 where the order doesn't matter.
51526 SDValue Operands[3];
51527 Operands[0] = LHS.getOperand(0);
51528 Operands[1] = LHS.getOperand(1);
51529
51530 auto AVGBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
51531 ArrayRef<SDValue> Ops) {
51532 return DAG.getNode(ISD::AVGCEILU, DL, Ops[0].getValueType(), Ops);
51533 };
51534
51535 auto AVGSplitter = [&](std::array<SDValue, 2> Ops) {
51536 for (SDValue &Op : Ops)
51537 if (Op.getValueType() != VT)
51538 Op = DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
51539 // Pad to a power-of-2 vector, split+apply and extract the original vector.
51540 unsigned NumElemsPow2 = PowerOf2Ceil(NumElems);
51541 EVT Pow2VT = EVT::getVectorVT(*DAG.getContext(), ScalarVT, NumElemsPow2);
51542 if (NumElemsPow2 != NumElems) {
51543 for (SDValue &Op : Ops) {
51544 SmallVector<SDValue, 32> EltsOfOp(NumElemsPow2, DAG.getUNDEF(ScalarVT));
51545 for (unsigned i = 0; i != NumElems; ++i) {
51546 SDValue Idx = DAG.getIntPtrConstant(i, DL);
51547 EltsOfOp[i] =
51548 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Op, Idx);
51549 }
51550 Op = DAG.getBuildVector(Pow2VT, DL, EltsOfOp);
51551 }
51552 }
51553 SDValue Res = SplitOpsAndApply(DAG, Subtarget, DL, Pow2VT, Ops, AVGBuilder);
51554 if (NumElemsPow2 == NumElems)
51555 return Res;
51556 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
51557 DAG.getIntPtrConstant(0, DL));
51558 };
51559
51560 // Take care of the case when one of the operands is a constant vector whose
51561 // element is in the range [1, 256].
51562 if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
51563 IsZExtLike(Operands[0])) {
51564 // The pattern is detected. Subtract one from the constant vector, then
51565     // demote it and emit the ISD::AVGCEILU node.
51566 SDValue VecOnes = DAG.getConstant(1, DL, InVT);
51567 Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
51568 return AVGSplitter({Operands[0], Operands[1]});
51569 }
51570
51571 // Matches 'add like' patterns: add(Op0,Op1) + zext(or(Op0,Op1)).
51572   // Match the or case only if it's 'add-like', i.e. it can be replaced by an add.
51573 auto FindAddLike = [&](SDValue V, SDValue &Op0, SDValue &Op1) {
51574 if (ISD::ADD == V.getOpcode()) {
51575 Op0 = V.getOperand(0);
51576 Op1 = V.getOperand(1);
51577 return true;
51578 }
51579 if (ISD::ZERO_EXTEND != V.getOpcode())
51580 return false;
51581 V = V.getOperand(0);
51582 if (V.getValueType() != VT || ISD::OR != V.getOpcode() ||
51583 !DAG.haveNoCommonBitsSet(V.getOperand(0), V.getOperand(1)))
51584 return false;
51585 Op0 = V.getOperand(0);
51586 Op1 = V.getOperand(1);
51587 return true;
51588 };
51589
51590 SDValue Op0, Op1;
51591 if (FindAddLike(Operands[0], Op0, Op1))
51592 std::swap(Operands[0], Operands[1]);
51593 else if (!FindAddLike(Operands[1], Op0, Op1))
51594 return SDValue();
51595 Operands[2] = Op0;
51596 Operands[1] = Op1;
51597
51598 // Now we have three operands of two additions. Check that one of them is a
51599 // constant vector with ones, and the other two can be promoted from i8/i16.
51600 for (SDValue &Op : Operands) {
51601 if (!IsConstVectorInRange(Op, 1, 1))
51602 continue;
51603 std::swap(Op, Operands[2]);
51604
51605 // Check if Operands[0] and Operands[1] are results of type promotion.
51606 for (int j = 0; j < 2; ++j)
51607 if (Operands[j].getValueType() != VT)
51608 if (!IsZExtLike(Operands[j]))
51609 return SDValue();
51610
51611     // The pattern is detected, emit the ISD::AVGCEILU node(s).
51612 return AVGSplitter({Operands[0], Operands[1]});
51613 }
51614
51615 return SDValue();
51616}
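The pattern this function matches is just the rounded-up average computed in a wider type so the "+ 1" cannot overflow. An exhaustive scalar check for the i8 case (plain C++, illustrative only):

    #include <cassert>
    #include <cstdint>

    int main() {
      for (uint32_t A = 0; A <= 255; ++A)
        for (uint32_t B = 0; B <= 255; ++B) {
          uint8_t Pattern = (uint8_t)((A + B + 1) >> 1);            // zext, add, add 1, lshr, trunc
          uint8_t AvgCeil = (uint8_t)((A + B) / 2 + ((A + B) % 2)); // ceil((a + b) / 2)
          assert(Pattern == AvgCeil);
        }
      return 0;
    }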
51617
51618static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
51619 TargetLowering::DAGCombinerInfo &DCI,
51620 const X86Subtarget &Subtarget) {
51621 LoadSDNode *Ld = cast<LoadSDNode>(N);
51622 EVT RegVT = Ld->getValueType(0);
51623 EVT MemVT = Ld->getMemoryVT();
51624 SDLoc dl(Ld);
51625 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51626
51627 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
51628 // into two 16-byte operations. Also split non-temporal aligned loads on
51629 // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
51630 ISD::LoadExtType Ext = Ld->getExtensionType();
51631 unsigned Fast;
51632 if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
51633 Ext == ISD::NON_EXTLOAD &&
51634 ((Ld->isNonTemporal() && !Subtarget.hasInt256() &&
51635 Ld->getAlign() >= Align(16)) ||
51636 (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
51637 *Ld->getMemOperand(), &Fast) &&
51638 !Fast))) {
51639 unsigned NumElems = RegVT.getVectorNumElements();
51640 if (NumElems < 2)
51641 return SDValue();
51642
51643 unsigned HalfOffset = 16;
51644 SDValue Ptr1 = Ld->getBasePtr();
51645 SDValue Ptr2 =
51646 DAG.getMemBasePlusOffset(Ptr1, TypeSize::Fixed(HalfOffset), dl);
51647 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
51648 NumElems / 2);
51649 SDValue Load1 =
51650 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(),
51651 Ld->getOriginalAlign(),
51652 Ld->getMemOperand()->getFlags());
51653 SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2,
51654 Ld->getPointerInfo().getWithOffset(HalfOffset),
51655 Ld->getOriginalAlign(),
51656 Ld->getMemOperand()->getFlags());
51657 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
51658 Load1.getValue(1), Load2.getValue(1));
51659
51660 SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
51661 return DCI.CombineTo(N, NewVec, TF, true);
51662 }
51663
51664 // Bool vector load - attempt to cast to an integer, as we have good
51665 // (vXiY *ext(vXi1 bitcast(iX))) handling.
51666 if (Ext == ISD::NON_EXTLOAD && !Subtarget.hasAVX512() && RegVT.isVector() &&
51667 RegVT.getScalarType() == MVT::i1 && DCI.isBeforeLegalize()) {
51668 unsigned NumElts = RegVT.getVectorNumElements();
51669 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
51670 if (TLI.isTypeLegal(IntVT)) {
51671 SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(),
51672 Ld->getPointerInfo(),
51673 Ld->getOriginalAlign(),
51674 Ld->getMemOperand()->getFlags());
51675 SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad);
51676 return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true);
51677 }
51678 }
51679
51680 // If we also broadcast this as a subvector to a wider type, then just extract
51681 // the lowest subvector.
51682 if (Ext == ISD::NON_EXTLOAD && Subtarget.hasAVX() && Ld->isSimple() &&
51683 (RegVT.is128BitVector() || RegVT.is256BitVector())) {
51684 SDValue Ptr = Ld->getBasePtr();
51685 SDValue Chain = Ld->getChain();
51686 for (SDNode *User : Ptr->uses()) {
51687 if (User != N && User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
51688 cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
51689 cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
51690 cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
51691 MemVT.getSizeInBits() &&
51692 !User->hasAnyUseOfValue(1) &&
51693 User->getValueSizeInBits(0).getFixedValue() >
51694 RegVT.getFixedSizeInBits()) {
51695 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
51696 RegVT.getSizeInBits());
51697 Extract = DAG.getBitcast(RegVT, Extract);
51698 return DCI.CombineTo(N, Extract, SDValue(User, 1));
51699 }
51700 }
51701 }
51702
51703 // Cast ptr32 and ptr64 pointers to the default address space before a load.
51704 unsigned AddrSpace = Ld->getAddressSpace();
51705 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
51706 AddrSpace == X86AS::PTR32_UPTR) {
51707 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
51708 if (PtrVT != Ld->getBasePtr().getSimpleValueType()) {
51709 SDValue Cast =
51710 DAG.getAddrSpaceCast(dl, PtrVT, Ld->getBasePtr(), AddrSpace, 0);
51711 return DAG.getLoad(RegVT, dl, Ld->getChain(), Cast, Ld->getPointerInfo(),
51712 Ld->getOriginalAlign(),
51713 Ld->getMemOperand()->getFlags());
51714 }
51715 }
51716
51717 return SDValue();
51718}
51719
51720/// If V is a build vector of boolean constants and exactly one of those
51721/// constants is true, return the operand index of that true element.
51722/// Otherwise, return -1.
51723static int getOneTrueElt(SDValue V) {
51724 // This needs to be a build vector of booleans.
51725 // TODO: Checking for the i1 type matches the IR definition for the mask,
51726 // but the mask check could be loosened to i8 or other types. That might
51727 // also require checking more than 'allOnesValue'; eg, the x86 HW
51728 // instructions only require that the MSB is set for each mask element.
51729 // The ISD::MSTORE comments/definition do not specify how the mask operand
51730 // is formatted.
51731 auto *BV = dyn_cast<BuildVectorSDNode>(V);
51732 if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
51733 return -1;
51734
51735 int TrueIndex = -1;
51736 unsigned NumElts = BV->getValueType(0).getVectorNumElements();
51737 for (unsigned i = 0; i < NumElts; ++i) {
51738 const SDValue &Op = BV->getOperand(i);
51739 if (Op.isUndef())
51740 continue;
51741 auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
51742 if (!ConstNode)
51743 return -1;
51744 if (ConstNode->getAPIntValue().countr_one() >= 1) {
51745 // If we already found a one, this is too many.
51746 if (TrueIndex >= 0)
51747 return -1;
51748 TrueIndex = i;
51749 }
51750 }
51751 return TrueIndex;
51752}
51753
51754/// Given a masked memory load/store operation, return true if it has one mask
51755/// bit set. If it has one mask bit set, then also return the memory address of
51756/// the scalar element to load/store, the vector index to insert/extract that
51757/// scalar element, and the alignment for the scalar memory access.
51758static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
51759 SelectionDAG &DAG, SDValue &Addr,
51760 SDValue &Index, Align &Alignment,
51761 unsigned &Offset) {
51762 int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
51763 if (TrueMaskElt < 0)
51764 return false;
51765
51766 // Get the address of the one scalar element that is specified by the mask
51767 // using the appropriate offset from the base pointer.
51768 EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
51769 Offset = 0;
51770 Addr = MaskedOp->getBasePtr();
51771 if (TrueMaskElt != 0) {
51772 Offset = TrueMaskElt * EltVT.getStoreSize();
51773 Addr = DAG.getMemBasePlusOffset(Addr, TypeSize::Fixed(Offset),
51774 SDLoc(MaskedOp));
51775 }
51776
51777 Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
51778 Alignment = commonAlignment(MaskedOp->getOriginalAlign(),
51779 EltVT.getStoreSize());
51780 return true;
51781}
51782
51783/// If exactly one element of the mask is set for a non-extending masked load,
51784/// it is a scalar load and vector insert.
51785/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
51786/// mask have already been optimized in IR, so we don't bother with those here.
51787static SDValue
51788reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
51789 TargetLowering::DAGCombinerInfo &DCI,
51790 const X86Subtarget &Subtarget) {
51791   assert(ML->isUnindexed() && "Unexpected indexed masked load!");
51792 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
51793 // However, some target hooks may need to be added to know when the transform
51794 // is profitable. Endianness would also have to be considered.
51795
51796 SDValue Addr, VecIndex;
51797 Align Alignment;
51798 unsigned Offset;
51799 if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment, Offset))
51800 return SDValue();
51801
51802 // Load the one scalar element that is specified by the mask using the
51803 // appropriate offset from the base pointer.
51804 SDLoc DL(ML);
51805 EVT VT = ML->getValueType(0);
51806 EVT EltVT = VT.getVectorElementType();
51807
51808 EVT CastVT = VT;
51809 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
51810 EltVT = MVT::f64;
51811 CastVT = VT.changeVectorElementType(EltVT);
51812 }
51813
51814 SDValue Load =
51815 DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
51816 ML->getPointerInfo().getWithOffset(Offset),
51817 Alignment, ML->getMemOperand()->getFlags());
51818
51819 SDValue PassThru = DAG.getBitcast(CastVT, ML->getPassThru());
51820
51821 // Insert the loaded element into the appropriate place in the vector.
51822 SDValue Insert =
51823 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, CastVT, PassThru, Load, VecIndex);
51824 Insert = DAG.getBitcast(VT, Insert);
51825 return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
51826}
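The rewrite above preserves masked-load semantics because a mask with exactly one true lane selects a single memory element and keeps the pass-through everywhere else. A simplified scalar emulation (plain C++, not LLVM code; the index 2 below stands in for what getOneTrueElt would return):

    #include <array>
    #include <cassert>

    int main() {
      std::array<int, 4> Memory   = {10, 20, 30, 40};  // what a full vector load would see
      std::array<int, 4> PassThru = {-1, -2, -3, -4};
      std::array<bool, 4> Mask    = {false, false, true, false};  // exactly one true lane

      // Reference masked-load semantics: Memory[i] where Mask[i] is set, else PassThru[i].
      std::array<int, 4> MaskedLoad = PassThru;
      for (int I = 0; I != 4; ++I)
        if (Mask[I])
          MaskedLoad[I] = Memory[I];

      // The rewrite: one scalar load of the selected element, inserted into PassThru.
      const int TrueIndex = 2;
      std::array<int, 4> Rewritten = PassThru;
      Rewritten[TrueIndex] = Memory[TrueIndex];

      assert(MaskedLoad == Rewritten);
      return 0;
    }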
51827
51828static SDValue
51829combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
51830 TargetLowering::DAGCombinerInfo &DCI) {
51831   assert(ML->isUnindexed() && "Unexpected indexed masked load!");
51832 if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
51833 return SDValue();
51834
51835 SDLoc DL(ML);
51836 EVT VT = ML->getValueType(0);
51837
51838 // If we are loading the first and last elements of a vector, it is safe and
51839 // always faster to load the whole vector. Replace the masked load with a
51840 // vector load and select.
51841 unsigned NumElts = VT.getVectorNumElements();
51842 BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
51843 bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
51844 bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
51845 if (LoadFirstElt && LoadLastElt) {
51846 SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
51847 ML->getMemOperand());
51848 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd,
51849 ML->getPassThru());
51850 return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
51851 }
51852
51853 // Convert a masked load with a constant mask into a masked load and a select.
51854 // This allows the select operation to use a faster kind of select instruction
51855 // (for example, vblendvps -> vblendps).
51856
51857 // Don't try this if the pass-through operand is already undefined. That would
51858 // cause an infinite loop because that's what we're about to create.
51859 if (ML->getPassThru().isUndef())
51860 return SDValue();
51861
51862 if (ISD::isBuildVectorAllZeros(ML->getPassThru().getNode()))
51863 return SDValue();
51864
51865 // The new masked load has an undef pass-through operand. The select uses the
51866 // original pass-through operand.
51867 SDValue NewML = DAG.getMaskedLoad(
51868 VT, DL, ML->getChain(), ML->getBasePtr(), ML->getOffset(), ML->getMask(),
51869 DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(),
51870 ML->getAddressingMode(), ML->getExtensionType());
51871 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML,
51872 ML->getPassThru());
51873
51874 return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
51875}
51876
51877static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
51878 TargetLowering::DAGCombinerInfo &DCI,
51879 const X86Subtarget &Subtarget) {
51880 auto *Mld = cast<MaskedLoadSDNode>(N);
51881
51882 // TODO: Expanding load with constant mask may be optimized as well.
51883 if (Mld->isExpandingLoad())
51884 return SDValue();
51885
51886 if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
51887 if (SDValue ScalarLoad =
51888 reduceMaskedLoadToScalarLoad(Mld, DAG, DCI, Subtarget))
51889 return ScalarLoad;
51890
51891 // TODO: Do some AVX512 subsets benefit from this transform?
51892 if (!Subtarget.hasAVX512())
51893 if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
51894 return Blend;
51895 }
51896
51897 // If the mask value has been legalized to a non-boolean vector, try to
51898 // simplify ops leading up to it. We only demand the MSB of each lane.
51899 SDValue Mask = Mld->getMask();
51900 if (Mask.getScalarValueSizeInBits() != 1) {
51901 EVT VT = Mld->getValueType(0);
51902 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51903 APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
51904 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
51905 if (N->getOpcode() != ISD::DELETED_NODE)
51906 DCI.AddToWorklist(N);
51907 return SDValue(N, 0);
51908 }
51909 if (SDValue NewMask =
51910 TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
51911 return DAG.getMaskedLoad(
51912 VT, SDLoc(N), Mld->getChain(), Mld->getBasePtr(), Mld->getOffset(),
51913 NewMask, Mld->getPassThru(), Mld->getMemoryVT(), Mld->getMemOperand(),
51914 Mld->getAddressingMode(), Mld->getExtensionType());
51915 }
51916
51917 return SDValue();
51918}
51919
51920/// If exactly one element of the mask is set for a non-truncating masked store,
51921/// it is a vector extract and scalar store.
51922/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
51923/// mask have already been optimized in IR, so we don't bother with those here.
51924static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
51925 SelectionDAG &DAG,
51926 const X86Subtarget &Subtarget) {
51927 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
51928 // However, some target hooks may need to be added to know when the transform
51929 // is profitable. Endianness would also have to be considered.
51930
51931 SDValue Addr, VecIndex;
51932 Align Alignment;
51933 unsigned Offset;
51934 if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment, Offset))
51935 return SDValue();
51936
51937 // Extract the one scalar element that is actually being stored.
51938 SDLoc DL(MS);
51939 SDValue Value = MS->getValue();
51940 EVT VT = Value.getValueType();
51941 EVT EltVT = VT.getVectorElementType();
51942 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
51943 EltVT = MVT::f64;
51944 EVT CastVT = VT.changeVectorElementType(EltVT);
51945 Value = DAG.getBitcast(CastVT, Value);
51946 }
51947 SDValue Extract =
51948 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Value, VecIndex);
51949
51950 // Store that element at the appropriate offset from the base pointer.
51951 return DAG.getStore(MS->getChain(), DL, Extract, Addr,
51952 MS->getPointerInfo().getWithOffset(Offset),
51953 Alignment, MS->getMemOperand()->getFlags());
51954}
51955
51956static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
51957 TargetLowering::DAGCombinerInfo &DCI,
51958 const X86Subtarget &Subtarget) {
51959 MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
51960 if (Mst->isCompressingStore())
51961 return SDValue();
51962
51963 EVT VT = Mst->getValue().getValueType();
51964 SDLoc dl(Mst);
51965 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51966
51967 if (Mst->isTruncatingStore())
51968 return SDValue();
51969
51970 if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG, Subtarget))
51971 return ScalarStore;
51972
51973 // If the mask value has been legalized to a non-boolean vector, try to
51974 // simplify ops leading up to it. We only demand the MSB of each lane.
51975 SDValue Mask = Mst->getMask();
51976 if (Mask.getScalarValueSizeInBits() != 1) {
51977 APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
51978 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
51979 if (N->getOpcode() != ISD::DELETED_NODE)
51980 DCI.AddToWorklist(N);
51981 return SDValue(N, 0);
51982 }
51983 if (SDValue NewMask =
51984 TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
51985 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Mst->getValue(),
51986 Mst->getBasePtr(), Mst->getOffset(), NewMask,
51987 Mst->getMemoryVT(), Mst->getMemOperand(),
51988 Mst->getAddressingMode());
51989 }
51990
51991 SDValue Value = Mst->getValue();
51992 if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
51993 TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
51994 Mst->getMemoryVT())) {
51995 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
51996 Mst->getBasePtr(), Mst->getOffset(), Mask,
51997 Mst->getMemoryVT(), Mst->getMemOperand(),
51998 Mst->getAddressingMode(), true);
51999 }
52000
52001 return SDValue();
52002}
52003
52004static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
52005 TargetLowering::DAGCombinerInfo &DCI,
52006 const X86Subtarget &Subtarget) {
52007 StoreSDNode *St = cast<StoreSDNode>(N);
52008 EVT StVT = St->getMemoryVT();
52009 SDLoc dl(St);
52010 SDValue StoredVal = St->getValue();
52011 EVT VT = StoredVal.getValueType();
52012 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52013
52014 // Convert a store of vXi1 into a store of iX and a bitcast.
52015 if (!Subtarget.hasAVX512() && VT == StVT && VT.isVector() &&
52016 VT.getVectorElementType() == MVT::i1) {
52017
52018 EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
52019 StoredVal = DAG.getBitcast(NewVT, StoredVal);
52020
52021 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
52022 St->getPointerInfo(), St->getOriginalAlign(),
52023 St->getMemOperand()->getFlags());
52024 }
52025
52026 // If this is a store of a scalar_to_vector to v1i1, just use a scalar store.
52027 // This will avoid a copy to k-register.
52028 if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
52029 StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
52030 StoredVal.getOperand(0).getValueType() == MVT::i8) {
52031 SDValue Val = StoredVal.getOperand(0);
52032 // We must store zeros to the unused bits.
52033 Val = DAG.getZeroExtendInReg(Val, dl, MVT::i1);
52034 return DAG.getStore(St->getChain(), dl, Val,
52035 St->getBasePtr(), St->getPointerInfo(),
52036 St->getOriginalAlign(),
52037 St->getMemOperand()->getFlags());
52038 }
52039
52040 // Widen v2i1/v4i1 stores to v8i1.
52041 if ((VT == MVT::v1i1 || VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
52042 Subtarget.hasAVX512()) {
52043 unsigned NumConcats = 8 / VT.getVectorNumElements();
52044 // We must store zeros to the unused bits.
52045 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, VT));
52046 Ops[0] = StoredVal;
52047 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
52048 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
52049 St->getPointerInfo(), St->getOriginalAlign(),
52050 St->getMemOperand()->getFlags());
52051 }
52052
52053 // Turn vXi1 stores of constants into a scalar store.
52054 if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 ||
52055 VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
52056 ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) {
52057     // If it's a v64i1 store without 64-bit support, we need two stores.
52058 if (!DCI.isBeforeLegalize() && VT == MVT::v64i1 && !Subtarget.is64Bit()) {
52059 SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
52060 StoredVal->ops().slice(0, 32));
52061 Lo = combinevXi1ConstantToInteger(Lo, DAG);
52062 SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl,
52063 StoredVal->ops().slice(32, 32));
52064 Hi = combinevXi1ConstantToInteger(Hi, DAG);
52065
52066 SDValue Ptr0 = St->getBasePtr();
52067 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, TypeSize::Fixed(4), dl);
52068
52069 SDValue Ch0 =
52070 DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
52071 St->getOriginalAlign(),
52072 St->getMemOperand()->getFlags());
52073 SDValue Ch1 =
52074 DAG.getStore(St->getChain(), dl, Hi, Ptr1,
52075 St->getPointerInfo().getWithOffset(4),
52076 St->getOriginalAlign(),
52077 St->getMemOperand()->getFlags());
52078 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
52079 }
52080
52081 StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG);
52082 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
52083 St->getPointerInfo(), St->getOriginalAlign(),
52084 St->getMemOperand()->getFlags());
52085 }
52086
52087 // If we are saving a 32-byte vector and 32-byte stores are slow, such as on
52088 // Sandy Bridge, perform two 16-byte stores.
52089 unsigned Fast;
52090 if (VT.is256BitVector() && StVT == VT &&
52091 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
52092 *St->getMemOperand(), &Fast) &&
52093 !Fast) {
52094 unsigned NumElems = VT.getVectorNumElements();
52095 if (NumElems < 2)
52096 return SDValue();
52097
52098 return splitVectorStore(St, DAG);
52099 }
52100
52101 // Split under-aligned vector non-temporal stores.
52102 if (St->isNonTemporal() && StVT == VT &&
52103 St->getAlign().value() < VT.getStoreSize()) {
52104 // ZMM/YMM nt-stores - either it can be stored as a series of shorter
52105 // vectors or the legalizer can scalarize it to use MOVNTI.
52106 if (VT.is256BitVector() || VT.is512BitVector()) {
52107 unsigned NumElems = VT.getVectorNumElements();
52108 if (NumElems < 2)
52109 return SDValue();
52110 return splitVectorStore(St, DAG);
52111 }
52112
52113 // XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64
52114 // to use MOVNTI.
52115 if (VT.is128BitVector() && Subtarget.hasSSE2()) {
52116 MVT NTVT = Subtarget.hasSSE4A()
52117 ? MVT::v2f64
52118 : (TLI.isTypeLegal(MVT::i64) ? MVT::v2i64 : MVT::v4i32);
52119 return scalarizeVectorStore(St, NTVT, DAG);
52120 }
52121 }
52122
52123   // Try to optimize v16i16->v16i8 truncating stores when BWI is not supported
52124   // but AVX512F is, by extending to v16i32 and truncating.
52125 if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() &&
52126 St->getValue().getOpcode() == ISD::TRUNCATE &&
52127 St->getValue().getOperand(0).getValueType() == MVT::v16i16 &&
52128 TLI.isTruncStoreLegal(MVT::v16i32, MVT::v16i8) &&
52129 St->getValue().hasOneUse() && !DCI.isBeforeLegalizeOps()) {
52130 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32,
52131 St->getValue().getOperand(0));
52132 return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
52133 MVT::v16i8, St->getMemOperand());
52134 }
52135
52136 // Try to fold a VTRUNCUS or VTRUNCS into a truncating store.
52137 if (!St->isTruncatingStore() &&
52138 (StoredVal.getOpcode() == X86ISD::VTRUNCUS ||
52139 StoredVal.getOpcode() == X86ISD::VTRUNCS) &&
52140 StoredVal.hasOneUse() &&
52141 TLI.isTruncStoreLegal(StoredVal.getOperand(0).getValueType(), VT)) {
52142 bool IsSigned = StoredVal.getOpcode() == X86ISD::VTRUNCS;
52143 return EmitTruncSStore(IsSigned, St->getChain(),
52144 dl, StoredVal.getOperand(0), St->getBasePtr(),
52145 VT, St->getMemOperand(), DAG);
52146 }
52147
52148   // Try to fold an extract_element(VTRUNC) pattern into a truncating store.
52149 if (!St->isTruncatingStore()) {
52150 auto IsExtractedElement = [](SDValue V) {
52151 if (V.getOpcode() == ISD::TRUNCATE && V.hasOneUse())
52152 V = V.getOperand(0);
52153 unsigned Opc = V.getOpcode();
52154 if ((Opc == ISD::EXTRACT_VECTOR_ELT || Opc == X86ISD::PEXTRW) &&
52155 isNullConstant(V.getOperand(1)) && V.hasOneUse() &&
52156 V.getOperand(0).hasOneUse())
52157 return V.getOperand(0);
52158 return SDValue();
52159 };
52160 if (SDValue Extract = IsExtractedElement(StoredVal)) {
52161 SDValue Trunc = peekThroughOneUseBitcasts(Extract);
52162 if (Trunc.getOpcode() == X86ISD::VTRUNC) {
52163 SDValue Src = Trunc.getOperand(0);
52164 MVT DstVT = Trunc.getSimpleValueType();
52165 MVT SrcVT = Src.getSimpleValueType();
52166 unsigned NumSrcElts = SrcVT.getVectorNumElements();
52167 unsigned NumTruncBits = DstVT.getScalarSizeInBits() * NumSrcElts;
52168 MVT TruncVT = MVT::getVectorVT(DstVT.getScalarType(), NumSrcElts);
52169 if (NumTruncBits == VT.getSizeInBits() &&
52170 TLI.isTruncStoreLegal(SrcVT, TruncVT)) {
52171 return DAG.getTruncStore(St->getChain(), dl, Src, St->getBasePtr(),
52172 TruncVT, St->getMemOperand());
52173 }
52174 }
52175 }
52176 }
52177
52178 // Optimize trunc store (of multiple scalars) to shuffle and store.
52179 // First, pack all of the elements in one place. Next, store to memory
52180 // in fewer chunks.
52181 if (St->isTruncatingStore() && VT.isVector()) {
52182     // Check if we can detect an AVG pattern from the truncation. If yes,
52183     // replace the trunc store by a normal store with the result of the
52184     // AVG instruction.
52185 if (DCI.isBeforeLegalize() || TLI.isTypeLegal(St->getMemoryVT()))
52186 if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
52187 Subtarget, dl))
52188 return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
52189 St->getPointerInfo(), St->getOriginalAlign(),
52190 St->getMemOperand()->getFlags());
52191
52192 if (TLI.isTruncStoreLegal(VT, StVT)) {
52193 if (SDValue Val = detectSSatPattern(St->getValue(), St->getMemoryVT()))
52194 return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
52195 dl, Val, St->getBasePtr(),
52196 St->getMemoryVT(), St->getMemOperand(), DAG);
52197 if (SDValue Val = detectUSatPattern(St->getValue(), St->getMemoryVT(),
52198 DAG, dl))
52199 return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
52200 dl, Val, St->getBasePtr(),
52201 St->getMemoryVT(), St->getMemOperand(), DAG);
52202 }
52203
52204 return SDValue();
52205 }
52206
52207 // Cast ptr32 and ptr64 pointers to the default address space before a store.
52208 unsigned AddrSpace = St->getAddressSpace();
52209 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
52210 AddrSpace == X86AS::PTR32_UPTR) {
52211 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
52212 if (PtrVT != St->getBasePtr().getSimpleValueType()) {
52213 SDValue Cast =
52214 DAG.getAddrSpaceCast(dl, PtrVT, St->getBasePtr(), AddrSpace, 0);
52215 return DAG.getStore(St->getChain(), dl, StoredVal, Cast,
52216 St->getPointerInfo(), St->getOriginalAlign(),
52217 St->getMemOperand()->getFlags(), St->getAAInfo());
52218 }
52219 }
52220
52221 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
52222 // the FP state in cases where an emms may be missing.
52223 // A preferable solution to the general problem is to figure out the right
52224 // places to insert EMMS. This qualifies as a quick hack.
52225
52226 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
52227 if (VT.getSizeInBits() != 64)
52228 return SDValue();
52229
52230 const Function &F = DAG.getMachineFunction().getFunction();
52231 bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
52232 bool F64IsLegal =
52233 !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
52234 if ((VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit()) &&
52235 isa<LoadSDNode>(St->getValue()) &&
52236 cast<LoadSDNode>(St->getValue())->isSimple() &&
52237 St->getChain().hasOneUse() && St->isSimple()) {
52238 LoadSDNode *Ld = cast<LoadSDNode>(St->getValue().getNode());
52239
52240 if (!ISD::isNormalLoad(Ld))
52241 return SDValue();
52242
52243 // Avoid the transformation if there are multiple uses of the loaded value.
52244 if (!Ld->hasNUsesOfValue(1, 0))
52245 return SDValue();
52246
52247 SDLoc LdDL(Ld);
52248 SDLoc StDL(N);
52249 // Lower to a single movq load/store pair.
52250 SDValue NewLd = DAG.getLoad(MVT::f64, LdDL, Ld->getChain(),
52251 Ld->getBasePtr(), Ld->getMemOperand());
52252
52253 // Make sure new load is placed in same chain order.
52254 DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
52255 return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
52256 St->getMemOperand());
52257 }
52258
52259 // This is similar to the above case, but here we handle a scalar 64-bit
52260 // integer store that is extracted from a vector on a 32-bit target.
52261 // If we have SSE2, then we can treat it like a floating-point double
52262 // to get past legalization. The execution dependencies fixup pass will
52263 // choose the optimal machine instruction for the store if this really is
52264 // an integer or v2f32 rather than an f64.
52265 if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
52266 St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
52267 SDValue OldExtract = St->getOperand(1);
52268 SDValue ExtOp0 = OldExtract.getOperand(0);
52269 unsigned VecSize = ExtOp0.getValueSizeInBits();
52270 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
52271 SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
52272 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
52273 BitCast, OldExtract.getOperand(1));
52274 return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
52275 St->getPointerInfo(), St->getOriginalAlign(),
52276 St->getMemOperand()->getFlags());
52277 }
52278
52279 return SDValue();
52280}
52281
52282static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG,
52283 TargetLowering::DAGCombinerInfo &DCI,
52284 const X86Subtarget &Subtarget) {
52285 auto *St = cast<MemIntrinsicSDNode>(N);
52286
52287 SDValue StoredVal = N->getOperand(1);
52288 MVT VT = StoredVal.getSimpleValueType();
52289 EVT MemVT = St->getMemoryVT();
52290
52291 // Figure out which elements we demand.
52292 unsigned StElts = MemVT.getSizeInBits() / VT.getScalarSizeInBits();
52293 APInt DemandedElts = APInt::getLowBitsSet(VT.getVectorNumElements(), StElts);
52294
52295 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52296 if (TLI.SimplifyDemandedVectorElts(StoredVal, DemandedElts, DCI)) {
52297 if (N->getOpcode() != ISD::DELETED_NODE)
52298 DCI.AddToWorklist(N);
52299 return SDValue(N, 0);
52300 }
52301
52302 return SDValue();
52303}
52304
52305/// Return 'true' if this vector operation is "horizontal"
52306/// and return the operands for the horizontal operation in LHS and RHS. A
52307/// horizontal operation performs the binary operation on successive elements
52308/// of its first operand, then on successive elements of its second operand,
52309/// returning the resulting values in a vector. For example, if
52310/// A = < float a0, float a1, float a2, float a3 >
52311/// and
52312/// B = < float b0, float b1, float b2, float b3 >
52313/// then the result of doing a horizontal operation on A and B is
52314/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
52315/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
52316/// A horizontal-op B, for some already available A and B, and if so then LHS is
52317/// set to A, RHS to B, and the routine returns 'true'.
52318static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS,
52319 SelectionDAG &DAG, const X86Subtarget &Subtarget,
52320 bool IsCommutative,
52321 SmallVectorImpl<int> &PostShuffleMask) {
52322 // If either operand is undef, bail out. The binop should be simplified.
52323 if (LHS.isUndef() || RHS.isUndef())
52324 return false;
52325
52326 // Look for the following pattern:
52327 // A = < float a0, float a1, float a2, float a3 >
52328 // B = < float b0, float b1, float b2, float b3 >
52329 // and
52330 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
52331 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
52332 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
52333 // which is A horizontal-op B.
52334
52335 MVT VT = LHS.getSimpleValueType();
52336   assert((VT.is128BitVector() || VT.is256BitVector()) &&
52337          "Unsupported vector type for horizontal add/sub");
52338 unsigned NumElts = VT.getVectorNumElements();
52339
52340 auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1,
52341 SmallVectorImpl<int> &ShuffleMask) {
52342 bool UseSubVector = false;
52343 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
52344 Op.getOperand(0).getValueType().is256BitVector() &&
52345 llvm::isNullConstant(Op.getOperand(1))) {
52346 Op = Op.getOperand(0);
52347 UseSubVector = true;
52348 }
52349 SmallVector<SDValue, 2> SrcOps;
52350 SmallVector<int, 16> SrcMask, ScaledMask;
52351 SDValue BC = peekThroughBitcasts(Op);
52352 if (getTargetShuffleInputs(BC, SrcOps, SrcMask, DAG) &&
52353 !isAnyZero(SrcMask) && all_of(SrcOps, [BC](SDValue Op) {
52354 return Op.getValueSizeInBits() == BC.getValueSizeInBits();
52355 })) {
52356 resolveTargetShuffleInputsAndMask(SrcOps, SrcMask);
52357 if (!UseSubVector && SrcOps.size() <= 2 &&
52358 scaleShuffleElements(SrcMask, NumElts, ScaledMask)) {
52359 N0 = !SrcOps.empty() ? SrcOps[0] : SDValue();
52360 N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue();
52361 ShuffleMask.assign(ScaledMask.begin(), ScaledMask.end());
52362 }
52363 if (UseSubVector && SrcOps.size() == 1 &&
52364 scaleShuffleElements(SrcMask, 2 * NumElts, ScaledMask)) {
52365 std::tie(N0, N1) = DAG.SplitVector(SrcOps[0], SDLoc(Op));
52366 ArrayRef<int> Mask = ArrayRef<int>(ScaledMask).slice(0, NumElts);
52367 ShuffleMask.assign(Mask.begin(), Mask.end());
52368 }
52369 }
52370 };
52371
52372 // View LHS in the form
52373 // LHS = VECTOR_SHUFFLE A, B, LMask
52374 // If LHS is not a shuffle, then pretend it is the identity shuffle:
52375 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
52376 // NOTE: A default initialized SDValue represents an UNDEF of type VT.
52377 SDValue A, B;
52378 SmallVector<int, 16> LMask;
52379 GetShuffle(LHS, A, B, LMask);
52380
52381 // Likewise, view RHS in the form
52382 // RHS = VECTOR_SHUFFLE C, D, RMask
52383 SDValue C, D;
52384 SmallVector<int, 16> RMask;
52385 GetShuffle(RHS, C, D, RMask);
52386
52387 // At least one of the operands should be a vector shuffle.
52388 unsigned NumShuffles = (LMask.empty() ? 0 : 1) + (RMask.empty() ? 0 : 1);
52389 if (NumShuffles == 0)
52390 return false;
52391
52392 if (LMask.empty()) {
52393 A = LHS;
52394 for (unsigned i = 0; i != NumElts; ++i)
52395 LMask.push_back(i);
52396 }
52397
52398 if (RMask.empty()) {
52399 C = RHS;
52400 for (unsigned i = 0; i != NumElts; ++i)
52401 RMask.push_back(i);
52402 }
52403
52404 // If we have a unary mask, ensure the other op is set to null.
52405 if (isUndefOrInRange(LMask, 0, NumElts))
52406 B = SDValue();
52407 else if (isUndefOrInRange(LMask, NumElts, NumElts * 2))
52408 A = SDValue();
52409
52410 if (isUndefOrInRange(RMask, 0, NumElts))
52411 D = SDValue();
52412 else if (isUndefOrInRange(RMask, NumElts, NumElts * 2))
52413 C = SDValue();
52414
52415 // If A and B occur in reverse order in RHS, then canonicalize by commuting
52416 // RHS operands and shuffle mask.
52417 if (A != C) {
52418 std::swap(C, D);
52419 ShuffleVectorSDNode::commuteMask(RMask);
52420 }
52421 // Check that the shuffles are both shuffling the same vectors.
52422 if (!(A == C && B == D))
52423 return false;
52424
52425 PostShuffleMask.clear();
52426 PostShuffleMask.append(NumElts, SM_SentinelUndef);
52427
52428 // LHS and RHS are now:
52429 // LHS = shuffle A, B, LMask
52430 // RHS = shuffle A, B, RMask
52431 // Check that the masks correspond to performing a horizontal operation.
52432 // AVX defines horizontal add/sub to operate independently on 128-bit lanes,
52433 // so we just repeat the inner loop if this is a 256-bit op.
52434 unsigned Num128BitChunks = VT.getSizeInBits() / 128;
52435 unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks;
52436 unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
52437   assert((NumEltsPer128BitChunk % 2 == 0) &&
52438          "Vector type should have an even number of elements in each lane");
52439 for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {
52440 for (unsigned i = 0; i != NumEltsPer128BitChunk; ++i) {
52441 // Ignore undefined components.
52442 int LIdx = LMask[i + j], RIdx = RMask[i + j];
52443 if (LIdx < 0 || RIdx < 0 ||
52444 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
52445 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
52446 continue;
52447
52448 // Check that successive odd/even elements are being operated on. If not,
52449 // this is not a horizontal operation.
52450 if (!((RIdx & 1) == 1 && (LIdx + 1) == RIdx) &&
52451 !((LIdx & 1) == 1 && (RIdx + 1) == LIdx && IsCommutative))
52452 return false;
52453
52454 // Compute the post-shuffle mask index based on where the element
52455 // is stored in the HOP result, and where it needs to be moved to.
52456 int Base = LIdx & ~1u;
52457 int Index = ((Base % NumEltsPer128BitChunk) / 2) +
52458 ((Base % NumElts) & ~(NumEltsPer128BitChunk - 1));
52459
52460 // The low half of the 128-bit result must choose from A.
52461 // The high half of the 128-bit result must choose from B,
52462 // unless B is undef. In that case, we are always choosing from A.
52463 if ((B && Base >= (int)NumElts) || (!B && i >= NumEltsPer64BitChunk))
52464 Index += NumEltsPer64BitChunk;
52465 PostShuffleMask[i + j] = Index;
52466 }
52467 }
52468
52469 SDValue NewLHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
52470 SDValue NewRHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
52471
52472 bool IsIdentityPostShuffle =
52473 isSequentialOrUndefInRange(PostShuffleMask, 0, NumElts, 0);
52474 if (IsIdentityPostShuffle)
52475 PostShuffleMask.clear();
52476
52477 // Avoid 128-bit multi lane shuffles if pre-AVX2 and FP (integer will split).
52478 if (!IsIdentityPostShuffle && !Subtarget.hasAVX2() && VT.isFloatingPoint() &&
52479 isMultiLaneShuffleMask(128, VT.getScalarSizeInBits(), PostShuffleMask))
52480 return false;
52481
52482 // If the source nodes are already used in HorizOps then always accept this.
52483 // Shuffle folding should merge these back together.
52484 bool FoundHorizLHS = llvm::any_of(NewLHS->uses(), [&](SDNode *User) {
52485 return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
52486 });
52487 bool FoundHorizRHS = llvm::any_of(NewRHS->uses(), [&](SDNode *User) {
52488 return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
52489 });
52490 bool ForceHorizOp = FoundHorizLHS && FoundHorizRHS;
52491
52492 // Assume a SingleSource HOP if we only shuffle one input and don't need to
52493 // shuffle the result.
52494 if (!ForceHorizOp &&
52495 !shouldUseHorizontalOp(NewLHS == NewRHS &&
52496 (NumShuffles < 2 || !IsIdentityPostShuffle),
52497 DAG, Subtarget))
52498 return false;
52499
52500 LHS = DAG.getBitcast(VT, NewLHS);
52501 RHS = DAG.getBitcast(VT, NewRHS);
52502 return true;
52503}
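
// Illustrative sketch (not part of X86ISelLowering.cpp): a scalar model of the
// horizontal-op semantics documented above for 4 x float operands, using plain
// std::array values instead of SDValues. The name `haddModel` is hypothetical
// and exists only for this example.
#include <array>
#include <cstdio>

static std::array<float, 4> haddModel(const std::array<float, 4> &A,
                                      const std::array<float, 4> &B) {
  // A hadd B = < a0+a1, a2+a3, b0+b1, b2+b3 >, i.e. the add of the
  // even-element shuffle <0,2,4,6> with the odd-element shuffle <1,3,5,7>.
  return {A[0] + A[1], A[2] + A[3], B[0] + B[1], B[2] + B[3]};
}

int main() {
  std::array<float, 4> A = {1, 2, 3, 4}, B = {10, 20, 30, 40};
  std::array<float, 4> R = haddModel(A, B);
  std::printf("%g %g %g %g\n", R[0], R[1], R[2], R[3]); // 3 7 30 70
}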
52504
52505// Try to synthesize horizontal (f)hadd/hsub from (f)adds/subs of shuffles.
52506static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG,
52507 const X86Subtarget &Subtarget) {
52508 EVT VT = N->getValueType(0);
52509 unsigned Opcode = N->getOpcode();
52510 bool IsAdd = (Opcode == ISD::FADD) || (Opcode == ISD::ADD);
52511 SmallVector<int, 8> PostShuffleMask;
52512
52513 switch (Opcode) {
52514 case ISD::FADD:
52515 case ISD::FSUB:
52516 if ((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
52517 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
52518 SDValue LHS = N->getOperand(0);
52519 SDValue RHS = N->getOperand(1);
52520 auto HorizOpcode = IsAdd ? X86ISD::FHADD : X86ISD::FHSUB;
52521 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
52522 PostShuffleMask)) {
52523 SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
52524 if (!PostShuffleMask.empty())
52525 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
52526 DAG.getUNDEF(VT), PostShuffleMask);
52527 return HorizBinOp;
52528 }
52529 }
52530 break;
52531 case ISD::ADD:
52532 case ISD::SUB:
52533 if (Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
52534 VT == MVT::v16i16 || VT == MVT::v8i32)) {
52535 SDValue LHS = N->getOperand(0);
52536 SDValue RHS = N->getOperand(1);
52537 auto HorizOpcode = IsAdd ? X86ISD::HADD : X86ISD::HSUB;
52538 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
52539 PostShuffleMask)) {
52540 auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL,
52541 ArrayRef<SDValue> Ops) {
52542 return DAG.getNode(HorizOpcode, DL, Ops[0].getValueType(), Ops);
52543 };
52544 SDValue HorizBinOp = SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
52545 {LHS, RHS}, HOpBuilder);
52546 if (!PostShuffleMask.empty())
52547 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
52548 DAG.getUNDEF(VT), PostShuffleMask);
52549 return HorizBinOp;
52550 }
52551 }
52552 break;
52553 }
52554
52555 return SDValue();
52556}
52557
52558// Try to combine the following nodes
52559// t29: i64 = X86ISD::Wrapper TargetConstantPool:i64
52560// <i32 -2147483648[float -0.000000e+00]> 0
52561// t27: v16i32[v16f32],ch = X86ISD::VBROADCAST_LOAD
52562// <(load 4 from constant-pool)> t0, t29
52563// [t30: v16i32 = bitcast t27]
52564// t6: v16i32 = xor t7, t27[t30]
52565// t11: v16f32 = bitcast t6
52566// t21: v16f32 = X86ISD::VFMULC[X86ISD::VCFMULC] t11, t8
52567// into X86ISD::VFCMULC[X86ISD::VFMULC] if possible:
52568// t22: v16f32 = bitcast t7
52569// t23: v16f32 = X86ISD::VFCMULC[X86ISD::VFMULC] t8, t22
52570// t24: v32f16 = bitcast t23
52571static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG,
52572 const X86Subtarget &Subtarget) {
52573 EVT VT = N->getValueType(0);
52574 SDValue LHS = N->getOperand(0);
52575 SDValue RHS = N->getOperand(1);
52576 int CombineOpcode =
52577 N->getOpcode() == X86ISD::VFCMULC ? X86ISD::VFMULC : X86ISD::VFCMULC;
52578 auto isConjugationConstant = [](const Constant *c) {
52579 if (const auto *CI = dyn_cast<ConstantInt>(c)) {
52580 APInt ConjugationInt32 = APInt(32, 0x80000000, true);
52581 APInt ConjugationInt64 = APInt(64, 0x8000000080000000ULL, true);
52582 switch (CI->getBitWidth()) {
52583 case 16:
52584 return false;
52585 case 32:
52586 return CI->getValue() == ConjugationInt32;
52587 case 64:
52588 return CI->getValue() == ConjugationInt64;
52589 default:
52590       llvm_unreachable("Unexpected bit width");
52591 }
52592 }
52593 if (const auto *CF = dyn_cast<ConstantFP>(c))
52594 return CF->isNegativeZeroValue();
52595 return false;
52596 };
52597 auto combineConjugation = [&](SDValue &r) {
52598 if (LHS->getOpcode() == ISD::BITCAST && RHS.hasOneUse()) {
52599 SDValue XOR = LHS.getOperand(0);
52600 if (XOR->getOpcode() == ISD::XOR && XOR.hasOneUse()) {
52601 SDValue XORRHS = XOR.getOperand(1);
52602 if (XORRHS.getOpcode() == ISD::BITCAST && XORRHS.hasOneUse())
52603 XORRHS = XORRHS.getOperand(0);
52604 if (XORRHS.getOpcode() == X86ISD::VBROADCAST_LOAD &&
52605 XORRHS.getOperand(1).getNumOperands()) {
52606 ConstantPoolSDNode *CP =
52607 dyn_cast<ConstantPoolSDNode>(XORRHS.getOperand(1).getOperand(0));
52608 if (CP && isConjugationConstant(CP->getConstVal())) {
52609 SelectionDAG::FlagInserter FlagsInserter(DAG, N);
52610 SDValue I2F = DAG.getBitcast(VT, LHS.getOperand(0).getOperand(0));
52611 SDValue FCMulC = DAG.getNode(CombineOpcode, SDLoc(N), VT, RHS, I2F);
52612 r = DAG.getBitcast(VT, FCMulC);
52613 return true;
52614 }
52615 }
52616 }
52617 }
52618 return false;
52619 };
52620 SDValue Res;
52621 if (combineConjugation(Res))
52622 return Res;
52623 std::swap(LHS, RHS);
52624 if (combineConjugation(Res))
52625 return Res;
52626 return Res;
52627}
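
// Illustrative sketch (not part of X86ISelLowering.cpp): the conjugation
// constant matched above flips only the sign bit of the imaginary half of each
// packed complex lane, which is exactly complex conjugation. The same identity
// at scalar granularity with std::complex<float>; `conjViaSignBit` is a
// hypothetical name used only here.
#include <cassert>
#include <complex>
#include <cstdint>
#include <cstring>

static std::complex<float> conjViaSignBit(std::complex<float> X) {
  float Im = X.imag();
  std::uint32_t Bits;
  std::memcpy(&Bits, &Im, sizeof(Bits));
  Bits ^= 0x80000000u; // xor with the sign-bit mask negates the imaginary part
  std::memcpy(&Im, &Bits, sizeof(Bits));
  return {X.real(), Im};
}

int main() {
  std::complex<float> A(1.5f, -2.0f), B(0.5f, 4.0f);
  // Multiplying the xor-conjugated A by B matches std::conj(A) * B, which is
  // the VFMULC <-> VFCMULC rewrite performed by the combine above.
  assert(conjViaSignBit(A) * B == std::conj(A) * B);
}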
52628
52629// Try to combine the following nodes:
52630// FADD(A, FMA(B, C, 0)) and FADD(A, FMUL(B, C)) to FMA(B, C, A)
52631static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG,
52632 const X86Subtarget &Subtarget) {
52633 auto AllowContract = [&DAG](const SDNodeFlags &Flags) {
52634 return DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast ||
52635 Flags.hasAllowContract();
52636 };
52637
52638 auto HasNoSignedZero = [&DAG](const SDNodeFlags &Flags) {
52639 return DAG.getTarget().Options.NoSignedZerosFPMath ||
52640 Flags.hasNoSignedZeros();
52641 };
52642 auto IsVectorAllNegativeZero = [](const SDNode *N) {
52643 if (N->getOpcode() != X86ISD::VBROADCAST_LOAD)
52644 return false;
52645     assert(N->getSimpleValueType(0).getScalarType() == MVT::f32 &&
52646            "Unexpected vector type!");
52647 if (ConstantPoolSDNode *CP =
52648 dyn_cast<ConstantPoolSDNode>(N->getOperand(1)->getOperand(0))) {
52649 APInt AI = APInt(32, 0x80008000, true);
52650 if (const auto *CI = dyn_cast<ConstantInt>(CP->getConstVal()))
52651 return CI->getValue() == AI;
52652 if (const auto *CF = dyn_cast<ConstantFP>(CP->getConstVal()))
52653 return CF->getValue() == APFloat(APFloat::IEEEsingle(), AI);
52654 }
52655 return false;
52656 };
52657
52658 if (N->getOpcode() != ISD::FADD || !Subtarget.hasFP16() ||
52659 !AllowContract(N->getFlags()))
52660 return SDValue();
52661
52662 EVT VT = N->getValueType(0);
52663 if (VT != MVT::v8f16 && VT != MVT::v16f16 && VT != MVT::v32f16)
52664 return SDValue();
52665
52666 SDValue LHS = N->getOperand(0);
52667 SDValue RHS = N->getOperand(1);
52668 bool IsConj;
52669 SDValue FAddOp1, MulOp0, MulOp1;
52670 auto GetCFmulFrom = [&MulOp0, &MulOp1, &IsConj, &AllowContract,
52671 &IsVectorAllNegativeZero,
52672 &HasNoSignedZero](SDValue N) -> bool {
52673 if (!N.hasOneUse() || N.getOpcode() != ISD::BITCAST)
52674 return false;
52675 SDValue Op0 = N.getOperand(0);
52676 unsigned Opcode = Op0.getOpcode();
52677 if (Op0.hasOneUse() && AllowContract(Op0->getFlags())) {
52678 if ((Opcode == X86ISD::VFMULC || Opcode == X86ISD::VFCMULC)) {
52679 MulOp0 = Op0.getOperand(0);
52680 MulOp1 = Op0.getOperand(1);
52681 IsConj = Opcode == X86ISD::VFCMULC;
52682 return true;
52683 }
52684 if ((Opcode == X86ISD::VFMADDC || Opcode == X86ISD::VFCMADDC) &&
52685 ((ISD::isBuildVectorAllZeros(Op0->getOperand(2).getNode()) &&
52686 HasNoSignedZero(Op0->getFlags())) ||
52687 IsVectorAllNegativeZero(Op0->getOperand(2).getNode()))) {
52688 MulOp0 = Op0.getOperand(0);
52689 MulOp1 = Op0.getOperand(1);
52690 IsConj = Opcode == X86ISD::VFCMADDC;
52691 return true;
52692 }
52693 }
52694 return false;
52695 };
52696
52697 if (GetCFmulFrom(LHS))
52698 FAddOp1 = RHS;
52699 else if (GetCFmulFrom(RHS))
52700 FAddOp1 = LHS;
52701 else
52702 return SDValue();
52703
52704 MVT CVT = MVT::getVectorVT(MVT::f32, VT.getVectorNumElements() / 2);
52705 FAddOp1 = DAG.getBitcast(CVT, FAddOp1);
52706 unsigned NewOp = IsConj ? X86ISD::VFCMADDC : X86ISD::VFMADDC;
52707 // FIXME: How do we handle when fast math flags of FADD are different from
52708 // CFMUL's?
52709 SDValue CFmul =
52710 DAG.getNode(NewOp, SDLoc(N), CVT, MulOp0, MulOp1, FAddOp1, N->getFlags());
52711 return DAG.getBitcast(VT, CFmul);
52712}
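
// Illustrative sketch (not part of X86ISelLowering.cpp): the contraction
// FADD(A, FMUL(B, C)) -> FMA(B, C, A) performed above for the complex-FP16
// nodes, shown on plain doubles. With exact inputs both forms agree; in
// general the fused form rounds once, which is why the combine is gated on
// the allow-contract fast-math flag.
#include <cassert>
#include <cmath>

int main() {
  double A = 0.5, B = 3.0, C = 0.25;
  assert(A + B * C == std::fma(B, C, A)); // 1.25 either way, exact here
}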
52713
52714/// Do target-specific dag combines on floating-point adds/subs.
52715static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
52716 const X86Subtarget &Subtarget) {
52717 if (SDValue HOp = combineToHorizontalAddSub(N, DAG, Subtarget))
52718 return HOp;
52719
52720 if (SDValue COp = combineFaddCFmul(N, DAG, Subtarget))
52721 return COp;
52722
52723 return SDValue();
52724}
52725
52726/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
52727/// the codegen.
52728/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
52729/// TODO: This overlaps with the generic combiner's visitTRUNCATE. Remove
52730/// anything that is guaranteed to be transformed by DAGCombiner.
52731static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
52732 const X86Subtarget &Subtarget,
52733 const SDLoc &DL) {
52734   assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
52735 SDValue Src = N->getOperand(0);
52736 unsigned SrcOpcode = Src.getOpcode();
52737 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52738
52739 EVT VT = N->getValueType(0);
52740 EVT SrcVT = Src.getValueType();
52741
52742 auto IsFreeTruncation = [VT](SDValue Op) {
52743 unsigned TruncSizeInBits = VT.getScalarSizeInBits();
52744
52745 // See if this has been extended from a smaller/equal size to
52746 // the truncation size, allowing a truncation to combine with the extend.
52747 unsigned Opcode = Op.getOpcode();
52748 if ((Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND ||
52749 Opcode == ISD::ZERO_EXTEND) &&
52750 Op.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
52751 return true;
52752
52753 // See if this is a single use constant which can be constant folded.
52754 // NOTE: We don't peek through bitcasts here because there is currently
52755 // no support for constant folding truncate+bitcast+vector_of_constants. So
52756 // we'll just end up with a truncate on both operands which will
52757 // get turned back into (truncate (binop)), causing an infinite loop.
52758 return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
52759 };
52760
52761 auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
52762 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
52763 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
52764 return DAG.getNode(SrcOpcode, DL, VT, Trunc0, Trunc1);
52765 };
52766
52767 // Don't combine if the operation has other uses.
52768 if (!Src.hasOneUse())
52769 return SDValue();
52770
52771 // Only support vector truncation for now.
52772 // TODO: i64 scalar math would benefit as well.
52773 if (!VT.isVector())
52774 return SDValue();
52775
52776 // In most cases it's only worth pre-truncating if we're only facing the cost
52777 // of one truncation.
52778 // i.e. if one of the inputs will constant fold or the input is repeated.
52779 switch (SrcOpcode) {
52780 case ISD::MUL:
52781 // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
52782 // better to truncate if we have the chance.
52783 if (SrcVT.getScalarType() == MVT::i64 &&
52784 TLI.isOperationLegal(SrcOpcode, VT) &&
52785 !TLI.isOperationLegal(SrcOpcode, SrcVT))
52786 return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
52787 [[fallthrough]];
52788 case ISD::AND:
52789 case ISD::XOR:
52790 case ISD::OR:
52791 case ISD::ADD:
52792 case ISD::SUB: {
52793 SDValue Op0 = Src.getOperand(0);
52794 SDValue Op1 = Src.getOperand(1);
52795 if (TLI.isOperationLegal(SrcOpcode, VT) &&
52796 (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
52797 return TruncateArithmetic(Op0, Op1);
52798 break;
52799 }
52800 }
52801
52802 return SDValue();
52803}
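
// Illustrative sketch (not part of X86ISelLowering.cpp): the pre-truncation
// rewrite above relies on truncation commuting with wrapping arithmetic and
// bitwise ops, e.g. trunc(X * Y) == trunc(X) * trunc(Y) modulo 2^16.
#include <cassert>
#include <cstdint>

int main() {
  std::uint32_t X = 0x12345678u, Y = 0x9ABCDEF0u;
  std::uint16_t XL = static_cast<std::uint16_t>(X);
  std::uint16_t YL = static_cast<std::uint16_t>(Y);
  // trunc(mul(X, Y)) == mul(trunc(X), trunc(Y)) modulo 2^16.
  assert(static_cast<std::uint16_t>(X * Y) ==
         static_cast<std::uint16_t>(static_cast<std::uint32_t>(XL) * YL));
  // The same holds for add/sub and the bitwise ops handled above.
  assert(static_cast<std::uint16_t>(X + Y) ==
         static_cast<std::uint16_t>(static_cast<std::uint32_t>(XL) + YL));
}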
52804
52805/// Truncate using ISD::AND mask and X86ISD::PACKUS.
52806/// e.g. trunc <8 x i32> X to <8 x i16> -->
52807/// MaskX = X & 0xffff (clear high bits to prevent saturation)
52808/// packus (extract_subv MaskX, 0), (extract_subv MaskX, 1)
52809static SDValue combineVectorTruncationWithPACKUS(SDNode *N, const SDLoc &DL,
52810 const X86Subtarget &Subtarget,
52811 SelectionDAG &DAG) {
52812 SDValue In = N->getOperand(0);
52813 EVT InVT = In.getValueType();
52814 EVT OutVT = N->getValueType(0);
52815
52816 APInt Mask = APInt::getLowBitsSet(InVT.getScalarSizeInBits(),
52817 OutVT.getScalarSizeInBits());
52818 In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(Mask, DL, InVT));
52819 return truncateVectorWithPACK(X86ISD::PACKUS, OutVT, In, DL, DAG, Subtarget);
52820}
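
// Illustrative sketch (not part of X86ISelLowering.cpp): a scalar model of why
// the AND mask above lets PACKUS behave like plain truncation. `packus16`
// models the per-element unsigned saturation of e.g. PACKUSDW; once the high
// bits are cleared, the value is already in [0, 0xFFFF] and no saturation
// happens.
#include <algorithm>
#include <cassert>
#include <cstdint>

static std::uint16_t packus16(std::int32_t V) {
  return static_cast<std::uint16_t>(std::clamp(V, 0, 0xFFFF));
}

int main() {
  std::int32_t X = 0x12345678;      // would saturate to 0xFFFF unmasked
  std::int32_t Masked = X & 0xFFFF; // clear high bits first
  assert(packus16(Masked) == static_cast<std::uint16_t>(X)); // == 0x5678
}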
52821
52822/// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
52823static SDValue combineVectorTruncationWithPACKSS(SDNode *N, const SDLoc &DL,
52824 const X86Subtarget &Subtarget,
52825 SelectionDAG &DAG) {
52826 SDValue In = N->getOperand(0);
52827 EVT InVT = In.getValueType();
52828 EVT OutVT = N->getValueType(0);
52829 In = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, InVT, In,
52830 DAG.getValueType(OutVT));
52831 return truncateVectorWithPACK(X86ISD::PACKSS, OutVT, In, DL, DAG, Subtarget);
52832}
52833
52834/// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
52835/// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
52836/// legalization the truncation will be translated into a BUILD_VECTOR with each
52837/// element that is extracted from a vector and then truncated, and it is
52838/// difficult to do this optimization based on them.
52839static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
52840 const X86Subtarget &Subtarget) {
52841 EVT OutVT = N->getValueType(0);
52842 if (!OutVT.isVector())
52843 return SDValue();
52844
52845 SDValue In = N->getOperand(0);
52846 if (!In.getValueType().isSimple())
52847 return SDValue();
52848
52849 EVT InVT = In.getValueType();
52850 unsigned NumElems = OutVT.getVectorNumElements();
52851
52852 // AVX512 provides fast truncate ops.
52853 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
52854 return SDValue();
52855
52856 EVT OutSVT = OutVT.getVectorElementType();
52857 EVT InSVT = InVT.getVectorElementType();
52858 if (!((InSVT == MVT::i16 || InSVT == MVT::i32 || InSVT == MVT::i64) &&
52859 (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
52860 NumElems >= 8))
52861 return SDValue();
52862
52863 // SSSE3's pshufb results in fewer instructions in the cases below.
52864 if (Subtarget.hasSSSE3() && NumElems == 8) {
52865 if (InSVT == MVT::i16)
52866 return SDValue();
52867 if (InSVT == MVT::i32 &&
52868 (OutSVT == MVT::i8 || !Subtarget.hasSSE41() || Subtarget.hasInt256()))
52869 return SDValue();
52870 }
52871
52872 SDLoc DL(N);
52873 // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
52874 // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
52875 // truncate 2 x v4i32 to v8i16.
52876 if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
52877 return combineVectorTruncationWithPACKUS(N, DL, Subtarget, DAG);
52878 if (InSVT == MVT::i32)
52879 return combineVectorTruncationWithPACKSS(N, DL, Subtarget, DAG);
52880
52881 return SDValue();
52882}
52883
52884/// This function transforms vector truncation of 'extended sign-bits' or
52885/// 'extended zero-bits' values.
52886/// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS/PACKUS operations.
52887static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
52888 SelectionDAG &DAG,
52889 const X86Subtarget &Subtarget) {
52890 // Requires SSE2.
52891 if (!Subtarget.hasSSE2())
52892 return SDValue();
52893
52894 if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
52895 return SDValue();
52896
52897 SDValue In = N->getOperand(0);
52898 if (!In.getValueType().isSimple())
52899 return SDValue();
52900
52901 MVT VT = N->getValueType(0).getSimpleVT();
52902 MVT SVT = VT.getScalarType();
52903
52904 MVT InVT = In.getValueType().getSimpleVT();
52905 MVT InSVT = InVT.getScalarType();
52906
52907 // Check we have a truncation suited for PACKSS/PACKUS.
52908 if (!isPowerOf2_32(VT.getVectorNumElements()))
52909 return SDValue();
52910 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
52911 return SDValue();
52912 if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
52913 return SDValue();
52914
52915 // Truncation to sub-128bit vXi32 can be better handled with shuffles.
52916 if (SVT == MVT::i32 && VT.getSizeInBits() < 128)
52917 return SDValue();
52918
52919 // AVX512 has fast truncate, but if the input is already going to be split,
52920 // there's no harm in trying pack.
52921 if (Subtarget.hasAVX512() &&
52922 !(!Subtarget.useAVX512Regs() && VT.is256BitVector() &&
52923 InVT.is512BitVector())) {
52924 // PACK should still be worth it for 128-bit vectors if the sources were
52925 // originally concatenated from subvectors.
52926 SmallVector<SDValue> ConcatOps;
52927 if (VT.getSizeInBits() > 128 ||
52928 !collectConcatOps(In.getNode(), ConcatOps, DAG))
52929 return SDValue();
52930 }
52931
52932 unsigned NumPackedSignBits = std::min<unsigned>(SVT.getSizeInBits(), 16);
52933 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
52934
52935 // Use PACKUS if the input has zero-bits that extend all the way to the
52936 // packed/truncated value. e.g. masks, zext_in_reg, etc.
52937 KnownBits Known = DAG.computeKnownBits(In);
52938 unsigned NumLeadingZeroBits = Known.countMinLeadingZeros();
52939 if (NumLeadingZeroBits >= (InSVT.getSizeInBits() - NumPackedZeroBits))
52940 return truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget);
52941
52942 // Use PACKSS if the input has sign-bits that extend all the way to the
52943 // packed/truncated value. e.g. Comparison result, sext_in_reg, etc.
52944 unsigned NumSignBits = DAG.ComputeNumSignBits(In);
52945
52946 // Don't use PACKSS for vXi64 -> vXi32 truncations unless we're dealing with
52947 // a sign splat. ComputeNumSignBits struggles to see through BITCASTs later
52948 // on and combines/simplifications can't then use it.
52949 if (SVT == MVT::i32 && NumSignBits != InSVT.getSizeInBits())
52950 return SDValue();
52951
52952 unsigned MinSignBits = InSVT.getSizeInBits() - NumPackedSignBits;
52953 if (NumSignBits > MinSignBits)
52954 return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);
52955
52956 // If we have a srl that only generates signbits that we will discard in
52957 // the truncation then we can use PACKSS by converting the srl to a sra.
52958 // SimplifyDemandedBits often relaxes sra to srl so we need to reverse it.
52959 if (In.getOpcode() == ISD::SRL && N->isOnlyUserOf(In.getNode()))
52960 if (const APInt *ShAmt = DAG.getValidShiftAmountConstant(
52961 In, APInt::getAllOnes(VT.getVectorNumElements()))) {
52962 if (*ShAmt == MinSignBits) {
52963 SDValue NewIn = DAG.getNode(ISD::SRA, DL, InVT, In->ops());
52964 return truncateVectorWithPACK(X86ISD::PACKSS, VT, NewIn, DL, DAG,
52965 Subtarget);
52966 }
52967 }
52968
52969 return SDValue();
52970}
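
// Illustrative sketch (not part of X86ISelLowering.cpp): a scalar model of the
// PACKSS path above. When the input is already a sign-extended i16 (i.e. it
// has at least 17 known sign bits), signed saturation equals truncation, so
// the pack produces the truncated value.
#include <algorithm>
#include <cassert>
#include <cstdint>

static std::int16_t packss16(std::int32_t V) {
  return static_cast<std::int16_t>(std::clamp(V, -32768, 32767));
}

int main() {
  std::int16_t Src = -12345;
  std::int32_t Ext = Src; // sign-extended input, 17+ sign bits
  assert(packss16(Ext) == static_cast<std::int16_t>(Ext));
}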
52971
52972// Try to form a MULHU or MULHS node by looking for
52973// (trunc (srl (mul ext, ext), 16))
52974// TODO: This is X86 specific because we want to be able to handle wide types
52975// before type legalization. But we can only do it if the vector will be
52976// legalized via widening/splitting. Type legalization can't handle promotion
52977// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
52978// combiner.
52979static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
52980 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
52981 // First instruction should be a right shift of a multiply.
52982 if (Src.getOpcode() != ISD::SRL ||
52983 Src.getOperand(0).getOpcode() != ISD::MUL)
52984 return SDValue();
52985
52986 if (!Subtarget.hasSSE2())
52987 return SDValue();
52988
52989 // Only handle vXi16 types that are at least 128-bits unless they will be
52990 // widened.
52991 if (!VT.isVector() || VT.getVectorElementType() != MVT::i16)
52992 return SDValue();
52993
52994 // Input type should be at least vXi32.
52995 EVT InVT = Src.getValueType();
52996 if (InVT.getVectorElementType().getSizeInBits() < 32)
52997 return SDValue();
52998
52999 // Need a shift by 16.
53000 APInt ShiftAmt;
53001 if (!ISD::isConstantSplatVector(Src.getOperand(1).getNode(), ShiftAmt) ||
53002 ShiftAmt != 16)
53003 return SDValue();
53004
53005 SDValue LHS = Src.getOperand(0).getOperand(0);
53006 SDValue RHS = Src.getOperand(0).getOperand(1);
53007
53008 // Count leading sign/zero bits on both inputs - if there are enough then
53009 // truncation back to vXi16 will be cheap - either as a pack/shuffle
53010 // sequence or using AVX512 truncations. If the inputs are sext/zext then the
53011 // truncations may actually be free by peeking through to the ext source.
53012 auto IsSext = [&DAG](SDValue V) {
53013 return DAG.ComputeMaxSignificantBits(V) <= 16;
53014 };
53015 auto IsZext = [&DAG](SDValue V) {
53016 return DAG.computeKnownBits(V).countMaxActiveBits() <= 16;
53017 };
53018
53019 bool IsSigned = IsSext(LHS) && IsSext(RHS);
53020 bool IsUnsigned = IsZext(LHS) && IsZext(RHS);
53021 if (!IsSigned && !IsUnsigned)
53022 return SDValue();
53023
53024 // Check if both inputs are extensions, which will be removed by truncation.
53025 bool IsTruncateFree = (LHS.getOpcode() == ISD::SIGN_EXTEND ||
53026 LHS.getOpcode() == ISD::ZERO_EXTEND) &&
53027 (RHS.getOpcode() == ISD::SIGN_EXTEND ||
53028 RHS.getOpcode() == ISD::ZERO_EXTEND) &&
53029 LHS.getOperand(0).getScalarValueSizeInBits() <= 16 &&
53030 RHS.getOperand(0).getScalarValueSizeInBits() <= 16;
53031
53032 // For AVX2+ targets, with the upper bits known zero, we can perform MULHU on
53033 // the (bitcasted) inputs directly, and then cheaply pack/truncate the result
53034 // (upper elts will be zero). Don't attempt this with just AVX512F as MULHU
53035 // will have to split anyway.
53036 unsigned InSizeInBits = InVT.getSizeInBits();
53037 if (IsUnsigned && !IsTruncateFree && Subtarget.hasInt256() &&
53038 !(Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.is256BitVector()) &&
53039 (InSizeInBits % 16) == 0) {
53040 EVT BCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
53041 InVT.getSizeInBits() / 16);
53042 SDValue Res = DAG.getNode(ISD::MULHU, DL, BCVT, DAG.getBitcast(BCVT, LHS),
53043 DAG.getBitcast(BCVT, RHS));
53044 return DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getBitcast(InVT, Res));
53045 }
53046
53047 // Truncate back to source type.
53048 LHS = DAG.getNode(ISD::TRUNCATE, DL, VT, LHS);
53049 RHS = DAG.getNode(ISD::TRUNCATE, DL, VT, RHS);
53050
53051 unsigned Opc = IsSigned ? ISD::MULHS : ISD::MULHU;
53052 return DAG.getNode(Opc, DL, VT, LHS, RHS);
53053}
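
// Illustrative sketch (not part of X86ISelLowering.cpp): the scalar identity
// behind the PMULHUW match above - the high 16 bits of the full 32-bit product
// of two u16 values are exactly what a MULHU of the u16 inputs returns.
// `mulhu16` is a hypothetical helper name used only here.
#include <cassert>
#include <cstdint>

static std::uint16_t mulhu16(std::uint16_t A, std::uint16_t B) {
  return static_cast<std::uint16_t>((static_cast<std::uint32_t>(A) * B) >> 16);
}

int main() {
  std::uint16_t A = 50000, B = 40000;
  std::uint32_t Wide = static_cast<std::uint32_t>(A) * B; // zext + mul
  assert(static_cast<std::uint16_t>(Wide >> 16) == mulhu16(A, B)); // srl 16 + trunc
}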
53054
53055// Attempt to match PMADDUBSW, which multiplies corresponding unsigned bytes
53056// from one vector with signed bytes from another vector, adds together
53057// adjacent pairs of 16-bit products, and saturates the result before
53058// truncating to 16-bits.
53059//
53060// Which looks something like this:
53061// (i16 (ssat (add (mul (zext (even elts (i8 A))), (sext (even elts (i8 B)))),
53062// (mul (zext (odd elts (i8 A)), (sext (odd elts (i8 B))))))))
53063static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
53064 const X86Subtarget &Subtarget,
53065 const SDLoc &DL) {
53066 if (!VT.isVector() || !Subtarget.hasSSSE3())
53067 return SDValue();
53068
53069 unsigned NumElems = VT.getVectorNumElements();
53070 EVT ScalarVT = VT.getVectorElementType();
53071 if (ScalarVT != MVT::i16 || NumElems < 8 || !isPowerOf2_32(NumElems))
53072 return SDValue();
53073
53074 SDValue SSatVal = detectSSatPattern(In, VT);
53075 if (!SSatVal || SSatVal.getOpcode() != ISD::ADD)
53076 return SDValue();
53077
53078 // Ok this is a signed saturation of an ADD. See if this ADD is adding pairs
53079 // of multiplies from even/odd elements.
53080 SDValue N0 = SSatVal.getOperand(0);
53081 SDValue N1 = SSatVal.getOperand(1);
53082
53083 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
53084 return SDValue();
53085
53086 SDValue N00 = N0.getOperand(0);
53087 SDValue N01 = N0.getOperand(1);
53088 SDValue N10 = N1.getOperand(0);
53089 SDValue N11 = N1.getOperand(1);
53090
53091 // TODO: Handle constant vectors and use knownbits/computenumsignbits?
53092 // Canonicalize zero_extend to LHS.
53093 if (N01.getOpcode() == ISD::ZERO_EXTEND)
53094 std::swap(N00, N01);
53095 if (N11.getOpcode() == ISD::ZERO_EXTEND)
53096 std::swap(N10, N11);
53097
53098 // Ensure we have a zero_extend and a sign_extend.
53099 if (N00.getOpcode() != ISD::ZERO_EXTEND ||
53100 N01.getOpcode() != ISD::SIGN_EXTEND ||
53101 N10.getOpcode() != ISD::ZERO_EXTEND ||
53102 N11.getOpcode() != ISD::SIGN_EXTEND)
53103 return SDValue();
53104
53105 // Peek through the extends.
53106 N00 = N00.getOperand(0);
53107 N01 = N01.getOperand(0);
53108 N10 = N10.getOperand(0);
53109 N11 = N11.getOperand(0);
53110
53111 // Ensure the extend is from vXi8.
53112 if (N00.getValueType().getVectorElementType() != MVT::i8 ||
53113 N01.getValueType().getVectorElementType() != MVT::i8 ||
53114 N10.getValueType().getVectorElementType() != MVT::i8 ||
53115 N11.getValueType().getVectorElementType() != MVT::i8)
53116 return SDValue();
53117
53118 // All inputs should be build_vectors.
53119 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
53120 N01.getOpcode() != ISD::BUILD_VECTOR ||
53121 N10.getOpcode() != ISD::BUILD_VECTOR ||
53122 N11.getOpcode() != ISD::BUILD_VECTOR)
53123 return SDValue();
53124
53125 // N00/N10 are zero extended. N01/N11 are sign extended.
53126
53127 // For each element, we need to ensure we have an odd element from one vector
53128 // multiplied by the odd element of another vector and the even element from
53129 // one of the same vectors being multiplied by the even element from the
53130 // other vector. So we need to make sure for each element i, this operator
53131 // is being performed:
53132 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
53133 SDValue ZExtIn, SExtIn;
53134 for (unsigned i = 0; i != NumElems; ++i) {
53135 SDValue N00Elt = N00.getOperand(i);
53136 SDValue N01Elt = N01.getOperand(i);
53137 SDValue N10Elt = N10.getOperand(i);
53138 SDValue N11Elt = N11.getOperand(i);
53139 // TODO: Be more tolerant to undefs.
53140 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
53141 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
53142 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
53143 N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
53144 return SDValue();
53145 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
53146 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
53147 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
53148 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
53149 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
53150 return SDValue();
53151 unsigned IdxN00 = ConstN00Elt->getZExtValue();
53152 unsigned IdxN01 = ConstN01Elt->getZExtValue();
53153 unsigned IdxN10 = ConstN10Elt->getZExtValue();
53154 unsigned IdxN11 = ConstN11Elt->getZExtValue();
53155 // Add is commutative so indices can be reordered.
53156 if (IdxN00 > IdxN10) {
53157 std::swap(IdxN00, IdxN10);
53158 std::swap(IdxN01, IdxN11);
53159 }
53160 // N0 indices must be the even element. N1 indices must be the next odd element.
53161 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
53162 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
53163 return SDValue();
53164 SDValue N00In = N00Elt.getOperand(0);
53165 SDValue N01In = N01Elt.getOperand(0);
53166 SDValue N10In = N10Elt.getOperand(0);
53167 SDValue N11In = N11Elt.getOperand(0);
53168 // First time we find an input capture it.
53169 if (!ZExtIn) {
53170 ZExtIn = N00In;
53171 SExtIn = N01In;
53172 }
53173 if (ZExtIn != N00In || SExtIn != N01In ||
53174 ZExtIn != N10In || SExtIn != N11In)
53175 return SDValue();
53176 }
53177
53178 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
53179 ArrayRef<SDValue> Ops) {
53180 // Shrink by adding truncate nodes and let DAGCombine fold with the
53181 // sources.
53182 EVT InVT = Ops[0].getValueType();
53183     assert(InVT.getScalarType() == MVT::i8 &&
53184            "Unexpected scalar element type");
53185     assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
53186 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
53187 InVT.getVectorNumElements() / 2);
53188 return DAG.getNode(X86ISD::VPMADDUBSW, DL, ResVT, Ops[0], Ops[1]);
53189 };
53190 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { ZExtIn, SExtIn },
53191 PMADDBuilder);
53192}
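
// Illustrative sketch (not part of X86ISelLowering.cpp): a scalar model of one
// PMADDUBSW output element as described in the comment above - an unsigned
// byte times a signed byte for the even and odd positions, summed and then
// signed-saturated to 16 bits. `pmaddubswElt` is a hypothetical helper name.
#include <algorithm>
#include <cassert>
#include <cstdint>

static std::int16_t pmaddubswElt(std::uint8_t AEven, std::uint8_t AOdd,
                                 std::int8_t BEven, std::int8_t BOdd) {
  std::int32_t Sum = static_cast<std::int32_t>(AEven) * BEven +
                     static_cast<std::int32_t>(AOdd) * BOdd;
  return static_cast<std::int16_t>(std::clamp(Sum, -32768, 32767));
}

int main() {
  // 255*127 + 255*127 = 64770, which saturates to 32767.
  assert(pmaddubswElt(255, 255, 127, 127) == 32767);
  assert(pmaddubswElt(10, 20, -3, 4) == 10 * -3 + 20 * 4);
}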
53193
53194static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
53195 const X86Subtarget &Subtarget) {
53196 EVT VT = N->getValueType(0);
53197 SDValue Src = N->getOperand(0);
53198 SDLoc DL(N);
53199
53200 // Attempt to pre-truncate inputs to arithmetic ops instead.
53201 if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
53202 return V;
53203
53204 // Try to detect AVG pattern first.
53205 if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
53206 return Avg;
53207
53208 // Try to detect PMADD
53209 if (SDValue PMAdd = detectPMADDUBSW(Src, VT, DAG, Subtarget, DL))
53210 return PMAdd;
53211
53212 // Try to combine truncation with signed/unsigned saturation.
53213 if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget))
53214 return Val;
53215
53216 // Try to combine PMULHUW/PMULHW for vXi16.
53217 if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget))
53218 return V;
53219
53220 // The bitcast source is a direct mmx result.
53221 // Detect bitcasts between x86mmx and i32.
53222 if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
53223 SDValue BCSrc = Src.getOperand(0);
53224 if (BCSrc.getValueType() == MVT::x86mmx)
53225 return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
53226 }
53227
53228 // Try to truncate extended sign/zero bits with PACKSS/PACKUS.
53229 if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
53230 return V;
53231
53232 return combineVectorTruncation(N, DAG, Subtarget);
53233}
53234
53235static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG,
53236 TargetLowering::DAGCombinerInfo &DCI) {
53237 EVT VT = N->getValueType(0);
53238 SDValue In = N->getOperand(0);
53239 SDLoc DL(N);
53240
53241 if (SDValue SSatVal = detectSSatPattern(In, VT))
53242 return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
53243 if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL))
53244 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
53245
53246 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53247 APInt DemandedMask(APInt::getAllOnes(VT.getScalarSizeInBits()));
53248 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
53249 return SDValue(N, 0);
53250
53251 return SDValue();
53252}
53253
53254/// Returns the negated value if the node \p N flips sign of FP value.
53255///
53256/// FP-negation node may have different forms: FNEG(x), FXOR (x, 0x80000000)
53257/// or FSUB(0, x)
53258/// AVX512F does not have FXOR, so FNEG is lowered as
53259/// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
53260/// In this case we go through all bitcasts.
53261/// This also recognizes a splat of a negated value and returns the splat of that
53262/// value.
53263static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) {
53264 if (N->getOpcode() == ISD::FNEG)
53265 return N->getOperand(0);
53266
53267 // Don't recurse exponentially.
53268 if (Depth > SelectionDAG::MaxRecursionDepth)
53269 return SDValue();
53270
53271 unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits();
53272
53273 SDValue Op = peekThroughBitcasts(SDValue(N, 0));
53274 EVT VT = Op->getValueType(0);
53275
53276 // Make sure the element size doesn't change.
53277 if (VT.getScalarSizeInBits() != ScalarSize)
53278 return SDValue();
53279
53280 unsigned Opc = Op.getOpcode();
53281 switch (Opc) {
53282 case ISD::VECTOR_SHUFFLE: {
53283 // For a VECTOR_SHUFFLE(VEC1, VEC2), if the VEC2 is undef, then the negate
53284 // of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here.
53285 if (!Op.getOperand(1).isUndef())
53286 return SDValue();
53287 if (SDValue NegOp0 = isFNEG(DAG, Op.getOperand(0).getNode(), Depth + 1))
53288 if (NegOp0.getValueType() == VT) // FIXME: Can we do better?
53289 return DAG.getVectorShuffle(VT, SDLoc(Op), NegOp0, DAG.getUNDEF(VT),
53290 cast<ShuffleVectorSDNode>(Op)->getMask());
53291 break;
53292 }
53293 case ISD::INSERT_VECTOR_ELT: {
53294 // Negate of INSERT_VECTOR_ELT(UNDEF, V, INDEX) is INSERT_VECTOR_ELT(UNDEF,
53295 // -V, INDEX).
53296 SDValue InsVector = Op.getOperand(0);
53297 SDValue InsVal = Op.getOperand(1);
53298 if (!InsVector.isUndef())
53299 return SDValue();
53300 if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode(), Depth + 1))
53301 if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME
53302 return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector,
53303 NegInsVal, Op.getOperand(2));
53304 break;
53305 }
53306 case ISD::FSUB:
53307 case ISD::XOR:
53308 case X86ISD::FXOR: {
53309 SDValue Op1 = Op.getOperand(1);
53310 SDValue Op0 = Op.getOperand(0);
53311
53312 // For XOR and FXOR, we want to check if constant
53313 // bits of Op1 are sign bit masks. For FSUB, we
53314 // have to check if constant bits of Op0 are sign
53315 // bit masks and hence we swap the operands.
53316 if (Opc == ISD::FSUB)
53317 std::swap(Op0, Op1);
53318
53319 APInt UndefElts;
53320 SmallVector<APInt, 16> EltBits;
53321 // Extract constant bits and see if they are all
53322 // sign bit masks. Ignore the undef elements.
53323 if (getTargetConstantBitsFromNode(Op1, ScalarSize, UndefElts, EltBits,
53324 /* AllowWholeUndefs */ true,
53325 /* AllowPartialUndefs */ false)) {
53326 for (unsigned I = 0, E = EltBits.size(); I < E; I++)
53327 if (!UndefElts[I] && !EltBits[I].isSignMask())
53328 return SDValue();
53329
53330 // Only allow bitcast from correctly-sized constant.
53331 Op0 = peekThroughBitcasts(Op0);
53332 if (Op0.getScalarValueSizeInBits() == ScalarSize)
53333 return Op0;
53334 }
53335 break;
53336 } // case
53337 } // switch
53338
53339 return SDValue();
53340}
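
// Illustrative sketch (not part of X86ISelLowering.cpp): the sign-mask form of
// FP negation that isFNEG recognizes - XORing the IEEE-754 sign bit negates
// the value with no arithmetic. Bit punning via memcpy keeps this C++17-clean;
// `fnegViaXor` is a hypothetical name used only here.
#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>

static float fnegViaXor(float X) {
  std::uint32_t Bits;
  std::memcpy(&Bits, &X, sizeof(Bits));
  Bits ^= 0x80000000u; // the sign-bit mask checked above
  std::memcpy(&X, &Bits, sizeof(Bits));
  return X;
}

int main() {
  assert(fnegViaXor(3.5f) == -3.5f);
  assert(std::signbit(fnegViaXor(0.0f))); // +0.0 becomes -0.0
}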
53341
53342static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
53343 bool NegRes) {
53344 if (NegMul) {
53345 switch (Opcode) {
53346     default: llvm_unreachable("Unexpected opcode");
53347 case ISD::FMA: Opcode = X86ISD::FNMADD; break;
53348 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FNMADD; break;
53349 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break;
53350 case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break;
53351 case X86ISD::STRICT_FMSUB: Opcode = X86ISD::STRICT_FNMSUB; break;
53352 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break;
53353 case X86ISD::FNMADD: Opcode = ISD::FMA; break;
53354 case X86ISD::STRICT_FNMADD: Opcode = ISD::STRICT_FMA; break;
53355 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break;
53356 case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break;
53357 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FMSUB; break;
53358 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break;
53359 }
53360 }
53361
53362 if (NegAcc) {
53363 switch (Opcode) {
53364     default: llvm_unreachable("Unexpected opcode");
53365 case ISD::FMA: Opcode = X86ISD::FMSUB; break;
53366 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FMSUB; break;
53367 case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
53368 case X86ISD::FMSUB: Opcode = ISD::FMA; break;
53369 case X86ISD::STRICT_FMSUB: Opcode = ISD::STRICT_FMA; break;
53370 case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
53371 case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break;
53372 case X86ISD::STRICT_FNMADD: Opcode = X86ISD::STRICT_FNMSUB; break;
53373 case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
53374 case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break;
53375 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FNMADD; break;
53376 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
53377 case X86ISD::FMADDSUB: Opcode = X86ISD::FMSUBADD; break;
53378 case X86ISD::FMADDSUB_RND: Opcode = X86ISD::FMSUBADD_RND; break;
53379 case X86ISD::FMSUBADD: Opcode = X86ISD::FMADDSUB; break;
53380 case X86ISD::FMSUBADD_RND: Opcode = X86ISD::FMADDSUB_RND; break;
53381 }
53382 }
53383
53384 if (NegRes) {
53385 switch (Opcode) {
53386 // For accuracy reasons, we never combine fneg and fma under strict FP.
53387     default: llvm_unreachable("Unexpected opcode");
53388 case ISD::FMA: Opcode = X86ISD::FNMSUB; break;
53389 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
53390 case X86ISD::FMSUB: Opcode = X86ISD::FNMADD; break;
53391 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
53392 case X86ISD::FNMADD: Opcode = X86ISD::FMSUB; break;
53393 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
53394 case X86ISD::FNMSUB: Opcode = ISD::FMA; break;
53395 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
53396 }
53397 }
53398
53399 return Opcode;
53400}
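
// Illustrative sketch (not part of X86ISelLowering.cpp): the algebra behind
// the opcode table above, checked on doubles with std::fma and exact inputs.
// Negating the multiplicand maps FMADD to FNMADD, negating the accumulator
// maps FMADD to FMSUB, and negating the result maps FMADD to FNMSUB.
#include <cassert>
#include <cmath>

int main() {
  double A = 2.0, B = 3.0, C = 5.0;
  assert(std::fma(-A, B, C) == -(A * B) + C); // FNMADD
  assert(std::fma(A, B, -C) == (A * B) - C);  // FMSUB
  assert(-std::fma(A, B, C) == -(A * B) - C); // FNMSUB (negated result)
}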
53401
53402/// Do target-specific dag combines on floating point negations.
53403static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
53404 TargetLowering::DAGCombinerInfo &DCI,
53405 const X86Subtarget &Subtarget) {
53406 EVT OrigVT = N->getValueType(0);
53407 SDValue Arg = isFNEG(DAG, N);
53408 if (!Arg)
53409 return SDValue();
53410
53411 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53412 EVT VT = Arg.getValueType();
53413 EVT SVT = VT.getScalarType();
53414 SDLoc DL(N);
53415
53416 // Let legalize expand this if it isn't a legal type yet.
53417 if (!TLI.isTypeLegal(VT))
53418 return SDValue();
53419
53420 // If we're negating a FMUL node on a target with FMA, then we can avoid the
53421 // use of a constant by performing (-0 - A*B) instead.
53422 // FIXME: Check rounding control flags as well once it becomes available.
53423 if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
53424 Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
53425 SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
53426 SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
53427 Arg.getOperand(1), Zero);
53428 return DAG.getBitcast(OrigVT, NewNode);
53429 }
53430
53431 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
53432 bool LegalOperations = !DCI.isBeforeLegalizeOps();
53433 if (SDValue NegArg =
53434 TLI.getNegatedExpression(Arg, DAG, LegalOperations, CodeSize))
53435 return DAG.getBitcast(OrigVT, NegArg);
53436
53437 return SDValue();
53438}
53439
53440SDValue X86TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
53441 bool LegalOperations,
53442 bool ForCodeSize,
53443 NegatibleCost &Cost,
53444 unsigned Depth) const {
53445 // fneg patterns are removable even if they have multiple uses.
53446 if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth)) {
53447 Cost = NegatibleCost::Cheaper;
53448 return DAG.getBitcast(Op.getValueType(), Arg);
53449 }
53450
53451 EVT VT = Op.getValueType();
53452 EVT SVT = VT.getScalarType();
53453 unsigned Opc = Op.getOpcode();
53454 SDNodeFlags Flags = Op.getNode()->getFlags();
53455 switch (Opc) {
53456 case ISD::FMA:
53457 case X86ISD::FMSUB:
53458 case X86ISD::FNMADD:
53459 case X86ISD::FNMSUB:
53460 case X86ISD::FMADD_RND:
53461 case X86ISD::FMSUB_RND:
53462 case X86ISD::FNMADD_RND:
53463 case X86ISD::FNMSUB_RND: {
53464 if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) ||
53465 !(SVT == MVT::f32 || SVT == MVT::f64) ||
53466 !isOperationLegal(ISD::FMA, VT))
53467 break;
53468
53469 // Don't fold (fneg (fma (fneg x), y, (fneg z))) to (fma x, y, z)
53470 // if it may have signed zeros.
53471 if (!Flags.hasNoSignedZeros())
53472 break;
53473
53474 // This is always negatible for free but we might be able to remove some
53475 // extra operand negations as well.
53476 SmallVector<SDValue, 4> NewOps(Op.getNumOperands(), SDValue());
53477 for (int i = 0; i != 3; ++i)
53478 NewOps[i] = getCheaperNegatedExpression(
53479 Op.getOperand(i), DAG, LegalOperations, ForCodeSize, Depth + 1);
53480
53481 bool NegA = !!NewOps[0];
53482 bool NegB = !!NewOps[1];
53483 bool NegC = !!NewOps[2];
53484 unsigned NewOpc = negateFMAOpcode(Opc, NegA != NegB, NegC, true);
53485
53486 Cost = (NegA || NegB || NegC) ? NegatibleCost::Cheaper
53487 : NegatibleCost::Neutral;
53488
53489 // Fill in the non-negated ops with the original values.
53490 for (int i = 0, e = Op.getNumOperands(); i != e; ++i)
53491 if (!NewOps[i])
53492 NewOps[i] = Op.getOperand(i);
53493 return DAG.getNode(NewOpc, SDLoc(Op), VT, NewOps);
53494 }
53495 case X86ISD::FRCP:
53496 if (SDValue NegOp0 =
53497 getNegatedExpression(Op.getOperand(0), DAG, LegalOperations,
53498 ForCodeSize, Cost, Depth + 1))
53499 return DAG.getNode(Opc, SDLoc(Op), VT, NegOp0);
53500 break;
53501 }
53502
53503 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
53504 ForCodeSize, Cost, Depth);
53505}
53506
53507static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
53508 const X86Subtarget &Subtarget) {
53509 MVT VT = N->getSimpleValueType(0);
53510 // If we have integer vector types available, use the integer opcodes.
53511 if (!VT.isVector() || !Subtarget.hasSSE2())
53512 return SDValue();
53513
53514 SDLoc dl(N);
53515
53516 unsigned IntBits = VT.getScalarSizeInBits();
53517 MVT IntSVT = MVT::getIntegerVT(IntBits);
53518 MVT IntVT = MVT::getVectorVT(IntSVT, VT.getSizeInBits() / IntBits);
53519
53520 SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
53521 SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
53522 unsigned IntOpcode;
53523 switch (N->getOpcode()) {
53524   default: llvm_unreachable("Unexpected FP logic op");
53525 case X86ISD::FOR: IntOpcode = ISD::OR; break;
53526 case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
53527 case X86ISD::FAND: IntOpcode = ISD::AND; break;
53528 case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
53529 }
53530 SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
53531 return DAG.getBitcast(VT, IntOp);
53532}
53533
53534
53535/// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
53536static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) {
53537 if (N->getOpcode() != ISD::XOR)
53538 return SDValue();
53539
53540 SDValue LHS = N->getOperand(0);
53541 if (!isOneConstant(N->getOperand(1)) || LHS->getOpcode() != X86ISD::SETCC)
53542 return SDValue();
53543
53544 X86::CondCode NewCC = X86::GetOppositeBranchCondition(
53545 X86::CondCode(LHS->getConstantOperandVal(0)));
53546 SDLoc DL(N);
53547 return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
53548}
53549
53550static SDValue combineXorSubCTLZ(SDNode *N, SelectionDAG &DAG,
53551 const X86Subtarget &Subtarget) {
53552 assert((N->getOpcode() == ISD::XOR || N->getOpcode() == ISD::SUB) &&
53553 "Invalid opcode for combing with CTLZ");
53554 if (Subtarget.hasFastLZCNT())
53555 return SDValue();
53556
53557 EVT VT = N->getValueType(0);
53558 if (VT != MVT::i8 && VT != MVT::i16 && VT != MVT::i32 &&
53559 (VT != MVT::i64 || !Subtarget.is64Bit()))
53560 return SDValue();
53561
53562 SDValue N0 = N->getOperand(0);
53563 SDValue N1 = N->getOperand(1);
53564
53565 if (N0.getOpcode() != ISD::CTLZ_ZERO_UNDEF &&
53566 N1.getOpcode() != ISD::CTLZ_ZERO_UNDEF)
53567 return SDValue();
53568
53569 SDValue OpCTLZ;
53570 SDValue OpSizeTM1;
53571
53572 if (N1.getOpcode() == ISD::CTLZ_ZERO_UNDEF) {
53573 OpCTLZ = N1;
53574 OpSizeTM1 = N0;
53575 } else if (N->getOpcode() == ISD::SUB) {
53576 return SDValue();
53577 } else {
53578 OpCTLZ = N0;
53579 OpSizeTM1 = N1;
53580 }
53581
53582 if (!OpCTLZ.hasOneUse())
53583 return SDValue();
53584 auto *C = dyn_cast<ConstantSDNode>(OpSizeTM1);
53585 if (!C)
53586 return SDValue();
53587
53588 if (C->getZExtValue() != uint64_t(OpCTLZ.getValueSizeInBits() - 1))
53589 return SDValue();
53590 SDLoc DL(N);
53591 EVT OpVT = VT;
53592 SDValue Op = OpCTLZ.getOperand(0);
53593 if (VT == MVT::i8) {
53594 // Zero extend to i32 since there is no i8 BSR.
53595 OpVT = MVT::i32;
53596 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, OpVT, Op);
53597 }
53598
53599 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
53600 Op = DAG.getNode(X86ISD::BSR, DL, VTs, Op);
53601 if (VT == MVT::i8)
53602 Op = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Op);
53603
53604 return Op;
53605}
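The identity behind this combine: for X != 0, CTLZ_ZERO_UNDEF(X) lies in [0, BitWidth-1], so XOR-ing it with BitWidth-1 and subtracting it from BitWidth-1 give the same value, namely the index of the highest set bit, which is what X86ISD::BSR produces. A small standalone C++20 check of that equivalence (illustrative only):

#include <bit>
#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t X : {1u, 2u, 0x80u, 0x12345678u, 0xFFFFFFFFu}) {
    unsigned CTLZ = std::countl_zero(X); // ISD::CTLZ_ZERO_UNDEF, X != 0
    unsigned BSR = 31u - CTLZ;           // index of the highest set bit
    // CTLZ fits in the low 5 bits, so XOR with 31 equals 31 - CTLZ.
    assert((CTLZ ^ 31u) == BSR);
    assert((31u - CTLZ) == BSR);
  }
  return 0;
}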
53606
53607static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
53608 TargetLowering::DAGCombinerInfo &DCI,
53609 const X86Subtarget &Subtarget) {
53610 SDValue N0 = N->getOperand(0);
53611 SDValue N1 = N->getOperand(1);
53612 EVT VT = N->getValueType(0);
53613
53614 // If this is SSE1 only convert to FXOR to avoid scalarization.
53615 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
53616 return DAG.getBitcast(MVT::v4i32,
53617 DAG.getNode(X86ISD::FXOR, SDLoc(N), MVT::v4f32,
53618 DAG.getBitcast(MVT::v4f32, N0),
53619 DAG.getBitcast(MVT::v4f32, N1)));
53620 }
53621
53622 if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
53623 return Cmp;
53624
53625 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
53626 return R;
53627
53628 if (SDValue R = combineBitOpWithShift(N, DAG))
53629 return R;
53630
53631 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
53632 return FPLogic;
53633
53634 if (SDValue R = combineXorSubCTLZ(N, DAG, Subtarget))
53635 return R;
53636
53637 if (DCI.isBeforeLegalizeOps())
53638 return SDValue();
53639
53640 if (SDValue SetCC = foldXor1SetCC(N, DAG))
53641 return SetCC;
53642
53643 if (SDValue R = combineOrXorWithSETCC(N, N0, N1, DAG))
53644 return R;
53645
53646 if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
53647 return RV;
53648
53649 // Fold not(iX bitcast(vXi1)) -> (iX bitcast(not(vec))) for legal boolvecs.
53650 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53651 if (llvm::isAllOnesConstant(N1) && N0.getOpcode() == ISD::BITCAST &&
53652 N0.getOperand(0).getValueType().isVector() &&
53653 N0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
53654 TLI.isTypeLegal(N0.getOperand(0).getValueType()) && N0.hasOneUse()) {
53655 return DAG.getBitcast(VT, DAG.getNOT(SDLoc(N), N0.getOperand(0),
53656 N0.getOperand(0).getValueType()));
53657 }
53658
53659 // Handle AVX512 mask widening.
53660 // Fold not(insert_subvector(undef,sub)) -> insert_subvector(undef,not(sub))
53661 if (ISD::isBuildVectorAllOnes(N1.getNode()) && VT.isVector() &&
53662 VT.getVectorElementType() == MVT::i1 &&
53663 N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.getOperand(0).isUndef() &&
53664 TLI.isTypeLegal(N0.getOperand(1).getValueType())) {
53665 return DAG.getNode(
53666 ISD::INSERT_SUBVECTOR, SDLoc(N), VT, N0.getOperand(0),
53667 DAG.getNOT(SDLoc(N), N0.getOperand(1), N0.getOperand(1).getValueType()),
53668 N0.getOperand(2));
53669 }
53670
53671 // Fold xor(zext(xor(x,c1)),c2) -> xor(zext(x),xor(zext(c1),c2))
53672 // Fold xor(truncate(xor(x,c1)),c2) -> xor(truncate(x),xor(truncate(c1),c2))
53673 // TODO: Under what circumstances could this be performed in DAGCombine?
53674 if ((N0.getOpcode() == ISD::TRUNCATE || N0.getOpcode() == ISD::ZERO_EXTEND) &&
53675 N0.getOperand(0).getOpcode() == N->getOpcode()) {
53676 SDValue TruncExtSrc = N0.getOperand(0);
53677 auto *N1C = dyn_cast<ConstantSDNode>(N1);
53678 auto *N001C = dyn_cast<ConstantSDNode>(TruncExtSrc.getOperand(1));
53679 if (N1C && !N1C->isOpaque() && N001C && !N001C->isOpaque()) {
53680 SDLoc DL(N);
53681 SDValue LHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(0), DL, VT);
53682 SDValue RHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(1), DL, VT);
53683 return DAG.getNode(ISD::XOR, DL, VT, LHS,
53684 DAG.getNode(ISD::XOR, DL, VT, RHS, N1));
53685 }
53686 }
53687
53688 if (SDValue R = combineBMILogicOp(N, DAG, Subtarget))
53689 return R;
53690
53691 return combineFneg(N, DAG, DCI, Subtarget);
53692}
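For the xor(zext(xor(x,c1)),c2) fold near the end of combineXor: zero extension distributes over XOR (the extended bits are zero) and XOR is associative, so gathering the constants on one side is value-preserving. An exhaustive i8-to-i32 sketch, with hypothetical constants C1/C2 chosen only for illustration:

#include <cassert>
#include <cstdint>

int main() {
  const uint8_t C1 = 0x5A;
  const uint32_t C2 = 0x00F0F0F0u;
  for (uint32_t X = 0; X < 256; ++X) {
    // xor(zext(xor(x,c1)),c2)
    uint32_t Before = uint32_t(uint8_t(uint8_t(X) ^ C1)) ^ C2;
    // xor(zext(x), xor(zext(c1),c2))
    uint32_t After = uint32_t(uint8_t(X)) ^ (uint32_t(C1) ^ C2);
    assert(Before == After);
  }
  return 0;
}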
53693
53694static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG,
53695 TargetLowering::DAGCombinerInfo &DCI,
53696 const X86Subtarget &Subtarget) {
53697 EVT VT = N->getValueType(0);
53698 unsigned NumBits = VT.getSizeInBits();
53699
53700 // TODO - Constant Folding.
53701
53702 // Simplify the inputs.
53703 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53704 APInt DemandedMask(APInt::getAllOnes(NumBits));
53705 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
53706 return SDValue(N, 0);
53707
53708 return SDValue();
53709}
53710
53711static bool isNullFPScalarOrVectorConst(SDValue V) {
53712 return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
53713}
53714
53715/// If a value is a scalar FP zero or a vector FP zero (potentially including
53716/// undefined elements), return a zero constant that may be used to fold away
53717/// that value. In the case of a vector, the returned constant will not contain
53718/// undefined elements even if the input parameter does. This makes it suitable
53719/// to be used as a replacement operand with operations (eg, bitwise-and) where
53720/// an undef should not propagate.
53721static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
53722 const X86Subtarget &Subtarget) {
53723 if (!isNullFPScalarOrVectorConst(V))
53724 return SDValue();
53725
53726 if (V.getValueType().isVector())
53727 return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
53728
53729 return V;
53730}
53731
53732static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
53733 const X86Subtarget &Subtarget) {
53734 SDValue N0 = N->getOperand(0);
53735 SDValue N1 = N->getOperand(1);
53736 EVT VT = N->getValueType(0);
53737 SDLoc DL(N);
53738
53739 // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
53740 if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
53741 (VT == MVT::f64 && Subtarget.hasSSE2()) ||
53742 (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
53743 return SDValue();
53744
53745 auto isAllOnesConstantFP = [](SDValue V) {
53746 if (V.getSimpleValueType().isVector())
53747 return ISD::isBuildVectorAllOnes(V.getNode());
53748 auto *C = dyn_cast<ConstantFPSDNode>(V);
53749 return C && C->getConstantFPValue()->isAllOnesValue();
53750 };
53751
53752 // fand (fxor X, -1), Y --> fandn X, Y
53753 if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
53754 return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
53755
53756 // fand X, (fxor Y, -1) --> fandn Y, X
53757 if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
53758 return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
53759
53760 return SDValue();
53761}
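Why the FXOR-with-all-ones operand can be absorbed: XOR with an all-ones FP constant is a bitwise NOT of the representation, and X86ISD::FANDN computes (~X) & Y on the bits, so both forms produce identical bit patterns. A bit-level sketch with scalar stand-ins for the lanes:

#include <cassert>
#include <cstdint>

int main() {
  uint32_t XBits = 0x3FC00000u; // some float bit pattern
  uint32_t YBits = 0xBF800000u; // another float bit pattern
  // fand(fxor(X, -1), Y) and fandn(X, Y) compute the same bits.
  assert(((XBits ^ 0xFFFFFFFFu) & YBits) == (~XBits & YBits));
  return 0;
}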
53762
53763/// Do target-specific dag combines on X86ISD::FAND nodes.
53764static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
53765 const X86Subtarget &Subtarget) {
53766 // FAND(0.0, x) -> 0.0
53767 if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
53768 return V;
53769
53770 // FAND(x, 0.0) -> 0.0
53771 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
53772 return V;
53773
53774 if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
53775 return V;
53776
53777 return lowerX86FPLogicOp(N, DAG, Subtarget);
53778}
53779
53780/// Do target-specific dag combines on X86ISD::FANDN nodes.
53781static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
53782 const X86Subtarget &Subtarget) {
53783 // FANDN(0.0, x) -> x
53784 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
53785 return N->getOperand(1);
53786
53787 // FANDN(x, 0.0) -> 0.0
53788 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
53789 return V;
53790
53791 return lowerX86FPLogicOp(N, DAG, Subtarget);
53792}
53793
53794/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
53795static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
53796 TargetLowering::DAGCombinerInfo &DCI,
53797 const X86Subtarget &Subtarget) {
53798 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
53799
53800 // F[X]OR(0.0, x) -> x
53801 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
53802 return N->getOperand(1);
53803
53804 // F[X]OR(x, 0.0) -> x
53805 if (isNullFPScalarOrVectorConst(N->getOperand(1)))
53806 return N->getOperand(0);
53807
53808 if (SDValue NewVal = combineFneg(N, DAG, DCI, Subtarget))
53809 return NewVal;
53810
53811 return lowerX86FPLogicOp(N, DAG, Subtarget);
53812}
53813
53814/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
53815static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
53816 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
53817
53818 // FMIN/FMAX are commutative if no NaNs and no negative zeros are allowed.
53819 if (!DAG.getTarget().Options.NoNaNsFPMath ||
53820 !DAG.getTarget().Options.NoSignedZerosFPMath)
53821 return SDValue();
53822
53823 // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
53824 // into FMINC and FMAXC, which are Commutative operations.
53825 unsigned NewOp = 0;
53826 switch (N->getOpcode()) {
53827 default: llvm_unreachable("unknown opcode");
53828 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
53829 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
53830 }
53831
53832 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
53833 N->getOperand(0), N->getOperand(1));
53834}
53835
53836static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
53837 const X86Subtarget &Subtarget) {
53838 EVT VT = N->getValueType(0);
53839 if (Subtarget.useSoftFloat() || isSoftFP16(VT, Subtarget))
53840 return SDValue();
53841
53842 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53843
53844 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
53845 (Subtarget.hasSSE2() && VT == MVT::f64) ||
53846 (Subtarget.hasFP16() && VT == MVT::f16) ||
53847 (VT.isVector() && TLI.isTypeLegal(VT))))
53848 return SDValue();
53849
53850 SDValue Op0 = N->getOperand(0);
53851 SDValue Op1 = N->getOperand(1);
53852 SDLoc DL(N);
53853 auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
53854
53855 // If we don't have to respect NaN inputs, this is a direct translation to x86
53856 // min/max instructions.
53857 if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
53858 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
53859
53860 // If one of the operands is known non-NaN use the native min/max instructions
53861 // with the non-NaN input as second operand.
53862 if (DAG.isKnownNeverNaN(Op1))
53863 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
53864 if (DAG.isKnownNeverNaN(Op0))
53865 return DAG.getNode(MinMaxOp, DL, VT, Op1, Op0, N->getFlags());
53866
53867 // If we have to respect NaN inputs, this takes at least 3 instructions.
53868 // Favor a library call when operating on a scalar and minimizing code size.
53869 if (!VT.isVector() && DAG.getMachineFunction().getFunction().hasMinSize())
53870 return SDValue();
53871
53872 EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
53873 VT);
53874
53875 // There are 4 possibilities involving NaN inputs, and these are the required
53876 // outputs:
53877 // Op1
53878 // Num NaN
53879 // ----------------
53880 // Num | Max | Op0 |
53881 // Op0 ----------------
53882 // NaN | Op1 | NaN |
53883 // ----------------
53884 //
53885 // The SSE FP max/min instructions were not designed for this case, but rather
53886 // to implement:
53887 // Min = Op1 < Op0 ? Op1 : Op0
53888 // Max = Op1 > Op0 ? Op1 : Op0
53889 //
53890 // So they always return Op0 if either input is a NaN. However, we can still
53891 // use those instructions for fmaxnum by selecting away a NaN input.
53892
53893 // If either operand is NaN, the 2nd source operand (Op0) is passed through.
53894 SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
53895 SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
53896
53897 // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
53898 // are NaN, the NaN value of Op1 is the result.
53899 return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
53900}
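A scalar model of the NaN-handling sequence built above, under the assumption stated in the comments that the SSE max/min returns its second source operand (Op0, since the operands are swapped) whenever either input is NaN; the final select then replaces a NaN Op0 with Op1, reproducing the truth table in the comment. Illustrative C++ only:

#include <cassert>
#include <cmath>

// SSE-style MAX: returns the second source (here Op0) whenever either input
// is NaN, because the compare is false for NaNs.
static double sseMax(double Op1, double Op0) { return Op1 > Op0 ? Op1 : Op0; }

static double fmaxnumLowered(double Op0, double Op1) {
  double MinOrMax = sseMax(Op1, Op0);   // NaN in either input -> Op0
  bool IsOp0Nan = std::isnan(Op0);      // setcc(Op0, Op0, SETUO)
  return IsOp0Nan ? Op1 : MinOrMax;     // select away the NaN input
}

int main() {
  double QNaN = std::nan("");
  assert(fmaxnumLowered(1.0, 2.0) == 2.0);        // Num/Num -> Max
  assert(fmaxnumLowered(QNaN, 2.0) == 2.0);       // Op0 NaN -> Op1
  assert(fmaxnumLowered(1.0, QNaN) == 1.0);       // Op1 NaN -> Op0
  assert(std::isnan(fmaxnumLowered(QNaN, QNaN))); // NaN/NaN -> NaN
  return 0;
}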
53901
53902static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
53903 TargetLowering::DAGCombinerInfo &DCI) {
53904 EVT VT = N->getValueType(0);
53905 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53906
53907 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
53908 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
53909 return SDValue(N, 0);
53910
53911 // Convert a full vector load into vzload when not all bits are needed.
53912 SDValue In = N->getOperand(0);
53913 MVT InVT = In.getSimpleValueType();
53914 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
53915 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
53916 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
53917 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
53918 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
53919 MVT MemVT = MVT::getIntegerVT(NumBits);
53920 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
53921 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
53922 SDLoc dl(N);
53923 SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
53924 DAG.getBitcast(InVT, VZLoad));
53925 DCI.CombineTo(N, Convert);
53926 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
53927 DCI.recursivelyDeleteUnusedNodes(LN);
53928 return SDValue(N, 0);
53929 }
53930 }
53931
53932 return SDValue();
53933}
53934
53935static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
53936 TargetLowering::DAGCombinerInfo &DCI) {
53937 bool IsStrict = N->isTargetStrictFPOpcode();
53938 EVT VT = N->getValueType(0);
53939
53940 // Convert a full vector load into vzload when not all bits are needed.
53941 SDValue In = N->getOperand(IsStrict ? 1 : 0);
53942 MVT InVT = In.getSimpleValueType();
53943 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
53944 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
53945 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
53946 LoadSDNode *LN = cast<LoadSDNode>(In);
53947 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
53948 MVT MemVT = MVT::getFloatingPointVT(NumBits);
53949 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
53950 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
53951 SDLoc dl(N);
53952 if (IsStrict) {
53953 SDValue Convert =
53954 DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
53955 {N->getOperand(0), DAG.getBitcast(InVT, VZLoad)});
53956 DCI.CombineTo(N, Convert, Convert.getValue(1));
53957 } else {
53958 SDValue Convert =
53959 DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(InVT, VZLoad));
53960 DCI.CombineTo(N, Convert);
53961 }
53962 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
53963 DCI.recursivelyDeleteUnusedNodes(LN);
53964 return SDValue(N, 0);
53965 }
53966 }
53967
53968 return SDValue();
53969}
53970
53971/// Do target-specific dag combines on X86ISD::ANDNP nodes.
53972static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
53973 TargetLowering::DAGCombinerInfo &DCI,
53974 const X86Subtarget &Subtarget) {
53975 SDValue N0 = N->getOperand(0);
53976 SDValue N1 = N->getOperand(1);
53977 MVT VT = N->getSimpleValueType(0);
53978 int NumElts = VT.getVectorNumElements();
53979 unsigned EltSizeInBits = VT.getScalarSizeInBits();
53980
53981 // ANDNP(undef, x) -> 0
53982 // ANDNP(x, undef) -> 0
53983 if (N0.isUndef() || N1.isUndef())
53984 return DAG.getConstant(0, SDLoc(N), VT);
53985
53986 // ANDNP(0, x) -> x
53987 if (ISD::isBuildVectorAllZeros(N0.getNode()))
53988 return N1;
53989
53990 // ANDNP(x, 0) -> 0
53991 if (ISD::isBuildVectorAllZeros(N1.getNode()))
53992 return DAG.getConstant(0, SDLoc(N), VT);
53993
53994 // Turn ANDNP back to AND if input is inverted.
53995 if (SDValue Not = IsNOT(N0, DAG))
53996 return DAG.getNode(ISD::AND, SDLoc(N), VT, DAG.getBitcast(VT, Not), N1);
53997
53998 // Constant Folding
53999 APInt Undefs0, Undefs1;
54000 SmallVector<APInt> EltBits0, EltBits1;
54001 if (getTargetConstantBitsFromNode(N0, EltSizeInBits, Undefs0, EltBits0)) {
54002 SDLoc DL(N);
54003 if (getTargetConstantBitsFromNode(N1, EltSizeInBits, Undefs1, EltBits1)) {
54004 SmallVector<APInt> ResultBits;
54005 for (int I = 0; I != NumElts; ++I)
54006 ResultBits.push_back(~EltBits0[I] & EltBits1[I]);
54007 return getConstVector(ResultBits, VT, DAG, DL);
54008 }
54009
54010 // Constant fold NOT(N0) to allow us to use AND.
54011 // Ensure this is only performed if we can confirm that the bitcasted source
54012 // has one use to prevent an infinite loop with canonicalizeBitSelect.
54013 if (N0->hasOneUse()) {
54014 SDValue BC0 = peekThroughOneUseBitcasts(N0);
54015 if (BC0.getOpcode() != ISD::BITCAST) {
54016 for (APInt &Elt : EltBits0)
54017 Elt = ~Elt;
54018 SDValue Not = getConstVector(EltBits0, VT, DAG, DL);
54019 return DAG.getNode(ISD::AND, DL, VT, Not, N1);
54020 }
54021 }
54022 }
54023
54024 // Attempt to recursively combine a bitmask ANDNP with shuffles.
54025 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
54026 SDValue Op(N, 0);
54027 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
54028 return Res;
54029
54030 // If either operand is a constant mask, then only the elements that aren't
54031 // zero are actually demanded by the other operand.
54032 auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
54033 APInt UndefElts;
54034 SmallVector<APInt> EltBits;
54035 APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
54036 APInt DemandedElts = APInt::getAllOnes(NumElts);
54037 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
54038 EltBits)) {
54039 DemandedBits.clearAllBits();
54040 DemandedElts.clearAllBits();
54041 for (int I = 0; I != NumElts; ++I) {
54042 if (UndefElts[I]) {
54043 // We can't assume an undef src element gives an undef dst - the
54044 // other src might be zero.
54045 DemandedBits.setAllBits();
54046 DemandedElts.setBit(I);
54047 } else if ((Invert && !EltBits[I].isAllOnes()) ||
54048 (!Invert && !EltBits[I].isZero())) {
54049 DemandedBits |= Invert ? ~EltBits[I] : EltBits[I];
54050 DemandedElts.setBit(I);
54051 }
54052 }
54053 }
54054 return std::make_pair(DemandedBits, DemandedElts);
54055 };
54056 APInt Bits0, Elts0;
54057 APInt Bits1, Elts1;
54058 std::tie(Bits0, Elts0) = GetDemandedMasks(N1);
54059 std::tie(Bits1, Elts1) = GetDemandedMasks(N0, true);
54060
54061 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54062 if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) ||
54063 TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) ||
54064 TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) ||
54065 TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) {
54066 if (N->getOpcode() != ISD::DELETED_NODE)
54067 DCI.AddToWorklist(N);
54068 return SDValue(N, 0);
54069 }
54070 }
54071
54072 return SDValue();
54073}
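The scalar identities behind the ANDNP folds above (ANDNP(A, B) computes ~A & B per element), shown as a tiny standalone check:

#include <cassert>
#include <cstdint>

int main() {
  uint32_t X = 0x12345678u, Y = 0xDEADBEEFu;
  assert((~0x00000000u & X) == X);  // ANDNP(0, x) -> x
  assert((~X & 0x00000000u) == 0u); // ANDNP(x, 0) -> 0
  assert((~(~X) & Y) == (X & Y));   // ANDNP(NOT(x), y) -> AND(x, y)
  return 0;
}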
54074
54075static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
54076 TargetLowering::DAGCombinerInfo &DCI) {
54077 SDValue N1 = N->getOperand(1);
54078
54079 // BT ignores high bits in the bit index operand.
54080 unsigned BitWidth = N1.getValueSizeInBits();
54081 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
54082 if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(N1, DemandedMask, DCI)) {
54083 if (N->getOpcode() != ISD::DELETED_NODE)
54084 DCI.AddToWorklist(N);
54085 return SDValue(N, 0);
54086 }
54087
54088 return SDValue();
54089}
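combineBT keeps only the low Log2(BitWidth) bits of the bit-index operand because the register form of BT uses the index modulo the operand width, so simplifying away the high index bits cannot change the result. A scalar sketch under that assumption:

#include <cassert>
#include <cstdint>

// Register-form BT on a 32-bit value: the bit offset is taken modulo 32.
static bool bt32(uint32_t Val, uint32_t Idx) {
  return (Val >> (Idx & 31u)) & 1u;
}

int main() {
  uint32_t Val = 0x00000010u; // only bit 4 set
  assert(bt32(Val, 4));
  assert(bt32(Val, 4 + 32));  // high bits of the index are ignored
  assert(!bt32(Val, 5 + 64));
  return 0;
}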
54090
54091static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG,
54092 TargetLowering::DAGCombinerInfo &DCI) {
54093 bool IsStrict = N->getOpcode() == X86ISD::STRICT_CVTPH2PS;
54094 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
54095
54096 if (N->getValueType(0) == MVT::v4f32 && Src.getValueType() == MVT::v8i16) {
54097 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54098 APInt DemandedElts = APInt::getLowBitsSet(8, 4);
54099 if (TLI.SimplifyDemandedVectorElts(Src, DemandedElts, DCI)) {
54100 if (N->getOpcode() != ISD::DELETED_NODE)
54101 DCI.AddToWorklist(N);
54102 return SDValue(N, 0);
54103 }
54104
54105 // Convert a full vector load into vzload when not all bits are needed.
54106 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
54107 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(IsStrict ? 1 : 0));
54108 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::i64, MVT::v2i64, DAG)) {
54109 SDLoc dl(N);
54110 if (IsStrict) {
54111 SDValue Convert = DAG.getNode(
54112 N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
54113 {N->getOperand(0), DAG.getBitcast(MVT::v8i16, VZLoad)});
54114 DCI.CombineTo(N, Convert, Convert.getValue(1));
54115 } else {
54116 SDValue Convert = DAG.getNode(N->getOpcode(), dl, MVT::v4f32,
54117 DAG.getBitcast(MVT::v8i16, VZLoad));
54118 DCI.CombineTo(N, Convert);
54119 }
54120
54121 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
54122 DCI.recursivelyDeleteUnusedNodes(LN);
54123 return SDValue(N, 0);
54124 }
54125 }
54126 }
54127
54128 return SDValue();
54129}
54130
54131// Try to combine sext_in_reg of a cmov of constants by extending the constants.
54132static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
54133 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
54134
54135 EVT DstVT = N->getValueType(0);
54136
54137 SDValue N0 = N->getOperand(0);
54138 SDValue N1 = N->getOperand(1);
54139 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
54140
54141 if (ExtraVT != MVT::i8 && ExtraVT != MVT::i16)
54142 return SDValue();
54143
54144 // Look through single use any_extends / truncs.
54145 SDValue IntermediateBitwidthOp;
54146 if ((N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::TRUNCATE) &&
54147 N0.hasOneUse()) {
54148 IntermediateBitwidthOp = N0;
54149 N0 = N0.getOperand(0);
54150 }
54151
54152 // See if we have a single use cmov.
54153 if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse())
54154 return SDValue();
54155
54156 SDValue CMovOp0 = N0.getOperand(0);
54157 SDValue CMovOp1 = N0.getOperand(1);
54158
54159 // Make sure both operands are constants.
54160 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
54161 !isa<ConstantSDNode>(CMovOp1.getNode()))
54162 return SDValue();
54163
54164 SDLoc DL(N);
54165
54166 // If we looked through an any_extend/trunc above, apply the same extend/trunc to the constants.
54167 if (IntermediateBitwidthOp) {
54168 unsigned IntermediateOpc = IntermediateBitwidthOp.getOpcode();
54169 CMovOp0 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp0);
54170 CMovOp1 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp1);
54171 }
54172
54173 CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp0, N1);
54174 CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp1, N1);
54175
54176 EVT CMovVT = DstVT;
54177 // We do not want i16 CMOV's. Promote to i32 and truncate afterwards.
54178 if (DstVT == MVT::i16) {
54179 CMovVT = MVT::i32;
54180 CMovOp0 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp0);
54181 CMovOp1 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp1);
54182 }
54183
54184 SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, CMovVT, CMovOp0, CMovOp1,
54185 N0.getOperand(2), N0.getOperand(3));
54186
54187 if (CMovVT != DstVT)
54188 CMov = DAG.getNode(ISD::TRUNCATE, DL, DstVT, CMov);
54189
54190 return CMov;
54191}
54192
54193static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
54194 const X86Subtarget &Subtarget) {
54195 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
54196
54197 if (SDValue V = combineSextInRegCmov(N, DAG))
54198 return V;
54199
54200 EVT VT = N->getValueType(0);
54201 SDValue N0 = N->getOperand(0);
54202 SDValue N1 = N->getOperand(1);
54203 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
54204 SDLoc dl(N);
54205
54206 // The SIGN_EXTEND_INREG to v4i64 is an expensive operation on
54207 // both SSE and AVX2 since there is no sign-extended shift right
54208 // operation on a vector with 64-bit elements.
54209 //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
54210 // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
54211 if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
54212 N0.getOpcode() == ISD::SIGN_EXTEND)) {
54213 SDValue N00 = N0.getOperand(0);
54214
54215 // EXTLOAD has a better solution on AVX2,
54216 // it may be replaced with X86ISD::VSEXT node.
54217 if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
54218 if (!ISD::isNormalLoad(N00.getNode()))
54219 return SDValue();
54220
54221 // Attempt to promote any comparison mask ops before moving the
54222 // SIGN_EXTEND_INREG in the way.
54223 if (SDValue Promote = PromoteMaskArithmetic(N0.getNode(), DAG, Subtarget))
54224 return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Promote, N1);
54225
54226 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
54227 SDValue Tmp =
54228 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, N00, N1);
54229 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
54230 }
54231 }
54232 return SDValue();
54233}
54234
54235/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
54236/// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
54237/// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
54238/// opportunities to combine math ops, use an LEA, or use a complex addressing
54239/// mode. This can eliminate extend, add, and shift instructions.
54240static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
54241 const X86Subtarget &Subtarget) {
54242 if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
54243 Ext->getOpcode() != ISD::ZERO_EXTEND)
54244 return SDValue();
54245
54246 // TODO: This should be valid for other integer types.
54247 EVT VT = Ext->getValueType(0);
54248 if (VT != MVT::i64)
54249 return SDValue();
54250
54251 SDValue Add = Ext->getOperand(0);
54252 if (Add.getOpcode() != ISD::ADD)
54253 return SDValue();
54254
54255 bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
54256 bool NSW = Add->getFlags().hasNoSignedWrap();
54257 bool NUW = Add->getFlags().hasNoUnsignedWrap();
54258
54259 // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
54260 // into the 'zext'
54261 if ((Sext && !NSW) || (!Sext && !NUW))
54262 return SDValue();
54263
54264 // Having a constant operand to the 'add' ensures that we are not increasing
54265 // the instruction count because the constant is extended for free below.
54266 // A constant operand can also become the displacement field of an LEA.
54267 auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
54268 if (!AddOp1)
54269 return SDValue();
54270
54271 // Don't make the 'add' bigger if there's no hope of combining it with some
54272 // other 'add' or 'shl' instruction.
54273 // TODO: It may be profitable to generate simpler LEA instructions in place
54274 // of single 'add' instructions, but the cost model for selecting an LEA
54275 // currently has a high threshold.
54276 bool HasLEAPotential = false;
54277 for (auto *User : Ext->uses()) {
54278 if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
54279 HasLEAPotential = true;
54280 break;
54281 }
54282 }
54283 if (!HasLEAPotential)
54284 return SDValue();
54285
54286 // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
54287 int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();
54288 SDValue AddOp0 = Add.getOperand(0);
54289 SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
54290 SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);
54291
54292 // The wider add is guaranteed to not wrap because both operands are
54293 // sign-extended.
54294 SDNodeFlags Flags;
54295 Flags.setNoSignedWrap(NSW);
54296 Flags.setNoUnsignedWrap(NUW);
54297 return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
54298}
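The correctness argument for hoisting the extend: with 'nsw' on the narrow add, sign-extending after the add equals adding the sign-extended operands (and likewise 'nuw' for zext), so the constant can be extended for free. A minimal check of the sext/nsw case:

#include <cassert>
#include <cstdint>

int main() {
  // i32 add known not to wrap (nsw), then sign-extended to i64.
  int32_t X = 1000, C = -7;
  int64_t SextOfAdd = int64_t(int32_t(X + C)); // sext(add nsw(x, C))
  int64_t AddOfSext = int64_t(X) + int64_t(C); // add(sext(x), sext(C))
  assert(SextOfAdd == AddOfSext);
  return 0;
}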
54299
54300// If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant
54301// operands and the result of CMOV is not used anywhere else - promote CMOV
54302// itself instead of promoting its result. This could be beneficial, because:
54303// 1) X86TargetLowering::EmitLoweredSelect later can do merging of two
54304// (or more) pseudo-CMOVs only when they go one-after-another and
54305// getting rid of result extension code after CMOV will help that.
54306// 2) Promotion of constant CMOV arguments is free, hence the
54307// {ANY,SIGN,ZERO}_EXTEND will just be deleted.
54308// 3) 16-bit CMOV encoding is 4 bytes, 32-bit CMOV is 3-byte, so this
54309// promotion is also good in terms of code-size.
54310// (64-bit CMOV is 4-bytes, that's why we don't do 32-bit => 64-bit
54311// promotion).
54312static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
54313 SDValue CMovN = Extend->getOperand(0);
54314 if (CMovN.getOpcode() != X86ISD::CMOV || !CMovN.hasOneUse())
54315 return SDValue();
54316
54317 EVT TargetVT = Extend->getValueType(0);
54318 unsigned ExtendOpcode = Extend->getOpcode();
54319 SDLoc DL(Extend);
54320
54321 EVT VT = CMovN.getValueType();
54322 SDValue CMovOp0 = CMovN.getOperand(0);
54323 SDValue CMovOp1 = CMovN.getOperand(1);
54324
54325 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
54326 !isa<ConstantSDNode>(CMovOp1.getNode()))
54327 return SDValue();
54328
54329 // Only extend to i32 or i64.
54330 if (TargetVT != MVT::i32 && TargetVT != MVT::i64)
54331 return SDValue();
54332
54333 // Only extend from i16 unless it's a sign_extend from i32. Zext/aext from i32
54334 // are free.
54335 if (VT != MVT::i16 && !(ExtendOpcode == ISD::SIGN_EXTEND && VT == MVT::i32))
54336 return SDValue();
54337
54338 // If this is a zero extend to i64, we should only extend to i32 and use a free
54339 // zero extend to finish.
54340 EVT ExtendVT = TargetVT;
54341 if (TargetVT == MVT::i64 && ExtendOpcode != ISD::SIGN_EXTEND)
54342 ExtendVT = MVT::i32;
54343
54344 CMovOp0 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp0);
54345 CMovOp1 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp1);
54346
54347 SDValue Res = DAG.getNode(X86ISD::CMOV, DL, ExtendVT, CMovOp0, CMovOp1,
54348 CMovN.getOperand(2), CMovN.getOperand(3));
54349
54350 // Finish extending if needed.
54351 if (ExtendVT != TargetVT)
54352 Res = DAG.getNode(ExtendOpcode, DL, TargetVT, Res);
54353
54354 return Res;
54355}
54356
54357// Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm
54358// result type.
54359static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
54360 const X86Subtarget &Subtarget) {
54361 SDValue N0 = N->getOperand(0);
54362 EVT VT = N->getValueType(0);
54363 SDLoc dl(N);
54364
54365 // Only do this combine with AVX512 for vector extends.
54366 if (!Subtarget.hasAVX512() || !VT.isVector() || N0.getOpcode() != ISD::SETCC)
54367 return SDValue();
54368
54369 // Only combine legal element types.
54370 EVT SVT = VT.getVectorElementType();
54371 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 &&
54372 SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64)
54373 return SDValue();
54374
54375 // We don't have a CMPP instruction for vXf16.
54376 if (N0.getOperand(0).getValueType().getVectorElementType() == MVT::f16)
54377 return SDValue();
54378 // We can only do this if the vector size is 256 bits or less.
54379 unsigned Size = VT.getSizeInBits();
54380 if (Size > 256 && Subtarget.useAVX512Regs())
54381 return SDValue();
54382
54383 // Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since
54384 // those are the only integer compares we have.
54385 ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
54386 if (ISD::isUnsignedIntSetCC(CC))
54387 return SDValue();
54388
54389 // Only do this combine if the extension will be fully consumed by the setcc.
54390 EVT N00VT = N0.getOperand(0).getValueType();
54391 EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
54392 if (Size != MatchingVecType.getSizeInBits())
54393 return SDValue();
54394
54395 SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
54396
54397 if (N->getOpcode() == ISD::ZERO_EXTEND)
54398 Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType());
54399
54400 return Res;
54401}
54402
54403static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
54404 TargetLowering::DAGCombinerInfo &DCI,
54405 const X86Subtarget &Subtarget) {
54406 SDValue N0 = N->getOperand(0);
54407 EVT VT = N->getValueType(0);
54408 SDLoc DL(N);
54409
54410 // (i32 (sext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
54411 if (!DCI.isBeforeLegalizeOps() &&
54412 N0.getOpcode() == X86ISD::SETCC_CARRY) {
54413 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, N0->getOperand(0),
54414 N0->getOperand(1));
54415 bool ReplaceOtherUses = !N0.hasOneUse();
54416 DCI.CombineTo(N, Setcc);
54417 // Replace other uses with a truncate of the widened setcc_carry.
54418 if (ReplaceOtherUses) {
54419 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
54420 N0.getValueType(), Setcc);
54421 DCI.CombineTo(N0.getNode(), Trunc);
54422 }
54423
54424 return SDValue(N, 0);
54425 }
54426
54427 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
54428 return NewCMov;
54429
54430 if (!DCI.isBeforeLegalizeOps())
54431 return SDValue();
54432
54433 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
54434 return V;
54435
54436 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), DL, VT, N0,
54437 DAG, DCI, Subtarget))
54438 return V;
54439
54440 if (VT.isVector()) {
54441 if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
54442 return R;
54443
54444 if (N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG)
54445 return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0));
54446 }
54447
54448 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
54449 return NewAdd;
54450
54451 return SDValue();
54452}
54453
54454static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
54455 TargetLowering::DAGCombinerInfo &DCI,
54456 const X86Subtarget &Subtarget) {
54457 SDLoc dl(N);
54458 EVT VT = N->getValueType(0);
54459 bool IsStrict = N->isStrictFPOpcode() || N->isTargetStrictFPOpcode();
54460
54461 // Let legalize expand this if it isn't a legal type yet.
54462 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54463 if (!TLI.isTypeLegal(VT))
54464 return SDValue();
54465
54466 SDValue A = N->getOperand(IsStrict ? 1 : 0);
54467 SDValue B = N->getOperand(IsStrict ? 2 : 1);
54468 SDValue C = N->getOperand(IsStrict ? 3 : 2);
54469
54470 // If the operation allows fast-math and the target does not support FMA,
54471 // split this into mul+add to avoid libcall(s).
54472 SDNodeFlags Flags = N->getFlags();
54473 if (!IsStrict && Flags.hasAllowReassociation() &&
54474 TLI.isOperationExpand(ISD::FMA, VT)) {
54475 SDValue Fmul = DAG.getNode(ISD::FMUL, dl, VT, A, B, Flags);
54476 return DAG.getNode(ISD::FADD, dl, VT, Fmul, C, Flags);
54477 }
54478
54479 EVT ScalarVT = VT.getScalarType();
54480 if (((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
54481 !Subtarget.hasAnyFMA()) &&
54482 !(ScalarVT == MVT::f16 && Subtarget.hasFP16()))
54483 return SDValue();
54484
54485 auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) {
54486 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
54487 bool LegalOperations = !DCI.isBeforeLegalizeOps();
54488 if (SDValue NegV = TLI.getCheaperNegatedExpression(V, DAG, LegalOperations,
54489 CodeSize)) {
54490 V = NegV;
54491 return true;
54492 }
54493 // Look through extract_vector_elts. If it comes from an FNEG, create a
54494 // new extract from the FNEG input.
54495 if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
54496 isNullConstant(V.getOperand(1))) {
54497 SDValue Vec = V.getOperand(0);
54498 if (SDValue NegV = TLI.getCheaperNegatedExpression(
54499 Vec, DAG, LegalOperations, CodeSize)) {
54500 V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(),
54501 NegV, V.getOperand(1));
54502 return true;
54503 }
54504 }
54505
54506 return false;
54507 };
54508
54509 // Do not convert the passthru input of scalar intrinsics.
54510 // FIXME: We could allow negations of the lower element only.
54511 bool NegA = invertIfNegative(A);
54512 bool NegB = invertIfNegative(B);
54513 bool NegC = invertIfNegative(C);
54514
54515 if (!NegA && !NegB && !NegC)
54516 return SDValue();
54517
54518 unsigned NewOpcode =
54519 negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false);
54520
54521 // Propagate fast-math-flags to new FMA node.
54522 SelectionDAG::FlagInserter FlagsInserter(DAG, Flags);
54523 if (IsStrict) {
54524 assert(N->getNumOperands() == 4 && "Shouldn't be greater than 4");
54525 return DAG.getNode(NewOpcode, dl, {VT, MVT::Other},
54526 {N->getOperand(0), A, B, C});
54527 } else {
54528 if (N->getNumOperands() == 4)
54529 return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
54530 return DAG.getNode(NewOpcode, dl, VT, A, B, C);
54531 }
54532}
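Why NegA != NegB is what drives the opcode change: negating exactly one multiplicand negates the product, while negating both cancels out; the addend's sign is tracked separately by NegC. A quick std::fma check with values chosen so every intermediate is exact (illustrative only):

#include <cassert>
#include <cmath>

int main() {
  double A = 1.5, B = -2.25, C = 4.0; // all products/sums exact in double
  assert(std::fma(-A, B, C) == std::fma(A, -B, C)); // one negation: FNMADD form
  assert(std::fma(-A, -B, C) == std::fma(A, B, C)); // two negations cancel: FMADD
  assert(std::fma(A, B, -C) == A * B - C);          // negated addend: FMSUB form
  return 0;
}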
54533
54534// Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
54535// Combine FMSUBADD(A, B, FNEG(C)) -> FMADDSUB(A, B, C)
54536static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
54537 TargetLowering::DAGCombinerInfo &DCI) {
54538 SDLoc dl(N);
54539 EVT VT = N->getValueType(0);
54540 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54541 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
54542 bool LegalOperations = !DCI.isBeforeLegalizeOps();
54543
54544 SDValue N2 = N->getOperand(2);
54545
54546 SDValue NegN2 =
54547 TLI.getCheaperNegatedExpression(N2, DAG, LegalOperations, CodeSize);
54548 if (!NegN2)
54549 return SDValue();
54550 unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), false, true, false);
54551
54552 if (N->getNumOperands() == 4)
54553 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
54554 NegN2, N->getOperand(3));
54555 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
54556 NegN2);
54557}
54558
54559static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
54560 TargetLowering::DAGCombinerInfo &DCI,
54561 const X86Subtarget &Subtarget) {
54562 SDLoc dl(N);
54563 SDValue N0 = N->getOperand(0);
54564 EVT VT = N->getValueType(0);
54565
54566 // (i32 (aext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
54567 // FIXME: Is this needed? We don't seem to have any tests for it.
54568 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ANY_EXTEND &&
54569 N0.getOpcode() == X86ISD::SETCC_CARRY) {
54570 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N0->getOperand(0),
54571 N0->getOperand(1));
54572 bool ReplaceOtherUses = !N0.hasOneUse();
54573 DCI.CombineTo(N, Setcc);
54574 // Replace other uses with a truncate of the widened setcc_carry.
54575 if (ReplaceOtherUses) {
54576 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
54577 N0.getValueType(), Setcc);
54578 DCI.CombineTo(N0.getNode(), Trunc);
54579 }
54580
54581 return SDValue(N, 0);
54582 }
54583
54584 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
54585 return NewCMov;
54586
54587 if (DCI.isBeforeLegalizeOps())
54588 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
54589 return V;
54590
54591 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), dl, VT, N0,
54592 DAG, DCI, Subtarget))
54593 return V;
54594
54595 if (VT.isVector())
54596 if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
54597 return R;
54598
54599 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
54600 return NewAdd;
54601
54602 if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
54603 return R;
54604
54605 // TODO: Combine with any target/faux shuffle.
54606 if (N0.getOpcode() == X86ISD::PACKUS && N0.getValueSizeInBits() == 128 &&
54607 VT.getScalarSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits()) {
54608 SDValue N00 = N0.getOperand(0);
54609 SDValue N01 = N0.getOperand(1);
54610 unsigned NumSrcEltBits = N00.getScalarValueSizeInBits();
54611 APInt ZeroMask = APInt::getHighBitsSet(NumSrcEltBits, NumSrcEltBits / 2);
54612 if ((N00.isUndef() || DAG.MaskedValueIsZero(N00, ZeroMask)) &&
54613 (N01.isUndef() || DAG.MaskedValueIsZero(N01, ZeroMask))) {
54614 return concatSubVectors(N00, N01, DAG, dl);
54615 }
54616 }
54617
54618 return SDValue();
54619}
54620
54621/// If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just
54622/// pre-promote its result type since vXi1 vectors don't get promoted
54623/// during type legalization.
54624static SDValue truncateAVX512SetCCNoBWI(EVT VT, EVT OpVT, SDValue LHS,
54625 SDValue RHS, ISD::CondCode CC,
54626 const SDLoc &DL, SelectionDAG &DAG,
54627 const X86Subtarget &Subtarget) {
54628 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
54629 VT.getVectorElementType() == MVT::i1 &&
54630 (OpVT.getVectorElementType() == MVT::i8 ||
54631 OpVT.getVectorElementType() == MVT::i16)) {
54632 SDValue Setcc = DAG.getSetCC(DL, OpVT, LHS, RHS, CC);
54633 return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
54634 }
54635 return SDValue();
54636}
54637
54638static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
54639 TargetLowering::DAGCombinerInfo &DCI,
54640 const X86Subtarget &Subtarget) {
54641 const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
54642 const SDValue LHS = N->getOperand(0);
54643 const SDValue RHS = N->getOperand(1);
54644 EVT VT = N->getValueType(0);
54645 EVT OpVT = LHS.getValueType();
54646 SDLoc DL(N);
54647
54648 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
54649 if (SDValue V = combineVectorSizedSetCCEquality(VT, LHS, RHS, CC, DL, DAG,
54650 Subtarget))
54651 return V;
54652
54653 if (VT == MVT::i1) {
54654 X86::CondCode X86CC;
54655 if (SDValue V =
54656 MatchVectorAllEqualTest(LHS, RHS, CC, DL, Subtarget, DAG, X86CC))
54657 return DAG.getNode(ISD::TRUNCATE, DL, VT, getSETCC(X86CC, V, DL, DAG));
54658 }
54659
54660 if (OpVT.isScalarInteger()) {
54661 // cmpeq(or(X,Y),X) --> cmpeq(and(~X,Y),0)
54662 // cmpne(or(X,Y),X) --> cmpne(and(~X,Y),0)
54663 auto MatchOrCmpEq = [&](SDValue N0, SDValue N1) {
54664 if (N0.getOpcode() == ISD::OR && N0->hasOneUse()) {
54665 if (N0.getOperand(0) == N1)
54666 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
54667 N0.getOperand(1));
54668 if (N0.getOperand(1) == N1)
54669 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
54670 N0.getOperand(0));
54671 }
54672 return SDValue();
54673 };
54674 if (SDValue AndN = MatchOrCmpEq(LHS, RHS))
54675 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
54676 if (SDValue AndN = MatchOrCmpEq(RHS, LHS))
54677 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
54678
54679 // cmpeq(and(X,Y),Y) --> cmpeq(and(~X,Y),0)
54680 // cmpne(and(X,Y),Y) --> cmpne(and(~X,Y),0)
54681 auto MatchAndCmpEq = [&](SDValue N0, SDValue N1) {
54682 if (N0.getOpcode() == ISD::AND && N0->hasOneUse()) {
54683 if (N0.getOperand(0) == N1)
54684 return DAG.getNode(ISD::AND, DL, OpVT, N1,
54685 DAG.getNOT(DL, N0.getOperand(1), OpVT));
54686 if (N0.getOperand(1) == N1)
54687 return DAG.getNode(ISD::AND, DL, OpVT, N1,
54688 DAG.getNOT(DL, N0.getOperand(0), OpVT));
54689 }
54690 return SDValue();
54691 };
54692 if (SDValue AndN = MatchAndCmpEq(LHS, RHS))
54693 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
54694 if (SDValue AndN = MatchAndCmpEq(RHS, LHS))
54695 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
54696
54697 // cmpeq(trunc(x),C) --> cmpeq(x,C)
54698 // cmpne(trunc(x),C) --> cmpne(x,C)
54699 // iff x upper bits are zero.
54700 if (LHS.getOpcode() == ISD::TRUNCATE &&
54701 LHS.getOperand(0).getScalarValueSizeInBits() >= 32 &&
54702 isa<ConstantSDNode>(RHS) && !DCI.isBeforeLegalize()) {
54703 EVT SrcVT = LHS.getOperand(0).getValueType();
54704 APInt UpperBits = APInt::getBitsSetFrom(SrcVT.getScalarSizeInBits(),
54705 OpVT.getScalarSizeInBits());
54706 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54707 auto *C = cast<ConstantSDNode>(RHS);
54708 if (DAG.MaskedValueIsZero(LHS.getOperand(0), UpperBits) &&
54709 TLI.isTypeLegal(LHS.getOperand(0).getValueType()))
54710 return DAG.getSetCC(DL, VT, LHS.getOperand(0),
54711 DAG.getConstant(C->getAPIntValue().zextOrTrunc(
54712 SrcVT.getScalarSizeInBits()),
54713 DL, SrcVT),
54714 CC);
54715 }
54716
54717 // With C as a power of 2 and C != 0 and C != INT_MIN:
54718 // icmp eq Abs(X) C ->
54719 // (icmp eq A, C) | (icmp eq A, -C)
54720 // icmp ne Abs(X) C ->
54721 // (icmp ne A, C) & (icmp ne A, -C)
54722 // Both of these patterns can be better optimized in
54723 // DAGCombiner::foldAndOrOfSETCC. Note this only applies for scalar
54724 // integers which is checked above.
54725 if (LHS.getOpcode() == ISD::ABS && LHS.hasOneUse()) {
54726 if (auto *C = dyn_cast<ConstantSDNode>(RHS)) {
54727 const APInt &CInt = C->getAPIntValue();
54728 // We can better optimize this case in DAGCombiner::foldAndOrOfSETCC.
54729 if (CInt.isPowerOf2() && !CInt.isMinSignedValue()) {
54730 SDValue BaseOp = LHS.getOperand(0);
54731 SDValue SETCC0 = DAG.getSetCC(DL, VT, BaseOp, RHS, CC);
54732 SDValue SETCC1 = DAG.getSetCC(
54733 DL, VT, BaseOp, DAG.getConstant(-CInt, DL, OpVT), CC);
54734 return DAG.getNode(CC == ISD::SETEQ ? ISD::OR : ISD::AND, DL, VT,
54735 SETCC0, SETCC1);
54736 }
54737 }
54738 }
54739 }
54740 }
54741
54742 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
54743 (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
54744 // Using temporaries to avoid messing up operand ordering for later
54745 // transformations if this doesn't work.
54746 SDValue Op0 = LHS;
54747 SDValue Op1 = RHS;
54748 ISD::CondCode TmpCC = CC;
54749 // Put build_vector on the right.
54750 if (Op0.getOpcode() == ISD::BUILD_VECTOR) {
54751 std::swap(Op0, Op1);
54752 TmpCC = ISD::getSetCCSwappedOperands(TmpCC);
54753 }
54754
54755 bool IsSEXT0 =
54756 (Op0.getOpcode() == ISD::SIGN_EXTEND) &&
54757 (Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
54758 bool IsVZero1 = ISD::isBuildVectorAllZeros(Op1.getNode());
54759
54760 if (IsSEXT0 && IsVZero1) {
54761 assert(VT == Op0.getOperand(0).getValueType() &&
54762 "Unexpected operand type");
54763 if (TmpCC == ISD::SETGT)
54764 return DAG.getConstant(0, DL, VT);
54765 if (TmpCC == ISD::SETLE)
54766 return DAG.getConstant(1, DL, VT);
54767 if (TmpCC == ISD::SETEQ || TmpCC == ISD::SETGE)
54768 return DAG.getNOT(DL, Op0.getOperand(0), VT);
54769
54770 assert((TmpCC == ISD::SETNE || TmpCC == ISD::SETLT) &&
54771 "Unexpected condition code!");
54772 return Op0.getOperand(0);
54773 }
54774 }
54775
54776 // Try to make an unsigned vector comparison signed. On pre-AVX512 targets there
54777 // are only signed comparisons (`PCMPGT`), and on AVX512 it's often better to
54778 // use `PCMPGT` if the result is meant to stay in a vector (if it's going to
54779 // a mask, there are unsigned AVX512 comparisons).
54780 if (VT.isVector() && OpVT.isVector() && OpVT.isInteger()) {
54781 bool CanMakeSigned = false;
54782 if (ISD::isUnsignedIntSetCC(CC)) {
54783 KnownBits CmpKnown = KnownBits::commonBits(DAG.computeKnownBits(LHS),
54784 DAG.computeKnownBits(RHS));
54785 // If we know LHS/RHS share the same sign bit at each element we can
54786 // make this signed.
54787 // NOTE: `computeKnownBits` on a vector type aggregates common bits
54788 // across all lanes. So a pattern where the sign varies from lane to
54789 // lane, but at each lane Sign(LHS) is known to equal Sign(RHS), will be
54790 // missed. We could get around this by demanding each lane
54791 // independently, but this isn't the most important optimization and
54792 // that may eat into compile time.
54793 CanMakeSigned =
54794 CmpKnown.Zero.isSignBitSet() || CmpKnown.One.isSignBitSet();
54795 }
54796 if (CanMakeSigned || ISD::isSignedIntSetCC(CC)) {
54797 SDValue LHSOut = LHS;
54798 SDValue RHSOut = RHS;
54799 ISD::CondCode NewCC = CC;
54800 switch (CC) {
54801 case ISD::SETGE:
54802 case ISD::SETUGE:
54803 if (SDValue NewLHS = incDecVectorConstant(LHS, DAG, /*IsInc*/ true,
54804 /*NSW*/ true))
54805 LHSOut = NewLHS;
54806 else if (SDValue NewRHS = incDecVectorConstant(
54807 RHS, DAG, /*IsInc*/ false, /*NSW*/ true))
54808 RHSOut = NewRHS;
54809 else
54810 break;
54811
54812 [[fallthrough]];
54813 case ISD::SETUGT:
54814 NewCC = ISD::SETGT;
54815 break;
54816
54817 case ISD::SETLE:
54818 case ISD::SETULE:
54819 if (SDValue NewLHS = incDecVectorConstant(LHS, DAG, /*IsInc*/ false,
54820 /*NSW*/ true))
54821 LHSOut = NewLHS;
54822 else if (SDValue NewRHS = incDecVectorConstant(RHS, DAG, /*IsInc*/ true,
54823 /*NSW*/ true))
54824 RHSOut = NewRHS;
54825 else
54826 break;
54827
54828 [[fallthrough]];
54829 case ISD::SETULT:
54830 // Will be swapped to SETGT in LowerVSETCC*.
54831 NewCC = ISD::SETLT;
54832 break;
54833 default:
54834 break;
54835 }
54836 if (NewCC != CC) {
54837 if (SDValue R = truncateAVX512SetCCNoBWI(VT, OpVT, LHSOut, RHSOut,
54838 NewCC, DL, DAG, Subtarget))
54839 return R;
54840 return DAG.getSetCC(DL, VT, LHSOut, RHSOut, NewCC);
54841 }
54842 }
54843 }
54844
54845 if (SDValue R =
54846 truncateAVX512SetCCNoBWI(VT, OpVT, LHS, RHS, CC, DL, DAG, Subtarget))
54847 return R;
54848
54849 // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
54850 // to avoid scalarization via legalization because v4i32 is not a legal type.
54851 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
54852 LHS.getValueType() == MVT::v4f32)
54853 return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
54854
54855 // X pred 0.0 --> X pred -X
54856 // If the negation of X already exists, use it in the comparison. This removes
54857 // the need to materialize 0.0 and allows matching to SSE's MIN/MAX
54858 // instructions in patterns with a 'select' node.
54859 if (isNullFPScalarOrVectorConst(RHS)) {
54860 SDVTList FNegVT = DAG.getVTList(OpVT);
54861 if (SDNode *FNeg = DAG.getNodeIfExists(ISD::FNEG, FNegVT, {LHS}))
54862 return DAG.getSetCC(DL, VT, LHS, SDValue(FNeg, 0), CC);
54863 }
54864
54865 return SDValue();
54866}
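The abs(X) rewrite above rests on a simple identity: for a positive C (the power-of-two and non-INT_MIN checks only gate when the rewrite is worthwhile and safe), abs(X) == C is equivalent to X == C || X == -C, and the SETNE form is its De Morgan dual. A brute-force scalar check:

#include <cassert>
#include <cstdlib>

int main() {
  const int C = 8; // a positive power of two, as the combine requires
  for (int X = -20; X <= 20; ++X) {
    assert((std::abs(X) == C) == (X == C || X == -C)); // SETEQ form
    assert((std::abs(X) != C) == (X != C && X != -C)); // SETNE form
  }
  return 0;
}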
54867
54868static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
54869 TargetLowering::DAGCombinerInfo &DCI,
54870 const X86Subtarget &Subtarget) {
54871 SDValue Src = N->getOperand(0);
54872 MVT SrcVT = Src.getSimpleValueType();
54873 MVT VT = N->getSimpleValueType(0);
54874 unsigned NumBits = VT.getScalarSizeInBits();
54875 unsigned NumElts = SrcVT.getVectorNumElements();
54876 unsigned NumBitsPerElt = SrcVT.getScalarSizeInBits();
54877 assert(VT == MVT::i32 && NumElts <= NumBits && "Unexpected MOVMSK types");
54878
54879 // Perform constant folding.
54880 APInt UndefElts;
54881 SmallVector<APInt, 32> EltBits;
54882 if (getTargetConstantBitsFromNode(Src, NumBitsPerElt, UndefElts, EltBits)) {
54883 APInt Imm(32, 0);
54884 for (unsigned Idx = 0; Idx != NumElts; ++Idx)
54885 if (!UndefElts[Idx] && EltBits[Idx].isNegative())
54886 Imm.setBit(Idx);
54887
54888 return DAG.getConstant(Imm, SDLoc(N), VT);
54889 }
54890
54891 // Look through int->fp bitcasts that don't change the element width.
54892 unsigned EltWidth = SrcVT.getScalarSizeInBits();
54893 if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::BITCAST &&
54894 Src.getOperand(0).getScalarValueSizeInBits() == EltWidth)
54895 return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0));
54896
54897 // Fold movmsk(not(x)) -> not(movmsk(x)) to improve folding of movmsk results
54898 // with scalar comparisons.
54899 if (SDValue NotSrc = IsNOT(Src, DAG)) {
54900 SDLoc DL(N);
54901 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
54902 NotSrc = DAG.getBitcast(SrcVT, NotSrc);
54903 return DAG.getNode(ISD::XOR, DL, VT,
54904 DAG.getNode(X86ISD::MOVMSK, DL, VT, NotSrc),
54905 DAG.getConstant(NotMask, DL, VT));
54906 }
54907
54908 // Fold movmsk(icmp_sgt(x,-1)) -> not(movmsk(x)) to improve folding of movmsk
54909 // results with scalar comparisons.
54910 if (Src.getOpcode() == X86ISD::PCMPGT &&
54911 ISD::isBuildVectorAllOnes(Src.getOperand(1).getNode())) {
54912 SDLoc DL(N);
54913 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
54914 return DAG.getNode(ISD::XOR, DL, VT,
54915 DAG.getNode(X86ISD::MOVMSK, DL, VT, Src.getOperand(0)),
54916 DAG.getConstant(NotMask, DL, VT));
54917 }
54918
54919 // Fold movmsk(icmp_eq(and(x,c1),c1)) -> movmsk(shl(x,c2))
54920 // Fold movmsk(icmp_eq(and(x,c1),0)) -> movmsk(not(shl(x,c2)))
54921 // iff pow2splat(c1).
54922 // Use KnownBits to determine if only a single bit is non-zero
54923 // in each element (pow2 or zero), and shift that bit to the msb.
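  // Sketch of the idea (illustrative constants): with c1 = splat(0x10) on
  // v4i32, and(x, c1) is known to be 0 or 0x10 in each lane, so shifting left
  // by 27 (the shared leading-zero count) moves that single bit into the sign
  // bit, where MOVMSK can read the compare result.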
54924 if (Src.getOpcode() == X86ISD::PCMPEQ) {
54925 KnownBits KnownLHS = DAG.computeKnownBits(Src.getOperand(0));
54926 KnownBits KnownRHS = DAG.computeKnownBits(Src.getOperand(1));
54927 unsigned ShiftAmt = KnownLHS.countMinLeadingZeros();
54928 if (KnownLHS.countMaxPopulation() == 1 &&
54929 (KnownRHS.isZero() || (KnownRHS.countMaxPopulation() == 1 &&
54930 ShiftAmt == KnownRHS.countMinLeadingZeros()))) {
54931 SDLoc DL(N);
54932 MVT ShiftVT = SrcVT;
54933 SDValue ShiftLHS = Src.getOperand(0);
54934 SDValue ShiftRHS = Src.getOperand(1);
54935 if (ShiftVT.getScalarType() == MVT::i8) {
54936 // vXi8 shifts - we only care about the signbit so can use PSLLW.
54937 ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
54938 ShiftLHS = DAG.getBitcast(ShiftVT, ShiftLHS);
54939 ShiftRHS = DAG.getBitcast(ShiftVT, ShiftRHS);
54940 }
54941 ShiftLHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT,
54942 ShiftLHS, ShiftAmt, DAG);
54943 ShiftRHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT,
54944 ShiftRHS, ShiftAmt, DAG);
54945 ShiftLHS = DAG.getBitcast(SrcVT, ShiftLHS);
54946 ShiftRHS = DAG.getBitcast(SrcVT, ShiftRHS);
54947 SDValue Res = DAG.getNode(ISD::XOR, DL, SrcVT, ShiftLHS, ShiftRHS);
54948 return DAG.getNode(X86ISD::MOVMSK, DL, VT, DAG.getNOT(DL, Res, SrcVT));
54949 }
54950 }
54951
54952 // Fold movmsk(logic(X,C)) -> logic(movmsk(X),C)
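  // Rough illustration: movmsk(and(X, C)) can become and(movmsk(X), M), where
  // bit i of M is set iff the sign bit of constant element C[i] is set.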
54953 if (N->isOnlyUserOf(Src.getNode())) {
54954 SDValue SrcBC = peekThroughOneUseBitcasts(Src);
54955 if (ISD::isBitwiseLogicOp(SrcBC.getOpcode())) {
54956 APInt UndefElts;
54957 SmallVector<APInt, 32> EltBits;
54958 if (getTargetConstantBitsFromNode(SrcBC.getOperand(1), NumBitsPerElt,
54959 UndefElts, EltBits)) {
54960 APInt Mask = APInt::getZero(NumBits);
54961 for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
54962 if (!UndefElts[Idx] && EltBits[Idx].isNegative())
54963 Mask.setBit(Idx);
54964 }
54965 SDLoc DL(N);
54966 SDValue NewSrc = DAG.getBitcast(SrcVT, SrcBC.getOperand(0));
54967 SDValue NewMovMsk = DAG.getNode(X86ISD::MOVMSK, DL, VT, NewSrc);
54968 return DAG.getNode(SrcBC.getOpcode(), DL, VT, NewMovMsk,
54969 DAG.getConstant(Mask, DL, VT));
54970 }
54971 }
54972 }
54973
54974 // Simplify the inputs.
54975 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54976 APInt DemandedMask(APInt::getAllOnes(NumBits));
54977 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
54978 return SDValue(N, 0);
54979
54980 return SDValue();
54981}
54982
54983static SDValue combineTESTP(SDNode *N, SelectionDAG &DAG,
54984 TargetLowering::DAGCombinerInfo &DCI,
54985 const X86Subtarget &Subtarget) {
54986 MVT VT = N->getSimpleValueType(0);
54987 unsigned NumBits = VT.getScalarSizeInBits();
54988
54989 // Simplify the inputs.
54990 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54991 APInt DemandedMask(APInt::getAllOnes(NumBits));
54992 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
54993 return SDValue(N, 0);
54994
54995 return SDValue();
54996}
54997
54998static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG,
54999 TargetLowering::DAGCombinerInfo &DCI,
55000 const X86Subtarget &Subtarget) {
55001 auto *MemOp = cast<X86MaskedGatherScatterSDNode>(N);
55002 SDValue BasePtr = MemOp->getBasePtr();
55003 SDValue Index = MemOp->getIndex();
55004 SDValue Scale = MemOp->getScale();
55005 SDValue Mask = MemOp->getMask();
55006
55007 // Attempt to fold an index scale into the scale value directly.
55008 // For smaller indices, implicit sext is performed BEFORE scale, preventing
55009 // this fold under most circumstances.
55010 // TODO: Move this into X86DAGToDAGISel::matchVectorAddressRecursively?
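  // Rough illustration (values assumed): a gather with index = vshli(I, 1) and
  // scale = 4 can instead use index = I with scale = 8, as long as the new
  // scale is still a power of two no larger than 8.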
55011 if ((Index.getOpcode() == X86ISD::VSHLI ||
55012 (Index.getOpcode() == ISD::ADD &&
55013 Index.getOperand(0) == Index.getOperand(1))) &&
55014 isa<ConstantSDNode>(Scale) &&
55015 BasePtr.getScalarValueSizeInBits() == Index.getScalarValueSizeInBits()) {
55016 unsigned ShiftAmt =
55017 Index.getOpcode() == ISD::ADD ? 1 : Index.getConstantOperandVal(1);
55018 uint64_t ScaleAmt = cast<ConstantSDNode>(Scale)->getZExtValue();
55019 uint64_t NewScaleAmt = ScaleAmt * (1ULL << ShiftAmt);
55020 if (isPowerOf2_64(NewScaleAmt) && NewScaleAmt <= 8) {
55021 SDValue NewIndex = Index.getOperand(0);
55022 SDValue NewScale =
55023 DAG.getTargetConstant(NewScaleAmt, SDLoc(N), Scale.getValueType());
55024 if (N->getOpcode() == X86ISD::MGATHER)
55025 return getAVX2GatherNode(N->getOpcode(), SDValue(N, 0), DAG,
55026 MemOp->getOperand(1), Mask,
55027 MemOp->getBasePtr(), NewIndex, NewScale,
55028 MemOp->getChain(), Subtarget);
55029 if (N->getOpcode() == X86ISD::MSCATTER)
55030 return getScatterNode(N->getOpcode(), SDValue(N, 0), DAG,
55031 MemOp->getOperand(1), Mask, MemOp->getBasePtr(),
55032 NewIndex, NewScale, MemOp->getChain(), Subtarget);
55033 }
55034 }
55035
55036 // With vector masks we only demand the upper bit of the mask.
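  // E.g. for a vXi32 mask only the sign bit of each element is consumed, so
  // only that bit is marked as demanded below.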
55037 if (Mask.getScalarValueSizeInBits() != 1) {
55038 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55039 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
55040 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
55041 if (N->getOpcode() != ISD::DELETED_NODE)
55042 DCI.AddToWorklist(N);
55043 return SDValue(N, 0);
55044 }
55045 }
55046
55047 return SDValue();
55048}
55049
55050static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS,
55051 SDValue Index, SDValue Base, SDValue Scale,
55052 SelectionDAG &DAG) {
55053 SDLoc DL(GorS);
55054
55055 if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
55056 SDValue Ops[] = { Gather->getChain(), Gather->getPassThru(),
55057 Gather->getMask(), Base, Index, Scale } ;
55058 return DAG.getMaskedGather(Gather->getVTList(),
55059 Gather->getMemoryVT(), DL, Ops,
55060 Gather->getMemOperand(),
55061 Gather->getIndexType(),
55062 Gather->getExtensionType());
55063 }
55064 auto *Scatter = cast<MaskedScatterSDNode>(GorS);
55065 SDValue Ops[] = { Scatter->getChain(), Scatter->getValue(),
55066 Scatter->getMask(), Base, Index, Scale };
55067 return DAG.getMaskedScatter(Scatter->getVTList(),
55068 Scatter->getMemoryVT(), DL,
55069 Ops, Scatter->getMemOperand(),
55070 Scatter->getIndexType(),
55071 Scatter->isTruncatingStore());
55072}
55073
55074static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
55075 TargetLowering::DAGCombinerInfo &DCI) {
55076 SDLoc DL(N);
55077 auto *GorS = cast<MaskedGatherScatterSDNode>(N);
55078 SDValue Index = GorS->getIndex();
55079 SDValue Base = GorS->getBasePtr();
55080 SDValue Scale = GorS->getScale();
55081
55082 if (DCI.isBeforeLegalize()) {
55083 unsigned IndexWidth = Index.getScalarValueSizeInBits();
55084
55085 // Shrink constant indices if they are larger than 32-bits.
55086 // Only do this before legalize types since v2i64 could become v2i32.
55087 // FIXME: We could check that the type is legal if we're after legalize
55088 // types, but then we would need to construct test cases where that happens.
55089 // FIXME: We could support more than just constant vectors, but we need to
55090 // be careful with costing. A truncate that can be optimized out would be fine.
55091 // Otherwise we might only want to create a truncate if it avoids a split.
55092 if (auto *BV = dyn_cast<BuildVectorSDNode>(Index)) {
55093 if (BV->isConstant() && IndexWidth > 32 &&
55094 DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
55095 EVT NewVT = Index.getValueType().changeVectorElementType(MVT::i32);
55096 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
55097 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
55098 }
55099 }
55100
55101 // Shrink any sign/zero extend whose source is 32 bits or narrower and whose
55102 // result is wider than 32 bits, provided there are sufficient sign bits.
55103 // Only do this before legalize types to avoid creating illegal types in truncate.
55104 if ((Index.getOpcode() == ISD::SIGN_EXTEND ||
55105 Index.getOpcode() == ISD::ZERO_EXTEND) &&
55106 IndexWidth > 32 &&
55107 Index.getOperand(0).getScalarValueSizeInBits() <= 32 &&
55108 DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
55109 EVT NewVT = Index.getValueType().changeVectorElementType(MVT::i32);
55110 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
55111 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
55112 }
55113 }
55114
55115 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55116 EVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
55117 // Try to move splat constant adders from the index operand to the base
55118 // pointer operand, taking care to multiply by the scale. We can only do
55119 // this when the index element type is the same as the pointer type.
55120 // Otherwise we need to be sure the math doesn't wrap before the scale.
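  // Rough illustration (constants assumed): index = add(I, splat(16)) with
  // scale = 4 becomes index = I with the base pointer increased by 64.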
55121 if (Index.getOpcode() == ISD::ADD &&
55122 Index.getValueType().getVectorElementType() == PtrVT &&
55123 isa<ConstantSDNode>(Scale)) {
55124 uint64_t ScaleAmt = cast<ConstantSDNode>(Scale)->getZExtValue();
55125 if (auto *BV = dyn_cast<BuildVectorSDNode>(Index.getOperand(1))) {
55126 BitVector UndefElts;
55127 if (ConstantSDNode *C = BV->getConstantSplatNode(&UndefElts)) {
55128 // FIXME: Allow non-constant?
55129 if (UndefElts.none()) {
55130 // Apply the scale.
55131 APInt Adder = C->getAPIntValue() * ScaleAmt;
55132 // Add it to the existing base.
55133 Base = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
55134 DAG.getConstant(Adder, DL, PtrVT));
55135 Index = Index.getOperand(0);
55136 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
55137 }
55138 }
55139
55140 // It's also possible base is just a constant. In that case, just
55141 // replace it with 0 and move the displacement into the index.
55142 if (BV->isConstant() && isa<ConstantSDNode>(Base) &&
55143 isOneConstant(Scale)) {
55144 SDValue Splat = DAG.getSplatBuildVector(Index.getValueType(), DL, Base);
55145 // Combine the constant build_vector and the constant base.
55146 Splat = DAG.getNode(ISD::ADD, DL, Index.getValueType(),
55147 Index.getOperand(1), Splat);
55148 // Add to the LHS of the original Index add.
55149 Index = DAG.getNode(ISD::ADD, DL, Index.getValueType(),
55150 Index.getOperand(0), Splat);
55151 Base = DAG.getConstant(0, DL, Base.getValueType());
55152 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
55153 }
55154 }
55155 }
55156
55157 if (DCI.isBeforeLegalizeOps()) {
55158 unsigned IndexWidth = Index.getScalarValueSizeInBits();
55159
55160 // Make sure the index is either i32 or i64
55161 if (IndexWidth != 32 && IndexWidth != 64) {
55162 MVT EltVT = IndexWidth > 32 ? MVT::i64 : MVT::i32;
55163 EVT IndexVT = Index.getValueType().changeVectorElementType(EltVT);
55164 Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
55165 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
55166 }
55167 }
55168
55169 // With vector masks we only demand the upper bit of the mask.
55170 SDValue Mask = GorS->getMask();
55171 if (Mask.getScalarValueSizeInBits() != 1) {
55172 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55173 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
55174 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
55175 if (N->getOpcode() != ISD::DELETED_NODE)
55176 DCI.AddToWorklist(N);
55177 return SDValue(N, 0);
55178 }
55179 }
55180
55181 return SDValue();
55182}
55183
55184// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
55185static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
55186 const X86Subtarget &Subtarget) {
55187 SDLoc DL(N);
55188 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
55189 SDValue EFLAGS = N->getOperand(1);
55190
55191 // Try to simplify the EFLAGS and condition code operands.
55192 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))
55193 return getSETCC(CC, Flags, DL, DAG);
55194
55195 return SDValue();
55196}
55197
55198/// Optimize branch condition evaluation.
55199static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
55200 const X86Subtarget &Subtarget) {
55201 SDLoc DL(N);
55202 SDValue EFLAGS = N->getOperand(3);
55203 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
55204
55205 // Try to simplify the EFLAGS and condition code operands.
55206 // Make sure to not keep references to operands, as combineSetCCEFLAGS can
55207 // RAUW them under us.
55208 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
55209 SDValue Cond = DAG.getTargetConstant(CC, DL, MVT::i8);
55210 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
55211 N->getOperand(1), Cond, Flags);
55212 }
55213
55214 return SDValue();
55215}
55216
55217// TODO: Could we move this to DAGCombine?
55218static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
55219 SelectionDAG &DAG) {
55220 // Take advantage of vector comparisons (etc.) producing 0 or -1 in each lane
55221 // to optimize away operation when it's from a constant.
55222 //
55223 // The general transformation is:
55224 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
55225 // AND(VECTOR_CMP(x,y), constant2)
55226 // constant2 = UNARYOP(constant)
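  // Rough illustration (sitofp, constants assumed):
  // sitofp(and(vector_cmp(x,y), <4,4,4,4>)) can become
  // bitcast(and(vector_cmp(x,y), bitcast(<4.0,...>))): each compare lane is
  // all-ones or all-zero, so the AND yields either the bit pattern of 4.0 or
  // that of +0.0.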
55227
55228 // Early exit if this isn't a vector operation, the operand of the
55229 // unary operation isn't a bitwise AND, or if the sizes of the operations
55230 // aren't the same.
55231 EVT VT = N->getValueType(0);
55232 bool IsStrict = N->isStrictFPOpcode();
55233 unsigned NumEltBits = VT.getScalarSizeInBits();
55234 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
55235 if (!VT.isVector() || Op0.getOpcode() != ISD::AND ||
55236 DAG.ComputeNumSignBits(Op0.getOperand(0)) != NumEltBits ||
55237 VT.getSizeInBits() != Op0.getValueSizeInBits())
55238 return SDValue();
55239
55240 // Now check that the other operand of the AND is a constant. We could
55241 // make the transformation for non-constant splats as well, but it's unclear
55242 // that would be a benefit as it would not eliminate any operations, just
55243 // perform one more step in scalar code before moving to the vector unit.
55244 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op0.getOperand(1))) {
55245 // Bail out if the vector isn't a constant.
55246 if (!BV->isConstant())
55247 return SDValue();
55248
55249 // Everything checks out. Build up the new and improved node.
55250 SDLoc DL(N);
55251 EVT IntVT = BV->getValueType(0);
55252 // Create a new constant of the appropriate type for the transformed
55253 // DAG.
55254 SDValue SourceConst;
55255 if (IsStrict)
55256 SourceConst = DAG.getNode(N->getOpcode(), DL, {VT, MVT::Other},
55257 {N->getOperand(0), SDValue(BV, 0)});
55258 else
55259 SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
55260 // The AND node needs bitcasts to/from an integer vector type around it.
55261 SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
55262 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Op0->getOperand(0),
55263 MaskConst);
55264 SDValue Res = DAG.getBitcast(VT, NewAnd);
55265 if (IsStrict)
55266 return DAG.getMergeValues({Res, SourceConst.getValue(1)}, DL);
55267 return Res;
55268 }
55269
55270 return SDValue();
55271}
55272
55273/// If we are converting a value to floating-point, try to replace scalar
55274/// truncate of an extracted vector element with a bitcast. This tries to keep
55275/// the sequence on XMM registers rather than moving between vector and GPRs.
55276static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG) {
55277 // TODO: This is currently only used by combineSIntToFP, but it is generalized
55278 // to allow being called by any similar cast opcode.
55279 // TODO: Consider merging this into lowering: vectorizeExtractedCast().
55280 SDValue Trunc = N->getOperand(0);
55281 if (!Trunc.hasOneUse() || Trunc.getOpcode() != ISD::TRUNCATE)
55282 return SDValue();
55283
55284 SDValue ExtElt = Trunc.getOperand(0);
55285 if (!ExtElt.hasOneUse() || ExtElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
55286 !isNullConstant(ExtElt.getOperand(1)))
55287 return SDValue();
55288
55289 EVT TruncVT = Trunc.getValueType();
55290 EVT SrcVT = ExtElt.getValueType();
55291 unsigned DestWidth = TruncVT.getSizeInBits();
55292 unsigned SrcWidth = SrcVT.getSizeInBits();
55293 if (SrcWidth % DestWidth != 0)
55294 return SDValue();
55295
55296 // inttofp (trunc (extelt X, 0)) --> inttofp (extelt (bitcast X), 0)
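  // Rough illustration (types assumed): trunc i64->i32 of (extelt v2i64 X, 0)
  // becomes (extelt (bitcast X to v4i32), 0), keeping the value in an XMM lane.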
55297 EVT SrcVecVT = ExtElt.getOperand(0).getValueType();
55298 unsigned VecWidth = SrcVecVT.getSizeInBits();
55299 unsigned NumElts = VecWidth / DestWidth;
55300 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, NumElts);
55301 SDValue BitcastVec = DAG.getBitcast(BitcastVT, ExtElt.getOperand(0));
55302 SDLoc DL(N);
55303 SDValue NewExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TruncVT,
55304 BitcastVec, ExtElt.getOperand(1));
55305 return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), NewExtElt);
55306}
55307
55308static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
55309 const X86Subtarget &Subtarget) {
55310 bool IsStrict = N->isStrictFPOpcode();
55311 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
55312 EVT VT = N->getValueType(0);
55313 EVT InVT = Op0.getValueType();
55314
55315 // UINT_TO_FP(vXi1~15) -> UINT_TO_FP(ZEXT(vXi1~15 to vXi16))
55316 // UINT_TO_FP(vXi17~31) -> UINT_TO_FP(ZEXT(vXi17~31 to vXi32))
55317 // UINT_TO_FP(vXi33~63) -> UINT_TO_FP(ZEXT(vXi33~63 to vXi64))
55318 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
55319 unsigned ScalarSize = InVT.getScalarSizeInBits();
55320 if (ScalarSize == 16 || ScalarSize == 32 || ScalarSize >= 64)
55321 return SDValue();
55322 SDLoc dl(N);
55323 EVT DstVT = EVT::getVectorVT(*DAG.getContext(),
55324 ScalarSize < 16 ? MVT::i16
55325 : ScalarSize < 32 ? MVT::i32
55326 : MVT::i64,
55327 InVT.getVectorNumElements());
55328 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
55329 if (IsStrict)
55330 return DAG.getNode(ISD::STRICT_UINT_TO_FP, dl, {VT, MVT::Other},
55331 {N->getOperand(0), P});
55332 return DAG.getNode(ISD::UINT_TO_FP, dl, VT, P);
55333 }
55334
55335 // UINT_TO_FP(vXi1) -> SINT_TO_FP(ZEXT(vXi1 to vXi32))
55336 // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
55337 // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
55338 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
55339 VT.getScalarType() != MVT::f16) {
55340 SDLoc dl(N);
55341 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
55342 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
55343
55344 // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
55345 if (IsStrict)
55346 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
55347 {N->getOperand(0), P});
55348 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
55349 }
55350
55351 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
55352 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
55353 // the optimization here.
55354 if (DAG.SignBitIsZero(Op0)) {
55355 if (IsStrict)
55356 return DAG.getNode(ISD::STRICT_SINT_TO_FP, SDLoc(N), {VT, MVT::Other},
55357 {N->getOperand(0), Op0});
55358 return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
55359 }
55360
55361 return SDValue();
55362}
55363
55364static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
55365 TargetLowering::DAGCombinerInfo &DCI,
55366 const X86Subtarget &Subtarget) {
55367 // First try to optimize away the conversion entirely when it's
55368 // conditionally from a constant. Vectors only.
55369 bool IsStrict = N->isStrictFPOpcode();
55370 if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
55371 return Res;
55372
55373 // Now move on to more general possibilities.
55374 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
55375 EVT VT = N->getValueType(0);
55376 EVT InVT = Op0.getValueType();
55377
55378 // SINT_TO_FP(vXi1~15) -> SINT_TO_FP(SEXT(vXi1~15 to vXi16))
55379 // SINT_TO_FP(vXi17~31) -> SINT_TO_FP(SEXT(vXi17~31 to vXi32))
55380 // SINT_TO_FP(vXi33~63) -> SINT_TO_FP(SEXT(vXi33~63 to vXi64))
55381 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
55382 unsigned ScalarSize = InVT.getScalarSizeInBits();
55383 if (ScalarSize == 16 || ScalarSize == 32 || ScalarSize >= 64)
55384 return SDValue();
55385 SDLoc dl(N);
55386 EVT DstVT = EVT::getVectorVT(*DAG.getContext(),
55387 ScalarSize < 16 ? MVT::i16
55388 : ScalarSize < 32 ? MVT::i32
55389 : MVT::i64,
55390 InVT.getVectorNumElements());
55391 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
55392 if (IsStrict)
55393 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
55394 {N->getOperand(0), P});
55395 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
55396 }
55397
55398 // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
55399 // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
55400 // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
55401 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
55402 VT.getScalarType() != MVT::f16) {
55403 SDLoc dl(N);
55404 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
55405 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
55406 if (IsStrict)
55407 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
55408 {N->getOperand(0), P});
55409 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
55410 }
55411
55412 // Without AVX512DQ we only support i64 to float scalar conversion. For both
55413 // vectors and scalars, see if we know that the upper bits are all the sign
55414 // bit, in which case we can truncate the input to i32 and convert from that.
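  // Rough illustration (types assumed): sint_to_fp(sext i16 X to i64) has at
  // least 49 sign bits, so it can be converted as sint_to_fp(trunc to i32).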
55415 if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
55416 unsigned BitWidth = InVT.getScalarSizeInBits();
55417 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
55418 if (NumSignBits >= (BitWidth - 31)) {
55419 EVT TruncVT = MVT::i32;
55420 if (InVT.isVector())
55421 TruncVT = InVT.changeVectorElementType(TruncVT);
55422 SDLoc dl(N);
55423 if (DCI.isBeforeLegalize() || TruncVT != MVT::v2i32) {
55424 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
55425 if (IsStrict)
55426 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
55427 {N->getOperand(0), Trunc});
55428 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
55429 }
55430 // If we're after legalize and the type is v2i32 we need to shuffle and
55431 // use CVTSI2P.
55432      assert(InVT == MVT::v2i64 && "Unexpected VT!");
55433 SDValue Cast = DAG.getBitcast(MVT::v4i32, Op0);
55434 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast,
55435 { 0, 2, -1, -1 });
55436 if (IsStrict)
55437 return DAG.getNode(X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
55438 {N->getOperand(0), Shuf});
55439 return DAG.getNode(X86ISD::CVTSI2P, dl, VT, Shuf);
55440 }
55441 }
55442
55443 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
55444 // a 32-bit target where SSE doesn't support i64->FP operations.
55445 if (!Subtarget.useSoftFloat() && Subtarget.hasX87() &&
55446 Op0.getOpcode() == ISD::LOAD) {
55447 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
55448
55449 // This transformation is not supported if the result type is f16 or f128.
55450 if (VT == MVT::f16 || VT == MVT::f128)
55451 return SDValue();
55452
55453 // If we have AVX512DQ we can use packed conversion instructions unless
55454 // the VT is f80.
55455 if (Subtarget.hasDQI() && VT != MVT::f80)
55456 return SDValue();
55457
55458 if (Ld->isSimple() && !VT.isVector() && ISD::isNormalLoad(Op0.getNode()) &&
55459 Op0.hasOneUse() && !Subtarget.is64Bit() && InVT == MVT::i64) {
55460 std::pair<SDValue, SDValue> Tmp =
55461 Subtarget.getTargetLowering()->BuildFILD(
55462 VT, InVT, SDLoc(N), Ld->getChain(), Ld->getBasePtr(),
55463 Ld->getPointerInfo(), Ld->getOriginalAlign(), DAG);
55464 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Tmp.second);
55465 return Tmp.first;
55466 }
55467 }
55468
55469 if (IsStrict)
55470 return SDValue();
55471
55472 if (SDValue V = combineToFPTruncExtElt(N, DAG))
55473 return V;
55474
55475 return SDValue();
55476}
55477
55478static bool needCarryOrOverflowFlag(SDValue Flags) {
55479  assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
55480
55481 for (const SDNode *User : Flags->uses()) {
55482 X86::CondCode CC;
55483 switch (User->getOpcode()) {
55484 default:
55485 // Be conservative.
55486 return true;
55487 case X86ISD::SETCC:
55488 case X86ISD::SETCC_CARRY:
55489 CC = (X86::CondCode)User->getConstantOperandVal(0);
55490 break;
55491 case X86ISD::BRCOND:
55492 case X86ISD::CMOV:
55493 CC = (X86::CondCode)User->getConstantOperandVal(2);
55494 break;
55495 }
55496
55497 switch (CC) {
55498 default: break;
55499 case X86::COND_A: case X86::COND_AE:
55500 case X86::COND_B: case X86::COND_BE:
55501 case X86::COND_O: case X86::COND_NO:
55502 case X86::COND_G: case X86::COND_GE:
55503 case X86::COND_L: case X86::COND_LE:
55504 return true;
55505 }
55506 }
55507
55508 return false;
55509}
55510
55511static bool onlyZeroFlagUsed(SDValue Flags) {
55512  assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
55513
55514 for (const SDNode *User : Flags->uses()) {
55515 unsigned CCOpNo;
55516 switch (User->getOpcode()) {
55517 default:
55518 // Be conservative.
55519 return false;
55520 case X86ISD::SETCC:
55521 case X86ISD::SETCC_CARRY:
55522 CCOpNo = 0;
55523 break;
55524 case X86ISD::BRCOND:
55525 case X86ISD::CMOV:
55526 CCOpNo = 2;
55527 break;
55528 }
55529
55530 X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
55531 if (CC != X86::COND_E && CC != X86::COND_NE)
55532 return false;
55533 }
55534
55535 return true;
55536}
55537
55538static SDValue combineCMP(SDNode *N, SelectionDAG &DAG) {
55539 // Only handle test patterns.
55540 if (!isNullConstant(N->getOperand(1)))
55541 return SDValue();
55542
55543 // If we have a CMP of a truncated binop, see if we can make a smaller binop
55544 // and use its flags directly.
55545 // TODO: Maybe we should try promoting compares that only use the zero flag
55546 // first if we can prove the upper bits with computeKnownBits?
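  // Rough illustration (types assumed): cmp(trunc i64->i32 (xor A, B), 0) can
  // instead use the EFLAGS result of a 32-bit X86ISD::XOR of truncated A and B.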
55547 SDLoc dl(N);
55548 SDValue Op = N->getOperand(0);
55549 EVT VT = Op.getValueType();
55550
55551 // If we have a constant logical shift that's only used in a comparison
55552 // against zero turn it into an equivalent AND. This allows turning it into
55553 // a TEST instruction later.
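  // Rough illustration (i32 assumed): (srl X, 8) compared against zero is the
  // same as testing (and X, 0xFFFFFF00) against zero.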
55554 if ((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) &&
55555 Op.hasOneUse() && isa<ConstantSDNode>(Op.getOperand(1)) &&
55556 onlyZeroFlagUsed(SDValue(N, 0))) {
55557 unsigned BitWidth = VT.getSizeInBits();
55558 const APInt &ShAmt = Op.getConstantOperandAPInt(1);
55559 if (ShAmt.ult(BitWidth)) { // Avoid undefined shifts.
55560 unsigned MaskBits = BitWidth - ShAmt.getZExtValue();
55561 APInt Mask = Op.getOpcode() == ISD::SRL
55562 ? APInt::getHighBitsSet(BitWidth, MaskBits)
55563 : APInt::getLowBitsSet(BitWidth, MaskBits);
55564 if (Mask.isSignedIntN(32)) {
55565 Op = DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0),
55566 DAG.getConstant(Mask, dl, VT));
55567 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
55568 DAG.getConstant(0, dl, VT));
55569 }
55570 }
55571 }
55572
55573 // Peek through any zero-extend if we're only testing for a zero result.
55574 if (Op.getOpcode() == ISD::ZERO_EXTEND && onlyZeroFlagUsed(SDValue(N, 0))) {
55575 SDValue Src = Op.getOperand(0);
55576 EVT SrcVT = Src.getValueType();
55577 if (SrcVT.getScalarSizeInBits() >= 8 &&
55578 DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
55579 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Src,
55580 DAG.getConstant(0, dl, SrcVT));
55581 }
55582
55583 // Look for a truncate.
55584 if (Op.getOpcode() != ISD::TRUNCATE)
55585 return SDValue();
55586
55587 SDValue Trunc = Op;
55588 Op = Op.getOperand(0);
55589
55590 // See if we can compare with zero against the truncation source,
55591 // which should help using the Z flag from many ops. Only do this for
55592 // i32 truncated op to prevent partial-reg compares of promoted ops.
55593 EVT OpVT = Op.getValueType();
55594 APInt UpperBits =
55595 APInt::getBitsSetFrom(OpVT.getSizeInBits(), VT.getSizeInBits());
55596 if (OpVT == MVT::i32 && DAG.MaskedValueIsZero(Op, UpperBits) &&
55597 onlyZeroFlagUsed(SDValue(N, 0))) {
55598 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
55599 DAG.getConstant(0, dl, OpVT));
55600 }
55601
55602 // After this the truncate and arithmetic op must have a single use.
55603 if (!Trunc.hasOneUse() || !Op.hasOneUse())
55604 return SDValue();
55605
55606 unsigned NewOpc;
55607 switch (Op.getOpcode()) {
55608 default: return SDValue();
55609 case ISD::AND:
55610 // Skip AND with a constant. We have special handling for AND with an
55611 // immediate during isel to generate TEST instructions.
55612 if (isa<ConstantSDNode>(Op.getOperand(1)))
55613 return SDValue();
55614 NewOpc = X86ISD::AND;
55615 break;
55616 case ISD::OR: NewOpc = X86ISD::OR; break;
55617 case ISD::XOR: NewOpc = X86ISD::XOR; break;
55618 case ISD::ADD:
55619 // If the carry or overflow flag is used, we can't truncate.
55620 if (needCarryOrOverflowFlag(SDValue(N, 0)))
55621 return SDValue();
55622 NewOpc = X86ISD::ADD;
55623 break;
55624 case ISD::SUB:
55625 // If the carry or overflow flag is used, we can't truncate.
55626 if (needCarryOrOverflowFlag(SDValue(N, 0)))
55627 return SDValue();
55628 NewOpc = X86ISD::SUB;
55629 break;
55630 }
55631
55632 // We found an op we can narrow. Truncate its inputs.
55633 SDValue Op0 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(0));
55634 SDValue Op1 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(1));
55635
55636 // Use a X86 specific opcode to avoid DAG combine messing with it.
55637 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
55638 Op = DAG.getNode(NewOpc, dl, VTs, Op0, Op1);
55639
55640 // For AND, keep a CMP so that we can match the test pattern.
55641 if (NewOpc == X86ISD::AND)
55642 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
55643 DAG.getConstant(0, dl, VT));
55644
55645 // Return the flags.
55646 return Op.getValue(1);
55647}
55648
55649static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG,
55650 TargetLowering::DAGCombinerInfo &DCI) {
55651  assert((X86ISD::ADD == N->getOpcode() || X86ISD::SUB == N->getOpcode()) &&
55652         "Expected X86ISD::ADD or X86ISD::SUB");
55653
55654 SDLoc DL(N);
55655 SDValue LHS = N->getOperand(0);
55656 SDValue RHS = N->getOperand(1);
55657 MVT VT = LHS.getSimpleValueType();
55658 bool IsSub = X86ISD::SUB == N->getOpcode();
55659 unsigned GenericOpc = IsSub ? ISD::SUB : ISD::ADD;
55660
55661 // If we don't use the flag result, simplify back to a generic ADD/SUB.
55662 if (!N->hasAnyUseOfValue(1)) {
55663 SDValue Res = DAG.getNode(GenericOpc, DL, VT, LHS, RHS);
55664 return DAG.getMergeValues({Res, DAG.getConstant(0, DL, MVT::i32)}, DL);
55665 }
55666
55667 // Fold any similar generic ADD/SUB opcodes to reuse this node.
55668 auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) {
55669 SDValue Ops[] = {N0, N1};
55670 SDVTList VTs = DAG.getVTList(N->getValueType(0));
55671 if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) {
55672 SDValue Op(N, 0);
55673 if (Negate)
55674 Op = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Op);
55675 DCI.CombineTo(GenericAddSub, Op);
55676 }
55677 };
55678 MatchGeneric(LHS, RHS, false);
55679 MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode());
55680
55681 // TODO: Can we drop the ZeroSecondOpOnly limit? This is to guarantee that the
55682 // EFLAGS result doesn't change.
55683 return combineAddOrSubToADCOrSBB(IsSub, DL, VT, LHS, RHS, DAG,
55684 /*ZeroSecondOpOnly*/ true);
55685}
55686
55687static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
55688 SDValue LHS = N->getOperand(0);
55689 SDValue RHS = N->getOperand(1);
55690 SDValue BorrowIn = N->getOperand(2);
55691
55692 if (SDValue Flags = combineCarryThroughADD(BorrowIn, DAG)) {
55693 MVT VT = N->getSimpleValueType(0);
55694 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
55695 return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs, LHS, RHS, Flags);
55696 }
55697
55698 // Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry)
55699 // iff the flag result is dead.
55700 if (LHS.getOpcode() == ISD::SUB && isNullConstant(RHS) &&
55701 !N->hasAnyUseOfValue(1))
55702 return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), LHS.getOperand(0),
55703 LHS.getOperand(1), BorrowIn);
55704
55705 return SDValue();
55706}
55707
55708// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
55709static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
55710 TargetLowering::DAGCombinerInfo &DCI) {
55711 SDValue LHS = N->getOperand(0);
55712 SDValue RHS = N->getOperand(1);
55713 SDValue CarryIn = N->getOperand(2);
55714 auto *LHSC = dyn_cast<ConstantSDNode>(LHS);
55715 auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
55716
55717 // Canonicalize constant to RHS.
55718 if (LHSC && !RHSC)
55719 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), RHS, LHS,
55720 CarryIn);
55721
55722 // If the LHS and RHS of the ADC node are zero, then it can't overflow and
55723 // the result is either zero or one (depending on the input carry bit).
55724 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
55725 if (LHSC && RHSC && LHSC->isZero() && RHSC->isZero() &&
55726 // We don't have a good way to replace an EFLAGS use, so only do this when
55727 // dead right now.
55728 SDValue(N, 1).use_empty()) {
55729 SDLoc DL(N);
55730 EVT VT = N->getValueType(0);
55731 SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
55732 SDValue Res1 = DAG.getNode(
55733 ISD::AND, DL, VT,
55734 DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
55735 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), CarryIn),
55736 DAG.getConstant(1, DL, VT));
55737 return DCI.CombineTo(N, Res1, CarryOut);
55738 }
55739
55740 // Fold ADC(C1,C2,Carry) -> ADC(0,C1+C2,Carry)
55741 // iff the flag result is dead.
55742 // TODO: Allow flag result if C1+C2 doesn't signed/unsigned overflow.
55743 if (LHSC && RHSC && !LHSC->isZero() && !N->hasAnyUseOfValue(1)) {
55744 SDLoc DL(N);
55745 APInt Sum = LHSC->getAPIntValue() + RHSC->getAPIntValue();
55746 return DAG.getNode(X86ISD::ADC, DL, N->getVTList(),
55747 DAG.getConstant(0, DL, LHS.getValueType()),
55748 DAG.getConstant(Sum, DL, LHS.getValueType()), CarryIn);
55749 }
55750
55751 if (SDValue Flags = combineCarryThroughADD(CarryIn, DAG)) {
55752 MVT VT = N->getSimpleValueType(0);
55753 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
55754 return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs, LHS, RHS, Flags);
55755 }
55756
55757 // Fold ADC(ADD(X,Y),0,Carry) -> ADC(X,Y,Carry)
55758 // iff the flag result is dead.
55759 if (LHS.getOpcode() == ISD::ADD && RHSC && RHSC->isZero() &&
55760 !N->hasAnyUseOfValue(1))
55761 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), LHS.getOperand(0),
55762 LHS.getOperand(1), CarryIn);
55763
55764 return SDValue();
55765}
55766
55767static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
55768 const SDLoc &DL, EVT VT,
55769 const X86Subtarget &Subtarget) {
55770 // Example of pattern we try to detect:
55771 // t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1))))
55772 //(add (build_vector (extract_elt t, 0),
55773 // (extract_elt t, 2),
55774 // (extract_elt t, 4),
55775 // (extract_elt t, 6)),
55776 // (build_vector (extract_elt t, 1),
55777 // (extract_elt t, 3),
55778 // (extract_elt t, 5),
55779 // (extract_elt t, 7)))
55780
55781 if (!Subtarget.hasSSE2())
55782 return SDValue();
55783
55784 if (Op0.getOpcode() != ISD::BUILD_VECTOR ||
55785 Op1.getOpcode() != ISD::BUILD_VECTOR)
55786 return SDValue();
55787
55788 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
55789 VT.getVectorNumElements() < 4 ||
55790 !isPowerOf2_32(VT.getVectorNumElements()))
55791 return SDValue();
55792
55793 // Check if one of Op0,Op1 is of the form:
55794 // (build_vector (extract_elt Mul, 0),
55795 // (extract_elt Mul, 2),
55796 // (extract_elt Mul, 4),
55797 // ...
55798 // the other is of the form:
55799 // (build_vector (extract_elt Mul, 1),
55800 // (extract_elt Mul, 3),
55801 // (extract_elt Mul, 5),
55802 // ...
55803 // and identify Mul.
55804 SDValue Mul;
55805 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) {
55806 SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
55807 Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
55808 // TODO: Be more tolerant to undefs.
55809 if (Op0L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
55810 Op1L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
55811 Op0H.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
55812 Op1H.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
55813 return SDValue();
55814 auto *Const0L = dyn_cast<ConstantSDNode>(Op0L->getOperand(1));
55815 auto *Const1L = dyn_cast<ConstantSDNode>(Op1L->getOperand(1));
55816 auto *Const0H = dyn_cast<ConstantSDNode>(Op0H->getOperand(1));
55817 auto *Const1H = dyn_cast<ConstantSDNode>(Op1H->getOperand(1));
55818 if (!Const0L || !Const1L || !Const0H || !Const1H)
55819 return SDValue();
55820 unsigned Idx0L = Const0L->getZExtValue(), Idx1L = Const1L->getZExtValue(),
55821 Idx0H = Const0H->getZExtValue(), Idx1H = Const1H->getZExtValue();
55822 // Commutativity of mul allows factors of a product to reorder.
55823 if (Idx0L > Idx1L)
55824 std::swap(Idx0L, Idx1L);
55825 if (Idx0H > Idx1H)
55826 std::swap(Idx0H, Idx1H);
55827 // Commutativity of add allows pairs of factors to reorder.
55828 if (Idx0L > Idx0H) {
55829 std::swap(Idx0L, Idx0H);
55830 std::swap(Idx1L, Idx1H);
55831 }
55832 if (Idx0L != 2 * i || Idx1L != 2 * i + 1 || Idx0H != 2 * i + 2 ||
55833 Idx1H != 2 * i + 3)
55834 return SDValue();
55835 if (!Mul) {
55836 // First time an extract_elt's source vector is visited. Must be a MUL
55837 // with 2X the number of vector elements of the BUILD_VECTOR.
55838 // Both extracts must be from the same MUL.
55839 Mul = Op0L->getOperand(0);
55840 if (Mul->getOpcode() != ISD::MUL ||
55841 Mul.getValueType().getVectorNumElements() != 2 * e)
55842 return SDValue();
55843 }
55844 // Check that the extract is from the same MUL previously seen.
55845 if (Mul != Op0L->getOperand(0) || Mul != Op1L->getOperand(0) ||
55846 Mul != Op0H->getOperand(0) || Mul != Op1H->getOperand(0))
55847 return SDValue();
55848 }
55849
55850 // Check if the Mul source can be safely shrunk.
55851 ShrinkMode Mode;
55852 if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) ||
55853 Mode == ShrinkMode::MULU16)
55854 return SDValue();
55855
55856 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
55857 VT.getVectorNumElements() * 2);
55858 SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(0));
55859 SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(1));
55860
55861 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
55862 ArrayRef<SDValue> Ops) {
55863 EVT InVT = Ops[0].getValueType();
55864    assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
55865 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
55866 InVT.getVectorNumElements() / 2);
55867 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
55868 };
55869 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { N0, N1 }, PMADDBuilder);
55870}
55871
55872// Attempt to turn this pattern into PMADDWD.
55873// (add (mul (sext (build_vector)), (sext (build_vector))),
55874// (mul (sext (build_vector)), (sext (build_vector)))
55875static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,
55876 const SDLoc &DL, EVT VT,
55877 const X86Subtarget &Subtarget) {
55878 if (!Subtarget.hasSSE2())
55879 return SDValue();
55880
55881 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
55882 return SDValue();
55883
55884 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
55885 VT.getVectorNumElements() < 4 ||
55886 !isPowerOf2_32(VT.getVectorNumElements()))
55887 return SDValue();
55888
55889 SDValue N00 = N0.getOperand(0);
55890 SDValue N01 = N0.getOperand(1);
55891 SDValue N10 = N1.getOperand(0);
55892 SDValue N11 = N1.getOperand(1);
55893
55894 // All inputs need to be sign extends.
55895 // TODO: Support ZERO_EXTEND from known positive?
55896 if (N00.getOpcode() != ISD::SIGN_EXTEND ||
55897 N01.getOpcode() != ISD::SIGN_EXTEND ||
55898 N10.getOpcode() != ISD::SIGN_EXTEND ||
55899 N11.getOpcode() != ISD::SIGN_EXTEND)
55900 return SDValue();
55901
55902 // Peek through the extends.
55903 N00 = N00.getOperand(0);
55904 N01 = N01.getOperand(0);
55905 N10 = N10.getOperand(0);
55906 N11 = N11.getOperand(0);
55907
55908 // Must be extending from vXi16.
55909 EVT InVT = N00.getValueType();
55910 if (InVT.getVectorElementType() != MVT::i16 || N01.getValueType() != InVT ||
55911 N10.getValueType() != InVT || N11.getValueType() != InVT)
55912 return SDValue();
55913
55914 // All inputs should be build_vectors.
55915 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
55916 N01.getOpcode() != ISD::BUILD_VECTOR ||
55917 N10.getOpcode() != ISD::BUILD_VECTOR ||
55918 N11.getOpcode() != ISD::BUILD_VECTOR)
55919 return SDValue();
55920
55921 // For each element, we need to ensure we have an odd element from one vector
55922 // multiplied by the odd element of another vector and the even element from
55923 // one of the same vectors being multiplied by the even element from the
55924 // other vector. So we need to make sure for each element i, this operator
55925 // is being performed:
55926 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
55927 SDValue In0, In1;
55928 for (unsigned i = 0; i != N00.getNumOperands(); ++i) {
55929 SDValue N00Elt = N00.getOperand(i);
55930 SDValue N01Elt = N01.getOperand(i);
55931 SDValue N10Elt = N10.getOperand(i);
55932 SDValue N11Elt = N11.getOperand(i);
55933 // TODO: Be more tolerant to undefs.
55934 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
55935 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
55936 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
55937 N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
55938 return SDValue();
55939 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
55940 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
55941 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
55942 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
55943 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
55944 return SDValue();
55945 unsigned IdxN00 = ConstN00Elt->getZExtValue();
55946 unsigned IdxN01 = ConstN01Elt->getZExtValue();
55947 unsigned IdxN10 = ConstN10Elt->getZExtValue();
55948 unsigned IdxN11 = ConstN11Elt->getZExtValue();
55949 // Add is commutative so indices can be reordered.
55950 if (IdxN00 > IdxN10) {
55951 std::swap(IdxN00, IdxN10);
55952 std::swap(IdxN01, IdxN11);
55953 }
55954 // N0 indices must be the even element. N1 indices must be the next odd element.
55955 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
55956 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
55957 return SDValue();
55958 SDValue N00In = N00Elt.getOperand(0);
55959 SDValue N01In = N01Elt.getOperand(0);
55960 SDValue N10In = N10Elt.getOperand(0);
55961 SDValue N11In = N11Elt.getOperand(0);
55962
55963 // First time we find an input capture it.
55964 if (!In0) {
55965 In0 = N00In;
55966 In1 = N01In;
55967
55968 // The input vectors must be at least as wide as the output.
55969 // If they are larger than the output, we extract a subvector below.
55970 if (In0.getValueSizeInBits() < VT.getSizeInBits() ||
55971 In1.getValueSizeInBits() < VT.getSizeInBits())
55972 return SDValue();
55973 }
55974 // Mul is commutative so the input vectors can be in any order.
55975 // Canonicalize to make the compares easier.
55976 if (In0 != N00In)
55977 std::swap(N00In, N01In);
55978 if (In0 != N10In)
55979 std::swap(N10In, N11In);
55980 if (In0 != N00In || In1 != N01In || In0 != N10In || In1 != N11In)
55981 return SDValue();
55982 }
55983
55984 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
55985 ArrayRef<SDValue> Ops) {
55986 EVT OpVT = Ops[0].getValueType();
55987    assert(OpVT.getScalarType() == MVT::i16 &&
55988           "Unexpected scalar element type");
55989    assert(OpVT == Ops[1].getValueType() && "Operands' types mismatch");
55990 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
55991 OpVT.getVectorNumElements() / 2);
55992 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
55993 };
55994
55995 // If the output is narrower than an input, extract the low part of the input
55996 // vector.
55997 EVT OutVT16 = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
55998 VT.getVectorNumElements() * 2);
55999 if (OutVT16.bitsLT(In0.getValueType())) {
56000 In0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In0,
56001 DAG.getIntPtrConstant(0, DL));
56002 }
56003 if (OutVT16.bitsLT(In1.getValueType())) {
56004 In1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In1,
56005 DAG.getIntPtrConstant(0, DL));
56006 }
56007 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },
56008 PMADDBuilder);
56009}
56010
56011// ADD(VPMADDWD(X,Y),VPMADDWD(Z,W)) -> VPMADDWD(SHUFFLE(X,Z), SHUFFLE(Y,W))
56012// If upper element in each pair of both VPMADDWD are zero then we can merge
56013// the operand elements and use the implicit add of VPMADDWD.
56014// TODO: Add support for VPMADDUBSW (which isn't commutable).
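// Rough illustration (v4i32 assumed): if the odd i16 lanes of X, Y, Z and W
// are known zero, each VPMADDWD lane reduces to X[2i]*Y[2i] (resp.
// Z[2i]*W[2i]), so interleaving X with Z and Y with W lets a single VPMADDWD
// compute X[2i]*Y[2i] + Z[2i]*W[2i] per lane.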
56015static SDValue combineAddOfPMADDWD(SelectionDAG &DAG, SDValue N0, SDValue N1,
56016 const SDLoc &DL, EVT VT) {
56017 if (N0.getOpcode() != N1.getOpcode() || N0.getOpcode() != X86ISD::VPMADDWD)
56018 return SDValue();
56019
56020 // TODO: Add 256/512-bit support once VPMADDWD combines with shuffles.
56021 if (VT.getSizeInBits() > 128)
56022 return SDValue();
56023
56024 unsigned NumElts = VT.getVectorNumElements();
56025 MVT OpVT = N0.getOperand(0).getSimpleValueType();
56026 APInt DemandedBits = APInt::getAllOnes(OpVT.getScalarSizeInBits());
56027 APInt DemandedHiElts = APInt::getSplat(2 * NumElts, APInt(2, 2));
56028
56029 bool Op0HiZero =
56030 DAG.MaskedValueIsZero(N0.getOperand(0), DemandedBits, DemandedHiElts) ||
56031 DAG.MaskedValueIsZero(N0.getOperand(1), DemandedBits, DemandedHiElts);
56032 bool Op1HiZero =
56033 DAG.MaskedValueIsZero(N1.getOperand(0), DemandedBits, DemandedHiElts) ||
56034 DAG.MaskedValueIsZero(N1.getOperand(1), DemandedBits, DemandedHiElts);
56035
56036 // TODO: Check for zero lower elements once we have actual codegen that
56037 // creates them.
56038 if (!Op0HiZero || !Op1HiZero)
56039 return SDValue();
56040
56041 // Create a shuffle mask packing the lower elements from each VPMADDWD.
56042 SmallVector<int> Mask;
56043 for (int i = 0; i != (int)NumElts; ++i) {
56044 Mask.push_back(2 * i);
56045 Mask.push_back(2 * (i + NumElts));
56046 }
56047
56048 SDValue LHS =
56049 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(0), N1.getOperand(0), Mask);
56050 SDValue RHS =
56051 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(1), N1.getOperand(1), Mask);
56052 return DAG.getNode(X86ISD::VPMADDWD, DL, VT, LHS, RHS);
56053}
56054
56055/// CMOV of constants requires materializing constant operands in registers.
56056/// Try to fold those constants into an 'add' instruction to reduce instruction
56057 /// count. We do this with CMOV rather than the generic 'select' because there are
56058/// earlier folds that may be used to turn select-of-constants into logic hacks.
56059static SDValue pushAddIntoCmovOfConsts(SDNode *N, SelectionDAG &DAG,
56060 const X86Subtarget &Subtarget) {
56061 // If an operand is zero, add-of-0 gets simplified away, so that's clearly
56062 // better because we eliminate 1-2 instructions. This transform is still
56063 // an improvement without zero operands because we trade 2 move constants and
56064 // 1 add for 2 adds (LEA) as long as the constants can be represented as
56065 // immediate asm operands (fit in 32-bits).
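  // Rough illustration (constants assumed): add (cmov C1, C2), X becomes
  // cmov (add X, C1), (add X, C2), trading two constant materializations plus
  // an add for two LEAs.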
56066 auto isSuitableCmov = [](SDValue V) {
56067 if (V.getOpcode() != X86ISD::CMOV || !V.hasOneUse())
56068 return false;
56069 if (!isa<ConstantSDNode>(V.getOperand(0)) ||
56070 !isa<ConstantSDNode>(V.getOperand(1)))
56071 return false;
56072 return isNullConstant(V.getOperand(0)) || isNullConstant(V.getOperand(1)) ||
56073 (V.getConstantOperandAPInt(0).isSignedIntN(32) &&
56074 V.getConstantOperandAPInt(1).isSignedIntN(32));
56075 };
56076
56077 // Match an appropriate CMOV as the first operand of the add.
56078 SDValue Cmov = N->getOperand(0);
56079 SDValue OtherOp = N->getOperand(1);
56080 if (!isSuitableCmov(Cmov))
56081 std::swap(Cmov, OtherOp);
56082 if (!isSuitableCmov(Cmov))
56083 return SDValue();
56084
56085 // Don't remove a load folding opportunity for the add. That would neutralize
56086 // any improvements from removing constant materializations.
56087 if (X86::mayFoldLoad(OtherOp, Subtarget))
56088 return SDValue();
56089
56090 EVT VT = N->getValueType(0);
56091 SDLoc DL(N);
56092 SDValue FalseOp = Cmov.getOperand(0);
56093 SDValue TrueOp = Cmov.getOperand(1);
56094
56095 // We will push the add through the select, but we can potentially do better
56096 // if we know there is another add in the sequence and this is pointer math.
56097 // In that case, we can absorb an add into the trailing memory op and avoid
56098 // a 3-operand LEA which is likely slower than a 2-operand LEA.
56099 // TODO: If target has "slow3OpsLEA", do this even without the trailing memop?
56100 if (OtherOp.getOpcode() == ISD::ADD && OtherOp.hasOneUse() &&
56101 !isa<ConstantSDNode>(OtherOp.getOperand(0)) &&
56102 all_of(N->uses(), [&](SDNode *Use) {
56103 auto *MemNode = dyn_cast<MemSDNode>(Use);
56104 return MemNode && MemNode->getBasePtr().getNode() == N;
56105 })) {
56106 // add (cmov C1, C2), add (X, Y) --> add (cmov (add X, C1), (add X, C2)), Y
56107 // TODO: We are arbitrarily choosing op0 as the 1st piece of the sum, but
56108 // it is possible that choosing op1 might be better.
56109 SDValue X = OtherOp.getOperand(0), Y = OtherOp.getOperand(1);
56110 FalseOp = DAG.getNode(ISD::ADD, DL, VT, X, FalseOp);
56111 TrueOp = DAG.getNode(ISD::ADD, DL, VT, X, TrueOp);
56112 Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp,
56113 Cmov.getOperand(2), Cmov.getOperand(3));
56114 return DAG.getNode(ISD::ADD, DL, VT, Cmov, Y);
56115 }
56116
56117 // add (cmov C1, C2), OtherOp --> cmov (add OtherOp, C1), (add OtherOp, C2)
56118 FalseOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, FalseOp);
56119 TrueOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, TrueOp);
56120 return DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp, Cmov.getOperand(2),
56121 Cmov.getOperand(3));
56122}
56123
56124static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
56125 TargetLowering::DAGCombinerInfo &DCI,
56126 const X86Subtarget &Subtarget) {
56127 EVT VT = N->getValueType(0);
56128 SDValue Op0 = N->getOperand(0);
56129 SDValue Op1 = N->getOperand(1);
56130 SDLoc DL(N);
56131
56132 if (SDValue Select = pushAddIntoCmovOfConsts(N, DAG, Subtarget))
56133 return Select;
56134
56135 if (SDValue MAdd = matchPMADDWD(DAG, Op0, Op1, DL, VT, Subtarget))
56136 return MAdd;
56137 if (SDValue MAdd = matchPMADDWD_2(DAG, Op0, Op1, DL, VT, Subtarget))
56138 return MAdd;
56139 if (SDValue MAdd = combineAddOfPMADDWD(DAG, Op0, Op1, DL, VT))
56140 return MAdd;
56141
56142 // Try to synthesize horizontal adds from adds of shuffles.
56143 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
56144 return V;
56145
56146 // If vectors of i1 are legal, turn (add (zext (vXi1 X)), Y) into
56147 // (sub Y, (sext (vXi1 X))).
56148 // FIXME: We have the (sub Y, (zext (vXi1 X))) -> (add (sext (vXi1 X)), Y) in
56149 // generic DAG combine without a legal type check, but adding this there
56150 // caused regressions.
56151 if (VT.isVector()) {
56152 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56153 if (Op0.getOpcode() == ISD::ZERO_EXTEND &&
56154 Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
56155 TLI.isTypeLegal(Op0.getOperand(0).getValueType())) {
56156 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op0.getOperand(0));
56157 return DAG.getNode(ISD::SUB, DL, VT, Op1, SExt);
56158 }
56159
56160 if (Op1.getOpcode() == ISD::ZERO_EXTEND &&
56161 Op1.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
56162 TLI.isTypeLegal(Op1.getOperand(0).getValueType())) {
56163 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op1.getOperand(0));
56164 return DAG.getNode(ISD::SUB, DL, VT, Op0, SExt);
56165 }
56166 }
56167
56168 // Fold ADD(ADC(Y,0,W),X) -> ADC(X,Y,W)
56169 if (Op0.getOpcode() == X86ISD::ADC && Op0->hasOneUse() &&
56170 X86::isZeroNode(Op0.getOperand(1))) {
56171    assert(!Op0->hasAnyUseOfValue(1) && "Overflow bit in use");
56172 return DAG.getNode(X86ISD::ADC, SDLoc(Op0), Op0->getVTList(), Op1,
56173 Op0.getOperand(0), Op0.getOperand(2));
56174 }
56175
56176 return combineAddOrSubToADCOrSBB(N, DAG);
56177}
56178
56179// Try to fold (sub Y, cmovns X, -X) -> (add Y, cmovns -X, X) if the cmov
56180// condition comes from the subtract node that produced -X. This matches the
56181// cmov expansion for absolute value. By swapping the operands we convert abs
56182// to nabs.
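// Rough illustration: with the flags coming from (0 - X), sub Y, (cmovns X, -X)
// computes Y - |X|; swapping the cmov operands and using add gives Y + (-|X|),
// the same value without the sub.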
56183static SDValue combineSubABS(SDNode *N, SelectionDAG &DAG) {
56184 SDValue N0 = N->getOperand(0);
56185 SDValue N1 = N->getOperand(1);
56186
56187 if (N1.getOpcode() != X86ISD::CMOV || !N1.hasOneUse())
56188 return SDValue();
56189
56190 X86::CondCode CC = (X86::CondCode)N1.getConstantOperandVal(2);
56191 if (CC != X86::COND_S && CC != X86::COND_NS)
56192 return SDValue();
56193
56194 // Condition should come from a negate operation.
56195 SDValue Cond = N1.getOperand(3);
56196 if (Cond.getOpcode() != X86ISD::SUB || !isNullConstant(Cond.getOperand(0)))
56197 return SDValue();
56198 assert(Cond.getResNo() == 1 && "Unexpected result number");
56199
56200 // Get the X and -X from the negate.
56201 SDValue NegX = Cond.getValue(0);
56202 SDValue X = Cond.getOperand(1);
56203
56204 SDValue FalseOp = N1.getOperand(0);
56205 SDValue TrueOp = N1.getOperand(1);
56206
56207 // Cmov operands should be X and NegX. Order doesn't matter.
56208 if (!(TrueOp == X && FalseOp == NegX) && !(TrueOp == NegX && FalseOp == X))
56209 return SDValue();
56210
56211 // Build a new CMOV with the operands swapped.
56212 SDLoc DL(N);
56213 MVT VT = N->getSimpleValueType(0);
56214 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, TrueOp, FalseOp,
56215 N1.getOperand(2), Cond);
56216 // Convert sub to add.
56217 return DAG.getNode(ISD::ADD, DL, VT, N0, Cmov);
56218}
56219
56220static SDValue combineSubSetcc(SDNode *N, SelectionDAG &DAG) {
56221 SDValue Op0 = N->getOperand(0);
56222 SDValue Op1 = N->getOperand(1);
56223
56224 // (sub C (zero_extend (setcc)))
56225 // =>
56226 // (add (zero_extend (setcc inverted)) C-1) if C is a nonzero immediate
56227 // Don't disturb (sub 0 setcc), which is easily done with neg.
56228 EVT VT = N->getValueType(0);
56229 auto *Op0C = dyn_cast<ConstantSDNode>(Op0);
56230 if (Op1.getOpcode() == ISD::ZERO_EXTEND && Op1.hasOneUse() && Op0C &&
56231 !Op0C->isZero() && Op1.getOperand(0).getOpcode() == X86ISD::SETCC &&
56232 Op1.getOperand(0).hasOneUse()) {
56233 SDValue SetCC = Op1.getOperand(0);
56234 X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
56235 X86::CondCode NewCC = X86::GetOppositeBranchCondition(CC);
56236 uint64_t NewImm = Op0C->getZExtValue() - 1;
56237 SDLoc DL(Op1);
56238 SDValue NewSetCC = getSETCC(NewCC, SetCC.getOperand(1), DL, DAG);
56239 NewSetCC = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, NewSetCC);
56240 return DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(VT, VT), NewSetCC,
56241 DAG.getConstant(NewImm, DL, VT));
56242 }
56243
56244 return SDValue();
56245}
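combineSubSetcc relies on the fact that for a 0/1 setcc value b and a nonzero constant C, C - b equals (C - 1) + (1 - b), where (1 - b) plays the role of the inverted setcc. A standalone sketch of the arithmetic (plain C++, not from the LLVM sources):

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t C = 42;                  // any nonzero immediate
  for (uint64_t B = 0; B <= 1; ++B)       // B models the 0/1 setcc result
    assert(C - B == (C - 1) + (1 - B));   // (1 - B) is the inverted setcc
  return 0;
}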
56246
56247static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
56248 TargetLowering::DAGCombinerInfo &DCI,
56249 const X86Subtarget &Subtarget) {
56250 SDValue Op0 = N->getOperand(0);
56251 SDValue Op1 = N->getOperand(1);
56252
56253 // TODO: Add NoOpaque handling to isConstantIntBuildVectorOrConstantInt.
56254 auto IsNonOpaqueConstant = [&](SDValue Op) {
56255 if (SDNode *C = DAG.isConstantIntBuildVectorOrConstantInt(Op)) {
56256 if (auto *Cst = dyn_cast<ConstantSDNode>(C))
56257 return !Cst->isOpaque();
56258 return true;
56259 }
56260 return false;
56261 };
56262
56263 // X86 can't encode an immediate LHS of a sub. See if we can push the
56264 // negation into a preceding instruction. If the RHS of the sub is a XOR with
56265 // one use and a constant, invert the immediate, saving one register.
56266 // sub(C1, xor(X, C2)) -> add(xor(X, ~C2), C1+1)
56267 if (Op1.getOpcode() == ISD::XOR && IsNonOpaqueConstant(Op0) &&
56268 IsNonOpaqueConstant(Op1.getOperand(1)) && Op1->hasOneUse()) {
56269 SDLoc DL(N);
56270 EVT VT = Op0.getValueType();
56271 SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT, Op1.getOperand(0),
56272 DAG.getNOT(SDLoc(Op1), Op1.getOperand(1), VT));
56273 SDValue NewAdd =
56274 DAG.getNode(ISD::ADD, DL, VT, Op0, DAG.getConstant(1, DL, VT));
56275 return DAG.getNode(ISD::ADD, DL, VT, NewXor, NewAdd);
56276 }
56277
56278 if (SDValue V = combineSubABS(N, DAG))
56279 return V;
56280
56281 // Try to synthesize horizontal subs from subs of shuffles.
56282 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
56283 return V;
56284
56285 // Fold SUB(X,ADC(Y,0,W)) -> SBB(X,Y,W)
56286 if (Op1.getOpcode() == X86ISD::ADC && Op1->hasOneUse() &&
56287 X86::isZeroNode(Op1.getOperand(1))) {
56288 assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
56289 return DAG.getNode(X86ISD::SBB, SDLoc(Op1), Op1->getVTList(), Op0,
56290 Op1.getOperand(0), Op1.getOperand(2));
56291 }
56292
56293 // Fold SUB(X,SBB(Y,Z,W)) -> SUB(ADC(X,Z,W),Y)
56294 // Don't fold to the ADC(0,0,W)/SETCC_CARRY pattern, which would prevent more folds.
56295 if (Op1.getOpcode() == X86ISD::SBB && Op1->hasOneUse() &&
56296 !(X86::isZeroNode(Op0) && X86::isZeroNode(Op1.getOperand(1)))) {
56297 assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
56298 SDValue ADC = DAG.getNode(X86ISD::ADC, SDLoc(Op1), Op1->getVTList(), Op0,
56299 Op1.getOperand(1), Op1.getOperand(2));
56300 return DAG.getNode(ISD::SUB, SDLoc(N), Op0.getValueType(), ADC.getValue(0),
56301 Op1.getOperand(0));
56302 }
56303
56304 if (SDValue V = combineXorSubCTLZ(N, DAG, Subtarget))
56305 return V;
56306
56307 if (SDValue V = combineAddOrSubToADCOrSBB(N, DAG))
56308 return V;
56309
56310 return combineSubSetcc(N, DAG);
56311}
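The sub(C1, xor(X, C2)) -> add(xor(X, ~C2), C1+1) fold in combineSub is just two's-complement negation pushed through the xor. A standalone sketch verifying the identity with wrapping unsigned arithmetic (plain C++, not from the LLVM sources):

#include <cassert>
#include <cstdint>
#include <initializer_list>

int main() {
  // In two's complement, -(X ^ C2) == ~(X ^ C2) + 1 == (X ^ ~C2) + 1, so
  // C1 - (X ^ C2) == (X ^ ~C2) + (C1 + 1). Unsigned wrap-around models the
  // fixed-width arithmetic of the DAG nodes.
  const uint64_t C1 = 0xDEADBEEF, C2 = 0x12345678;
  for (uint64_t X : {0ULL, 1ULL, 0x8000000000000000ULL, 0xFFFFFFFFFFFFFFFFULL})
    assert(C1 - (X ^ C2) == (X ^ ~C2) + (C1 + 1));
  return 0;
}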
56312
56313static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
56314 const X86Subtarget &Subtarget) {
56315 MVT VT = N->getSimpleValueType(0);
56316 SDLoc DL(N);
56317
56318 if (N->getOperand(0) == N->getOperand(1)) {
56319 if (N->getOpcode() == X86ISD::PCMPEQ)
56320 return DAG.getConstant(-1, DL, VT);
56321 if (N->getOpcode() == X86ISD::PCMPGT)
56322 return DAG.getConstant(0, DL, VT);
56323 }
56324
56325 return SDValue();
56326}
56327
56328/// Helper that combines an array of subvector ops as if they were the operands
56329 /// of an ISD::CONCAT_VECTORS node, but may have come from another source (e.g.
56330/// ISD::INSERT_SUBVECTOR). The ops are assumed to be of the same type.
56331static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
56332 ArrayRef<SDValue> Ops, SelectionDAG &DAG,
56333 TargetLowering::DAGCombinerInfo &DCI,
56334 const X86Subtarget &Subtarget) {
56335 assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors");
56336 unsigned EltSizeInBits = VT.getScalarSizeInBits();
56337
56338 if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
56339 return DAG.getUNDEF(VT);
56340
56341 if (llvm::all_of(Ops, [](SDValue Op) {
56342 return ISD::isBuildVectorAllZeros(Op.getNode());
56343 }))
56344 return getZeroVector(VT, Subtarget, DAG, DL);
56345
56346 SDValue Op0 = Ops[0];
56347 bool IsSplat = llvm::all_equal(Ops);
56348
56349 // Repeated subvectors.
56350 if (IsSplat &&
56351 (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
56352 // If this broadcast is inserted into both halves, use a larger broadcast.
56353 if (Op0.getOpcode() == X86ISD::VBROADCAST)
56354 return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0));
56355
56356 // If this simple subvector load or scalar/subvector broadcast_load is inserted
56357 // into both halves, use a larger broadcast_load. Update other uses to use
56358 // an extracted subvector.
56359 if (ISD::isNormalLoad(Op0.getNode()) ||
56360 Op0.getOpcode() == X86ISD::VBROADCAST_LOAD ||
56361 Op0.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
56362 auto *Mem = cast<MemSDNode>(Op0);
56363 unsigned Opc = Op0.getOpcode() == X86ISD::VBROADCAST_LOAD
56364 ? X86ISD::VBROADCAST_LOAD
56365 : X86ISD::SUBV_BROADCAST_LOAD;
56366 if (SDValue BcastLd =
56367 getBROADCAST_LOAD(Opc, DL, VT, Mem->getMemoryVT(), Mem, 0, DAG)) {
56368 SDValue BcastSrc =
56369 extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits());
56370 DAG.ReplaceAllUsesOfValueWith(Op0, BcastSrc);
56371 return BcastLd;
56372 }
56373 }
56374
56375 // concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
56376 if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
56377 (Subtarget.hasAVX2() ||
56378 X86::mayFoldLoadIntoBroadcastFromMem(Op0.getOperand(0),
56379 VT.getScalarType(), Subtarget)))
56380 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
56381 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64,
56382 Op0.getOperand(0),
56383 DAG.getIntPtrConstant(0, DL)));
56384
56385 // concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
56386 if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
56387 (Subtarget.hasAVX2() ||
56388 (EltSizeInBits >= 32 &&
56389 X86::mayFoldLoad(Op0.getOperand(0), Subtarget))) &&
56390 Op0.getOperand(0).getValueType() == VT.getScalarType())
56391 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));
56392
56393 // concat_vectors(extract_subvector(broadcast(x)),
56394 // extract_subvector(broadcast(x))) -> broadcast(x)
56395 if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
56396 Op0.getOperand(0).getValueType() == VT) {
56397 if (Op0.getOperand(0).getOpcode() == X86ISD::VBROADCAST ||
56398 Op0.getOperand(0).getOpcode() == X86ISD::VBROADCAST_LOAD)
56399 return Op0.getOperand(0);
56400 }
56401 }
56402
56403 // concat(extract_subvector(v0,c0), extract_subvector(v1,c1)) -> vperm2x128.
56404 // Only handle concatenation of subvector high halves, which vperm2x128 is best at.
56405 // TODO: This should go in combineX86ShufflesRecursively eventually.
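// Note: assuming the usual VPERM2F128/VPERM2I128 immediate encoding, imm[1:0]
// picks the destination low 128-bit lane and imm[5:4] the high lane from
// {src0.lo, src0.hi, src1.lo, src1.hi} (the zeroing bits are unused here), so
// the 0x31 below selects the high half of each source, i.e. exactly the
// concat-of-high-halves case named above.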
56406 if (VT.is256BitVector() && Ops.size() == 2) {
56407 SDValue Src0 = peekThroughBitcasts(Ops[0]);
56408 SDValue Src1 = peekThroughBitcasts(Ops[1]);
56409 if (Src0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
56410 Src1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
56411 EVT SrcVT0 = Src0.getOperand(0).getValueType();
56412 EVT SrcVT1 = Src1.getOperand(0).getValueType();
56413 unsigned NumSrcElts0 = SrcVT0.getVectorNumElements();
56414 unsigned NumSrcElts1 = SrcVT1.getVectorNumElements();
56415 if (SrcVT0.is256BitVector() && SrcVT1.is256BitVector() &&
56416 Src0.getConstantOperandAPInt(1) == (NumSrcElts0 / 2) &&
56417 Src1.getConstantOperandAPInt(1) == (NumSrcElts1 / 2)) {
56418 return DAG.getNode(X86ISD::VPERM2X128, DL, VT,
56419 DAG.getBitcast(VT, Src0.getOperand(0)),
56420 DAG.getBitcast(VT, Src1.getOperand(0)),
56421 DAG.getTargetConstant(0x31, DL, MVT::i8));
56422 }
56423 }
56424 }
56425
56426 // Repeated opcode.
56427 // TODO - combineX86ShufflesRecursively should handle shuffle concatenation
56428 // but it currently struggles with different vector widths.
56429 if (llvm::all_of(Ops, [Op0](SDValue Op) {
56430 return Op.getOpcode() == Op0.getOpcode();
56431 })) {
56432 auto ConcatSubOperand = [&](EVT VT, ArrayRef<SDValue> SubOps, unsigned I) {
56433 SmallVector<SDValue> Subs;
56434 for (SDValue SubOp : SubOps)
56435 Subs.push_back(SubOp.getOperand(I));
56436 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
56437 };
56438 auto IsConcatFree = [](MVT VT, ArrayRef<SDValue> SubOps, unsigned Op) {
56439 for (unsigned I = 0, E = SubOps.size(); I != E; ++I) {
56440 SDValue Sub = SubOps[I].getOperand(Op);
56441 unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
56442 if (Sub.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
56443 Sub.getOperand(0).getValueType() != VT ||
56444 Sub.getConstantOperandAPInt(1) != (I * NumSubElts))
56445 return false;
56446 }
56447 return true;
56448 };
56449
56450 unsigned NumOps = Ops.size();
56451 switch (Op0.getOpcode()) {
56452 case X86ISD::VBROADCAST: {
56453 if (!IsSplat && llvm::all_of(Ops, [](SDValue Op) {
56454 return Op.getOperand(0).getValueType().is128BitVector();
56455 })) {
56456 if (VT == MVT::v4f64 || VT == MVT::v4i64)
56457 return DAG.getNode(X86ISD::UNPCKL, DL, VT,
56458 ConcatSubOperand(VT, Ops, 0),
56459 ConcatSubOperand(VT, Ops, 0));
56460 // TODO: Add pseudo v8i32 PSHUFD handling to AVX1Only targets.
56461 if (VT == MVT::v8f32 || (VT == MVT::v8i32 && Subtarget.hasInt256()))
56462 return DAG.getNode(VT == MVT::v8f32 ? X86ISD::VPERMILPI
56463 : X86ISD::PSHUFD,
56464 DL, VT, ConcatSubOperand(VT, Ops, 0),
56465 getV4X86ShuffleImm8ForMask({0, 0, 0, 0}, DL, DAG));
56466 }
56467 break;
56468 }
56469 case X86ISD::MOVDDUP:
56470 case X86ISD::MOVSHDUP:
56471 case X86ISD::MOVSLDUP: {
56472 if (!IsSplat)
56473 return DAG.getNode(Op0.getOpcode(), DL, VT,
56474 ConcatSubOperand(VT, Ops, 0));
56475 break;
56476 }
56477 case X86ISD::SHUFP: {
56478 // Add SHUFPD support if/when necessary.
56479 if (!IsSplat && VT.getScalarType() == MVT::f32 &&
56480 llvm::all_of(Ops, [Op0](SDValue Op) {
56481 return Op.getOperand(2) == Op0.getOperand(2);
56482 })) {
56483 return DAG.getNode(Op0.getOpcode(), DL, VT,
56484 ConcatSubOperand(VT, Ops, 0),
56485 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
56486 }
56487 break;
56488 }
56489 case X86ISD::PSHUFHW:
56490 case X86ISD::PSHUFLW:
56491 case X86ISD::PSHUFD:
56492 if (!IsSplat && NumOps == 2 && VT.is256BitVector() &&
56493 Subtarget.hasInt256() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
56494 return DAG.getNode(Op0.getOpcode(), DL, VT,
56495 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
56496 }
56497 [[fallthrough]];
56498 case X86ISD::VPERMILPI:
56499 if (!IsSplat && VT.getScalarSizeInBits() == 32 &&
56500 (VT.is256BitVector() ||
56501 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
56502 all_of(Ops, [&Op0](SDValue Op) {
56503 return Op0.getOperand(1) == Op.getOperand(1);
56504 })) {
56505 MVT FloatVT = VT.changeVectorElementType(MVT::f32);
56506 SDValue Res = DAG.getBitcast(FloatVT, ConcatSubOperand(VT, Ops, 0));
56507 Res =
56508 DAG.getNode(X86ISD::VPERMILPI, DL, FloatVT, Res, Op0.getOperand(1));
56509 return DAG.getBitcast(VT, Res);
56510 }
56511 if (!IsSplat && NumOps == 2 && VT == MVT::v4f64) {
56512 uint64_t Idx0 = Ops[0].getConstantOperandVal(1);
56513 uint64_t Idx1 = Ops[1].getConstantOperandVal(1);
56514 uint64_t Idx = ((Idx1 & 3) << 2) | (Idx0 & 3);
56515 return DAG.getNode(Op0.getOpcode(), DL, VT,
56516 ConcatSubOperand(VT, Ops, 0),
56517 DAG.getTargetConstant(Idx, DL, MVT::i8));
56518 }
56519 break;
56520 case X86ISD::PSHUFB:
56521 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
56522 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
56523 return DAG.getNode(Op0.getOpcode(), DL, VT,
56524 ConcatSubOperand(VT, Ops, 0),
56525 ConcatSubOperand(VT, Ops, 1));
56526 }
56527 break;
56528 case X86ISD::VPERMV:
56529 if (!IsSplat && NumOps == 2 &&
56530 (VT.is512BitVector() && Subtarget.useAVX512Regs())) {
56531 MVT OpVT = Op0.getSimpleValueType();
56532 int NumSrcElts = OpVT.getVectorNumElements();
56533 SmallVector<int, 64> ConcatMask;
56534 for (unsigned i = 0; i != NumOps; ++i) {
56535 SmallVector<int, 64> SubMask;
56536 SmallVector<SDValue, 2> SubOps;
56537 if (!getTargetShuffleMask(Ops[i].getNode(), OpVT, false, SubOps,
56538 SubMask))
56539 break;
56540 for (int M : SubMask) {
56541 if (0 <= M)
56542 M += i * NumSrcElts;
56543 ConcatMask.push_back(M);
56544 }
56545 }
56546 if (ConcatMask.size() == (NumOps * NumSrcElts)) {
56547 SDValue Src = concatSubVectors(Ops[0].getOperand(1),
56548 Ops[1].getOperand(1), DAG, DL);
56549 MVT IntMaskSVT = MVT::getIntegerVT(EltSizeInBits);
56550 MVT IntMaskVT = MVT::getVectorVT(IntMaskSVT, NumOps * NumSrcElts);
56551 SDValue Mask = getConstVector(ConcatMask, IntMaskVT, DAG, DL, true);
56552 return DAG.getNode(X86ISD::VPERMV, DL, VT, Mask, Src);
56553 }
56554 }
56555 break;
56556 case X86ISD::VPERMV3:
56557 if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
56558 MVT OpVT = Op0.getSimpleValueType();
56559 int NumSrcElts = OpVT.getVectorNumElements();
56560 SmallVector<int, 64> ConcatMask;
56561 for (unsigned i = 0; i != NumOps; ++i) {
56562 SmallVector<int, 64> SubMask;
56563 SmallVector<SDValue, 2> SubOps;
56564 if (!getTargetShuffleMask(Ops[i].getNode(), OpVT, false, SubOps,
56565 SubMask))
56566 break;
56567 for (int M : SubMask) {
56568 if (0 <= M) {
56569 M += M < NumSrcElts ? 0 : NumSrcElts;
56570 M += i * NumSrcElts;
56571 }
56572 ConcatMask.push_back(M);
56573 }
56574 }
56575 if (ConcatMask.size() == (NumOps * NumSrcElts)) {
56576 SDValue Src0 = concatSubVectors(Ops[0].getOperand(0),
56577 Ops[1].getOperand(0), DAG, DL);
56578 SDValue Src1 = concatSubVectors(Ops[0].getOperand(2),
56579 Ops[1].getOperand(2), DAG, DL);
56580 MVT IntMaskSVT = MVT::getIntegerVT(EltSizeInBits);
56581 MVT IntMaskVT = MVT::getVectorVT(IntMaskSVT, NumOps * NumSrcElts);
56582 SDValue Mask = getConstVector(ConcatMask, IntMaskVT, DAG, DL, true);
56583 return DAG.getNode(X86ISD::VPERMV3, DL, VT, Src0, Mask, Src1);
56584 }
56585 }
56586 break;
56587 case ISD::TRUNCATE:
56588 if (!IsSplat && NumOps == 2 && VT.is256BitVector()) {
56589 EVT SrcVT = Ops[0].getOperand(0).getValueType();
56590 if (SrcVT.is256BitVector() && SrcVT.isSimple() &&
56591 SrcVT == Ops[1].getOperand(0).getValueType() &&
56592 Subtarget.useAVX512Regs() &&
56593 Subtarget.getPreferVectorWidth() >= 512 &&
56594 (SrcVT.getScalarSizeInBits() > 16 || Subtarget.useBWIRegs())) {
56595 EVT NewSrcVT = SrcVT.getDoubleNumVectorElementsVT(*DAG.getContext());
56596 return DAG.getNode(ISD::TRUNCATE, DL, VT,
56597 ConcatSubOperand(NewSrcVT, Ops, 0));
56598 }
56599 }
56600 break;
56601 case X86ISD::VSHLI:
56602 case X86ISD::VSRLI:
56603 // Special case: SHL/SRL AVX1 V4i64 by 32 bits can lower as a shuffle.
56604 // TODO: Move this to LowerShiftByScalarImmediate?
56605 if (VT == MVT::v4i64 && !Subtarget.hasInt256() &&
56606 llvm::all_of(Ops, [](SDValue Op) {
56607 return Op.getConstantOperandAPInt(1) == 32;
56608 })) {
56609 SDValue Res = DAG.getBitcast(MVT::v8i32, ConcatSubOperand(VT, Ops, 0));
56610 SDValue Zero = getZeroVector(MVT::v8i32, Subtarget, DAG, DL);
56611 if (Op0.getOpcode() == X86ISD::VSHLI) {
56612 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
56613 {8, 0, 8, 2, 8, 4, 8, 6});
56614 } else {
56615 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
56616 {1, 8, 3, 8, 5, 8, 7, 8});
56617 }
56618 return DAG.getBitcast(VT, Res);
56619 }
56620 [[fallthrough]];
56621 case X86ISD::VSRAI:
56622 case X86ISD::VSHL:
56623 case X86ISD::VSRL:
56624 case X86ISD::VSRA:
56625 if (((VT.is256BitVector() && Subtarget.hasInt256()) ||
56626 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
56627 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
56628 llvm::all_of(Ops, [Op0](SDValue Op) {
56629 return Op0.getOperand(1) == Op.getOperand(1);
56630 })) {
56631 return DAG.getNode(Op0.getOpcode(), DL, VT,
56632 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
56633 }
56634 break;
56635 case X86ISD::VPERMI:
56636 case X86ISD::VROTLI:
56637 case X86ISD::VROTRI:
56638 if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
56639 llvm::all_of(Ops, [Op0](SDValue Op) {
56640 return Op0.getOperand(1) == Op.getOperand(1);
56641 })) {
56642 return DAG.getNode(Op0.getOpcode(), DL, VT,
56643 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
56644 }
56645 break;
56646 case ISD::AND:
56647 case ISD::OR:
56648 case ISD::XOR:
56649 case X86ISD::ANDNP:
56650 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
56651 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
56652 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
56653 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
56654 NumOps * SrcVT.getVectorNumElements());
56655 return DAG.getNode(Op0.getOpcode(), DL, VT,
56656 ConcatSubOperand(SrcVT, Ops, 0),
56657 ConcatSubOperand(SrcVT, Ops, 1));
56658 }
56659 break;
56660 case X86ISD::GF2P8AFFINEQB:
56661 if (!IsSplat &&
56662 (VT.is256BitVector() ||
56663 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
56664 llvm::all_of(Ops, [Op0](SDValue Op) {
56665 return Op0.getOperand(2) == Op.getOperand(2);
56666 })) {
56667 return DAG.getNode(Op0.getOpcode(), DL, VT,
56668 ConcatSubOperand(VT, Ops, 0),
56669 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
56670 }
56671 break;
56672 case ISD::ADD:
56673 case ISD::SUB:
56674 case ISD::MUL:
56675 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
56676 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
56677 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())))) {
56678 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
56679 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
56680 NumOps * SrcVT.getVectorNumElements());
56681 return DAG.getNode(Op0.getOpcode(), DL, VT,
56682 ConcatSubOperand(SrcVT, Ops, 0),
56683 ConcatSubOperand(SrcVT, Ops, 1));
56684 }
56685 break;
56686 // Because VADD, VSUB and VMUL can execute on more ports than VINSERT and
56687 // their latency is short, we don't replace them here.
56688 case ISD::FDIV:
56689 if (!IsSplat && (VT.is256BitVector() ||
56690 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
56691 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
56692 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
56693 NumOps * SrcVT.getVectorNumElements());
56694 return DAG.getNode(Op0.getOpcode(), DL, VT,
56695 ConcatSubOperand(SrcVT, Ops, 0),
56696 ConcatSubOperand(SrcVT, Ops, 1));
56697 }
56698 break;
56699 case X86ISD::HADD:
56700 case X86ISD::HSUB:
56701 case X86ISD::FHADD:
56702 case X86ISD::FHSUB:
56703 case X86ISD::PACKSS:
56704 case X86ISD::PACKUS:
56705 if (!IsSplat && VT.is256BitVector() &&
56706 (VT.isFloatingPoint() || Subtarget.hasInt256())) {
56707 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
56708 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
56709 NumOps * SrcVT.getVectorNumElements());
56710 return DAG.getNode(Op0.getOpcode(), DL, VT,
56711 ConcatSubOperand(SrcVT, Ops, 0),
56712 ConcatSubOperand(SrcVT, Ops, 1));
56713 }
56714 break;
56715 case X86ISD::PALIGNR:
56716 if (!IsSplat &&
56717 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
56718 (VT.is512BitVector() && Subtarget.useBWIRegs())) &&
56719 llvm::all_of(Ops, [Op0](SDValue Op) {
56720 return Op0.getOperand(2) == Op.getOperand(2);
56721 })) {
56722 return DAG.getNode(Op0.getOpcode(), DL, VT,
56723 ConcatSubOperand(VT, Ops, 0),
56724 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
56725 }
56726 break;
56727 case ISD::VSELECT:
56728 if (!IsSplat && Subtarget.hasAVX512() &&
56729 (VT.is256BitVector() ||
56730 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
56731 (EltSizeInBits >= 32 || Subtarget.hasBWI())) {
56732 EVT SelVT = Ops[0].getOperand(0).getValueType();
56733 if (SelVT.getVectorElementType() == MVT::i1) {
56734 SelVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
56735 Ops.size() * SelVT.getVectorNumElements());
56736 if (DAG.getTargetLoweringInfo().isTypeLegal(SelVT))
56737 return DAG.getNode(Op0.getOpcode(), DL, VT,
56738 ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0),
56739 ConcatSubOperand(VT, Ops, 1),
56740 ConcatSubOperand(VT, Ops, 2));
56741 }
56742 }
56743 [[fallthrough]];
56744 case X86ISD::BLENDV:
56745 if (!IsSplat && VT.is256BitVector() && Ops.size() == 2 &&
56746 (EltSizeInBits >= 32 || Subtarget.hasInt256()) &&
56747 IsConcatFree(VT, Ops, 1) && IsConcatFree(VT, Ops, 2)) {
56748 EVT SelVT = Ops[0].getOperand(0).getValueType();
56749 SelVT = SelVT.getDoubleNumVectorElementsVT(*DAG.getContext());
56750 if (DAG.getTargetLoweringInfo().isTypeLegal(SelVT))
56751 return DAG.getNode(Op0.getOpcode(), DL, VT,
56752 ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0),
56753 ConcatSubOperand(VT, Ops, 1),
56754 ConcatSubOperand(VT, Ops, 2));
56755 }
56756 break;
56757 }
56758 }
56759
56760 // Fold subvector loads into one.
56761 // If needed, look through bitcasts to get to the load.
56762 if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(Op0))) {
56763 unsigned Fast;
56764 const X86TargetLowering *TLI = Subtarget.getTargetLowering();
56765 if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
56766 *FirstLd->getMemOperand(), &Fast) &&
56767 Fast) {
56768 if (SDValue Ld =
56769 EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false))
56770 return Ld;
56771 }
56772 }
56773
56774 // Attempt to fold target constant loads.
56775 if (all_of(Ops, [](SDValue Op) { return getTargetConstantFromNode(Op); })) {
56776 SmallVector<APInt> EltBits;
56777 APInt UndefElts = APInt::getZero(VT.getVectorNumElements());
56778 for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
56779 APInt OpUndefElts;
56780 SmallVector<APInt> OpEltBits;
56781 if (!getTargetConstantBitsFromNode(Ops[I], EltSizeInBits, OpUndefElts,
56782 OpEltBits, true, false))
56783 break;
56784 EltBits.append(OpEltBits);
56785 UndefElts.insertBits(OpUndefElts, I * OpUndefElts.getBitWidth());
56786 }
56787 if (EltBits.size() == VT.getVectorNumElements())
56788 return getConstVector(EltBits, UndefElts, VT, DAG, DL);
56789 }
56790
56791 return SDValue();
56792}
56793
56794static SDValue combineCONCAT_VECTORS(SDNode *N, SelectionDAG &DAG,
56795 TargetLowering::DAGCombinerInfo &DCI,
56796 const X86Subtarget &Subtarget) {
56797 EVT VT = N->getValueType(0);
56798 EVT SrcVT = N->getOperand(0).getValueType();
56799 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56800 SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());
56801
56802 if (VT.getVectorElementType() == MVT::i1) {
56803 // Attempt to constant fold.
56804 unsigned SubSizeInBits = SrcVT.getSizeInBits();
56805 APInt Constant = APInt::getZero(VT.getSizeInBits());
56806 for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
56807 auto *C = dyn_cast<ConstantSDNode>(peekThroughBitcasts(Ops[I]));
56808 if (!C) break;
56809 Constant.insertBits(C->getAPIntValue(), I * SubSizeInBits);
56810 if (I == (E - 1)) {
56811 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
56812 if (TLI.isTypeLegal(IntVT))
56813 return DAG.getBitcast(VT, DAG.getConstant(Constant, SDLoc(N), IntVT));
56814 }
56815 }
56816
56817 // Don't do anything else for i1 vectors.
56818 return SDValue();
56819 }
56820
56821 if (Subtarget.hasAVX() && TLI.isTypeLegal(VT) && TLI.isTypeLegal(SrcVT)) {
56822 if (SDValue R = combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), Ops, DAG,
56823 DCI, Subtarget))
56824 return R;
56825 }
56826
56827 return SDValue();
56828}
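The vXi1 constant fold above packs each operand's mask bits into one wide integer at offset I * SubSizeInBits. A standalone sketch of that packing with plain integers (the helper name concatMasks is made up for this sketch, not an LLVM API):

#include <cassert>
#include <cstdint>
#include <vector>

// Each operand contributes SubBits mask bits inserted at I * SubBits,
// mirroring Constant.insertBits(C->getAPIntValue(), I * SubSizeInBits).
// Assumes SubBits < 64 so the shift below stays defined.
uint64_t concatMasks(const std::vector<uint64_t> &Subs, unsigned SubBits) {
  uint64_t Out = 0;
  for (size_t I = 0; I < Subs.size(); ++I)
    Out |= (Subs[I] & ((1ULL << SubBits) - 1)) << (I * SubBits);
  return Out;
}

int main() {
  // Two 4-bit masks 0b1010 and 0b0110 concatenate to 0b01101010.
  assert(concatMasks({0b1010u, 0b0110u}, 4) == 0b01101010);
  return 0;
}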
56829
56830static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
56831 TargetLowering::DAGCombinerInfo &DCI,
56832 const X86Subtarget &Subtarget) {
56833 if (DCI.isBeforeLegalizeOps())
56834 return SDValue();
56835
56836 MVT OpVT = N->getSimpleValueType(0);
56837
56838 bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1;
56839
56840 SDLoc dl(N);
56841 SDValue Vec = N->getOperand(0);
56842 SDValue SubVec = N->getOperand(1);
56843
56844 uint64_t IdxVal = N->getConstantOperandVal(2);
56845 MVT SubVecVT = SubVec.getSimpleValueType();
56846
56847 if (Vec.isUndef() && SubVec.isUndef())
56848 return DAG.getUNDEF(OpVT);
56849
56850 // Inserting undefs/zeros into zeros/undefs is a zero vector.
56851 if ((Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())) &&
56852 (SubVec.isUndef() || ISD::isBuildVectorAllZeros(SubVec.getNode())))
56853 return getZeroVector(OpVT, Subtarget, DAG, dl);
56854
56855 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
56856 // If we're inserting into a zero vector and then into a larger zero vector,
56857 // just insert into the larger zero vector directly.
56858 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
56859 ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
56860 uint64_t Idx2Val = SubVec.getConstantOperandVal(2);
56861 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
56862 getZeroVector(OpVT, Subtarget, DAG, dl),
56863 SubVec.getOperand(1),
56864 DAG.getIntPtrConstant(IdxVal + Idx2Val, dl));
56865 }
56866
56867 // If we're inserting into a zero vector and our input was extracted from
56868 // an insert into a zero vector of the same type, and the extraction was at
56869 // least as large as the original insertion, just insert the original
56870 // subvector into a zero vector.
56871 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
56872 isNullConstant(SubVec.getOperand(1)) &&
56873 SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) {
56874 SDValue Ins = SubVec.getOperand(0);
56875 if (isNullConstant(Ins.getOperand(2)) &&
56876 ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
56877 Ins.getOperand(1).getValueSizeInBits().getFixedValue() <=
56878 SubVecVT.getFixedSizeInBits())
56879 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
56880 getZeroVector(OpVT, Subtarget, DAG, dl),
56881 Ins.getOperand(1), N->getOperand(2));
56882 }
56883 }
56884
56885 // Stop here if this is an i1 vector.
56886 if (IsI1Vector)
56887 return SDValue();
56888
56889 // Eliminate an intermediate vector widening:
56890 // insert_subvector X, (insert_subvector undef, Y, 0), Idx -->
56891 // insert_subvector X, Y, Idx
56892 // TODO: This is a more general version of a DAGCombiner fold, can we move it
56893 // there?
56894 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
56895 SubVec.getOperand(0).isUndef() && isNullConstant(SubVec.getOperand(2)))
56896 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec,
56897 SubVec.getOperand(1), N->getOperand(2));
56898
56899 // If this is an insert of an extract, combine to a shuffle. Don't do this
56900 // if the insert or extract can be represented with a subregister operation.
56901 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
56902 SubVec.getOperand(0).getSimpleValueType() == OpVT &&
56903 (IdxVal != 0 ||
56904 !(Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())))) {
56905 int ExtIdxVal = SubVec.getConstantOperandVal(1);
56906 if (ExtIdxVal != 0) {
56907 int VecNumElts = OpVT.getVectorNumElements();
56908 int SubVecNumElts = SubVecVT.getVectorNumElements();
56909 SmallVector<int, 64> Mask(VecNumElts);
56910 // First create an identity shuffle mask.
56911 for (int i = 0; i != VecNumElts; ++i)
56912 Mask[i] = i;
56913 // Now insert the extracted portion.
56914 for (int i = 0; i != SubVecNumElts; ++i)
56915 Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
56916
56917 return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
56918 }
56919 }
56920
56921 // Match concat_vector style patterns.
56922 SmallVector<SDValue, 2> SubVectorOps;
56923 if (collectConcatOps(N, SubVectorOps, DAG)) {
56924 if (SDValue Fold =
56925 combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, DCI, Subtarget))
56926 return Fold;
56927
56928 // If we're inserting all zeros into the upper half, change this to
56929 // a concat with zero. We will match this to a move
56930 // with implicit upper bit zeroing during isel.
56931 // We do this here because we don't want combineConcatVectorOps to
56932 // create INSERT_SUBVECTOR from CONCAT_VECTORS.
56933 if (SubVectorOps.size() == 2 &&
56934 ISD::isBuildVectorAllZeros(SubVectorOps[1].getNode()))
56935 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
56936 getZeroVector(OpVT, Subtarget, DAG, dl),
56937 SubVectorOps[0], DAG.getIntPtrConstant(0, dl));
56938 }
56939
56940 // If this is a broadcast insert into an upper undef, use a larger broadcast.
56941 if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST)
56942 return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0));
56943
56944 // If this is a broadcast load inserted into an upper undef, use a larger
56945 // broadcast load.
56946 if (Vec.isUndef() && IdxVal != 0 && SubVec.hasOneUse() &&
56947 SubVec.getOpcode() == X86ISD::VBROADCAST_LOAD) {
56948 auto *MemIntr = cast<MemIntrinsicSDNode>(SubVec);
56949 SDVTList Tys = DAG.getVTList(OpVT, MVT::Other);
56950 SDValue Ops[] = { MemIntr->getChain(), MemIntr->getBasePtr() };
56951 SDValue BcastLd =
56952 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
56953 MemIntr->getMemoryVT(),
56954 MemIntr->getMemOperand());
56955 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
56956 return BcastLd;
56957 }
56958
56959 // If we're splatting the lower half subvector of a full vector load into the
56960 // upper half, attempt to create a subvector broadcast.
56961 if (IdxVal == (OpVT.getVectorNumElements() / 2) && SubVec.hasOneUse() &&
56962 Vec.getValueSizeInBits() == (2 * SubVec.getValueSizeInBits())) {
56963 auto *VecLd = dyn_cast<LoadSDNode>(Vec);
56964 auto *SubLd = dyn_cast<LoadSDNode>(SubVec);
56965 if (VecLd && SubLd &&
56966 DAG.areNonVolatileConsecutiveLoads(SubLd, VecLd,
56967 SubVec.getValueSizeInBits() / 8, 0))
56968 return getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, dl, OpVT, SubVecVT,
56969 SubLd, 0, DAG);
56970 }
56971
56972 return SDValue();
56973}
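The insert-of-extract shuffle in combineINSERT_SUBVECTOR builds an identity mask over Vec and then redirects the inserted slots to the extracted slice of the second input. A standalone sketch of the mask construction (the helper name insertExtractMask is invented for this sketch):

#include <cassert>
#include <vector>

// Start from the identity mask over Vec, then remap the inserted slots to the
// extracted slice of SubVec's source; indices >= VecNumElts address the
// second shuffle input, as in the fold above.
std::vector<int> insertExtractMask(int VecNumElts, int SubVecNumElts,
                                   int IdxVal, int ExtIdxVal) {
  std::vector<int> Mask(VecNumElts);
  for (int i = 0; i != VecNumElts; ++i)
    Mask[i] = i;
  for (int i = 0; i != SubVecNumElts; ++i)
    Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
  return Mask;
}

int main() {
  // 8-element vector, inserting a 2-element slice taken at offset 2 of the
  // other source into position 4: {0,1,2,3, 10,11, 6,7}.
  assert((insertExtractMask(8, 2, 4, 2) ==
          std::vector<int>{0, 1, 2, 3, 10, 11, 6, 7}));
  return 0;
}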
56974
56975/// If we are extracting a subvector of a vector select and the select condition
56976/// is composed of concatenated vectors, try to narrow the select width. This
56977/// is a common pattern for AVX1 integer code because 256-bit selects may be
56978/// legal, but there is almost no integer math/logic available for 256-bit.
56979/// This function should only be called with legal types (otherwise, the calls
56980/// to get simple value types will assert).
56981static SDValue narrowExtractedVectorSelect(SDNode *Ext, SelectionDAG &DAG) {
56982 SDValue Sel = Ext->getOperand(0);
56983 SmallVector<SDValue, 4> CatOps;
56984 if (Sel.getOpcode() != ISD::VSELECT ||
56985 !collectConcatOps(Sel.getOperand(0).getNode(), CatOps, DAG))
56986 return SDValue();
56987
56988 // Note: We assume simple value types because this should only be called with
56989 // legal operations/types.
56990 // TODO: This can be extended to handle extraction to 256-bits.
56991 MVT VT = Ext->getSimpleValueType(0);
56992 if (!VT.is128BitVector())
56993 return SDValue();
56994
56995 MVT SelCondVT = Sel.getOperand(0).getSimpleValueType();
56996 if (!SelCondVT.is256BitVector() && !SelCondVT.is512BitVector())
56997 return SDValue();
56998
56999 MVT WideVT = Ext->getOperand(0).getSimpleValueType();
57000 MVT SelVT = Sel.getSimpleValueType();
57001 assert((SelVT.is256BitVector() || SelVT.is512BitVector()) &&
57002 "Unexpected vector type with legal operations");
57003
57004 unsigned SelElts = SelVT.getVectorNumElements();
57005 unsigned CastedElts = WideVT.getVectorNumElements();
57006 unsigned ExtIdx = Ext->getConstantOperandVal(1);
57007 if (SelElts % CastedElts == 0) {
57008 // The select has the same or more (narrower) elements than the extract
57009 // operand. The extraction index gets scaled by that factor.
57010 ExtIdx *= (SelElts / CastedElts);
57011 } else if (CastedElts % SelElts == 0) {
57012 // The select has less (wider) elements than the extract operand. Make sure
57013 // that the extraction index can be divided evenly.
57014 unsigned IndexDivisor = CastedElts / SelElts;
57015 if (ExtIdx % IndexDivisor != 0)
57016 return SDValue();
57017 ExtIdx /= IndexDivisor;
57018 } else {
57019 llvm_unreachable("Element count of simple vector types are not divisible?");
57020 }
57021
57022 unsigned NarrowingFactor = WideVT.getSizeInBits() / VT.getSizeInBits();
57023 unsigned NarrowElts = SelElts / NarrowingFactor;
57024 MVT NarrowSelVT = MVT::getVectorVT(SelVT.getVectorElementType(), NarrowElts);
57025 SDLoc DL(Ext);
57026 SDValue ExtCond = extract128BitVector(Sel.getOperand(0), ExtIdx, DAG, DL);
57027 SDValue ExtT = extract128BitVector(Sel.getOperand(1), ExtIdx, DAG, DL);
57028 SDValue ExtF = extract128BitVector(Sel.getOperand(2), ExtIdx, DAG, DL);
57029 SDValue NarrowSel = DAG.getSelect(DL, NarrowSelVT, ExtCond, ExtT, ExtF);
57030 return DAG.getBitcast(VT, NarrowSel);
57031}
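The index rescaling in narrowExtractedVectorSelect converts an extract index counted in elements of the extract operand's type into elements of the select's type, giving up when the index does not divide evenly. A standalone sketch of that arithmetic (the helper name rescaleExtIdx is invented here):

#include <cassert>
#include <optional>

// Convert an extract index counted in elements of the extract operand's type
// (CastedElts per vector) into elements of the select's type (SelElts per
// vector); fail when the index does not divide evenly.
std::optional<unsigned> rescaleExtIdx(unsigned ExtIdx, unsigned SelElts,
                                      unsigned CastedElts) {
  if (SelElts % CastedElts == 0)
    return ExtIdx * (SelElts / CastedElts);
  if (CastedElts % SelElts == 0) {
    unsigned Div = CastedElts / SelElts;
    if (ExtIdx % Div != 0)
      return std::nullopt;
    return ExtIdx / Div;
  }
  return std::nullopt; // simple VT element counts always divide one way
}

int main() {
  // v8i32 select, extract operand viewed as v4i64: index 2 in i64 units
  // becomes index 4 in i32 units.
  assert(*rescaleExtIdx(2, 8, 4) == 4);
  // v4i64 select, extract operand viewed as v8i32: an odd i32 index cannot
  // be expressed in i64 units.
  assert(!rescaleExtIdx(3, 4, 8).has_value());
  return 0;
}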
57032
57033static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
57034 TargetLowering::DAGCombinerInfo &DCI,
57035 const X86Subtarget &Subtarget) {
57036 // For AVX1 only, if we are extracting from a 256-bit and+not (which will
57037 // eventually get combined/lowered into ANDNP) with a concatenated operand,
57038 // split the 'and' into 128-bit ops to avoid the concatenate and extract.
57039 // We let generic combining take over from there to simplify the
57040 // insert/extract and 'not'.
57041 // This pattern emerges during AVX1 legalization. We handle it before lowering
57042 // to avoid complications like splitting constant vector loads.
57043
57044 // Capture the original wide type in the likely case that we need to bitcast
57045 // back to this type.
57046 if (!N->getValueType(0).isSimple())
57047 return SDValue();
57048
57049 MVT VT = N->getSimpleValueType(0);
57050 SDValue InVec = N->getOperand(0);
57051 unsigned IdxVal = N->getConstantOperandVal(1);
57052 SDValue InVecBC = peekThroughBitcasts(InVec);
57053 EVT InVecVT = InVec.getValueType();
57054 unsigned SizeInBits = VT.getSizeInBits();
57055 unsigned InSizeInBits = InVecVT.getSizeInBits();
57056 unsigned NumSubElts = VT.getVectorNumElements();
57057 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
57058
57059 if (Subtarget.hasAVX() && !Subtarget.hasAVX2() &&
57060 TLI.isTypeLegal(InVecVT) &&
57061 InSizeInBits == 256 && InVecBC.getOpcode() == ISD::AND) {
57062 auto isConcatenatedNot = [](SDValue V) {
57063 V = peekThroughBitcasts(V);
57064 if (!isBitwiseNot(V))
57065 return false;
57066 SDValue NotOp = V->getOperand(0);
57067 return peekThroughBitcasts(NotOp).getOpcode() == ISD::CONCAT_VECTORS;
57068 };
57069 if (isConcatenatedNot(InVecBC.getOperand(0)) ||
57070 isConcatenatedNot(InVecBC.getOperand(1))) {
57071 // extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1
57072 SDValue Concat = splitVectorIntBinary(InVecBC, DAG);
57073 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT,
57074 DAG.getBitcast(InVecVT, Concat), N->getOperand(1));
57075 }
57076 }
57077
57078 if (DCI.isBeforeLegalizeOps())
57079 return SDValue();
57080
57081 if (SDValue V = narrowExtractedVectorSelect(N, DAG))
57082 return V;
57083
57084 if (ISD::isBuildVectorAllZeros(InVec.getNode()))
57085 return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
57086
57087 if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
57088 if (VT.getScalarType() == MVT::i1)
57089 return DAG.getConstant(1, SDLoc(N), VT);
57090 return getOnesVector(VT, DAG, SDLoc(N));
57091 }
57092
57093 if (InVec.getOpcode() == ISD::BUILD_VECTOR)
57094 return DAG.getBuildVector(VT, SDLoc(N),
57095 InVec->ops().slice(IdxVal, NumSubElts));
57096
57097 // If we are extracting from an insert into a larger vector, replace with a
57098 // smaller insert if we don't access less than the original subvector. Don't
57099 // do this for i1 vectors.
57100 // TODO: Relax the matching indices requirement?
57101 if (VT.getVectorElementType() != MVT::i1 &&
57102 InVec.getOpcode() == ISD::INSERT_SUBVECTOR && InVec.hasOneUse() &&
57103 IdxVal == InVec.getConstantOperandVal(2) &&
57104 InVec.getOperand(1).getValueSizeInBits() <= SizeInBits) {
57105 SDLoc DL(N);
57106 SDValue NewExt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT,
57107 InVec.getOperand(0), N->getOperand(1));
57108 unsigned NewIdxVal = InVec.getConstantOperandVal(2) - IdxVal;
57109 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, NewExt,
57110 InVec.getOperand(1),
57111 DAG.getVectorIdxConstant(NewIdxVal, DL));
57112 }
57113
57114 // If we're extracting an upper subvector from a broadcast, we should just
57115 // extract the lowest subvector instead, which should allow
57116 // SimplifyDemandedVectorElts to do more simplifications.
57117 if (IdxVal != 0 && (InVec.getOpcode() == X86ISD::VBROADCAST ||
57118 InVec.getOpcode() == X86ISD::VBROADCAST_LOAD ||
57119 DAG.isSplatValue(InVec, /*AllowUndefs*/ false)))
57120 return extractSubVector(InVec, 0, DAG, SDLoc(N), SizeInBits);
57121
57122 // If we're extracting a broadcasted subvector, just use the lowest subvector.
57123 if (IdxVal != 0 && InVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
57124 cast<MemIntrinsicSDNode>(InVec)->getMemoryVT() == VT)
57125 return extractSubVector(InVec, 0, DAG, SDLoc(N), SizeInBits);
57126
57127 // Attempt to extract from the source of a shuffle vector.
57128 if ((InSizeInBits % SizeInBits) == 0 && (IdxVal % NumSubElts) == 0) {
57129 SmallVector<int, 32> ShuffleMask;
57130 SmallVector<int, 32> ScaledMask;
57131 SmallVector<SDValue, 2> ShuffleInputs;
57132 unsigned NumSubVecs = InSizeInBits / SizeInBits;
57133 // Decode the shuffle mask and scale it so that it shuffles whole subvectors.
57134 if (getTargetShuffleInputs(InVecBC, ShuffleInputs, ShuffleMask, DAG) &&
57135 scaleShuffleElements(ShuffleMask, NumSubVecs, ScaledMask)) {
57136 unsigned SubVecIdx = IdxVal / NumSubElts;
57137 if (ScaledMask[SubVecIdx] == SM_SentinelUndef)
57138 return DAG.getUNDEF(VT);
57139 if (ScaledMask[SubVecIdx] == SM_SentinelZero)
57140 return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
57141 SDValue Src = ShuffleInputs[ScaledMask[SubVecIdx] / NumSubVecs];
57142 if (Src.getValueSizeInBits() == InSizeInBits) {
57143 unsigned SrcSubVecIdx = ScaledMask[SubVecIdx] % NumSubVecs;
57144 unsigned SrcEltIdx = SrcSubVecIdx * NumSubElts;
57145 return extractSubVector(DAG.getBitcast(InVecVT, Src), SrcEltIdx, DAG,
57146 SDLoc(N), SizeInBits);
57147 }
57148 }
57149 }
57150
57151 // If we're extracting the lowest subvector and we're the only user,
57152 // we may be able to perform this with a smaller vector width.
57153 unsigned InOpcode = InVec.getOpcode();
57154 if (InVec.hasOneUse()) {
57155 if (IdxVal == 0 && VT == MVT::v2f64 && InVecVT == MVT::v4f64) {
57156 // v2f64 CVTDQ2PD(v4i32).
57157 if (InOpcode == ISD::SINT_TO_FP &&
57158 InVec.getOperand(0).getValueType() == MVT::v4i32) {
57159 return DAG.getNode(X86ISD::CVTSI2P, SDLoc(N), VT, InVec.getOperand(0));
57160 }
57161 // v2f64 CVTUDQ2PD(v4i32).
57162 if (InOpcode == ISD::UINT_TO_FP && Subtarget.hasVLX() &&
57163 InVec.getOperand(0).getValueType() == MVT::v4i32) {
57164 return DAG.getNode(X86ISD::CVTUI2P, SDLoc(N), VT, InVec.getOperand(0));
57165 }
57166 // v2f64 CVTPS2PD(v4f32).
57167 if (InOpcode == ISD::FP_EXTEND &&
57168 InVec.getOperand(0).getValueType() == MVT::v4f32) {
57169 return DAG.getNode(X86ISD::VFPEXT, SDLoc(N), VT, InVec.getOperand(0));
57170 }
57171 }
57172 if (IdxVal == 0 &&
57173 (ISD::isExtOpcode(InOpcode) || ISD::isExtVecInRegOpcode(InOpcode)) &&
57174 (SizeInBits == 128 || SizeInBits == 256) &&
57175 InVec.getOperand(0).getValueSizeInBits() >= SizeInBits) {
57176 SDLoc DL(N);
57177 SDValue Ext = InVec.getOperand(0);
57178 if (Ext.getValueSizeInBits() > SizeInBits)
57179 Ext = extractSubVector(Ext, 0, DAG, DL, SizeInBits);
57180 unsigned ExtOp = DAG.getOpcode_EXTEND_VECTOR_INREG(InOpcode);
57181 return DAG.getNode(ExtOp, DL, VT, Ext);
57182 }
57183 if (IdxVal == 0 && InOpcode == ISD::VSELECT &&
57184 InVec.getOperand(0).getValueType().is256BitVector() &&
57185 InVec.getOperand(1).getValueType().is256BitVector() &&
57186 InVec.getOperand(2).getValueType().is256BitVector()) {
57187 SDLoc DL(N);
57188 SDValue Ext0 = extractSubVector(InVec.getOperand(0), 0, DAG, DL, 128);
57189 SDValue Ext1 = extractSubVector(InVec.getOperand(1), 0, DAG, DL, 128);
57190 SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128);
57191 return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2);
57192 }
57193 if (IdxVal == 0 && InOpcode == ISD::TRUNCATE && Subtarget.hasVLX() &&
57194 (VT.is128BitVector() || VT.is256BitVector())) {
57195 SDLoc DL(N);
57196 SDValue InVecSrc = InVec.getOperand(0);
57197 unsigned Scale = InVecSrc.getValueSizeInBits() / InSizeInBits;
57198 SDValue Ext = extractSubVector(InVecSrc, 0, DAG, DL, Scale * SizeInBits);
57199 return DAG.getNode(InOpcode, DL, VT, Ext);
57200 }
57201 if (InOpcode == X86ISD::MOVDDUP &&
57202 (VT.is128BitVector() || VT.is256BitVector())) {
57203 SDLoc DL(N);
57204 SDValue Ext0 =
57205 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
57206 return DAG.getNode(InOpcode, DL, VT, Ext0);
57207 }
57208 }
57209
57210 // Always split vXi64 logical shifts where we're extracting the upper 32 bits,
57211 // as this is very likely to fold into a shuffle/truncation.
57212 if ((InOpcode == X86ISD::VSHLI || InOpcode == X86ISD::VSRLI) &&
57213 InVecVT.getScalarSizeInBits() == 64 &&
57214 InVec.getConstantOperandAPInt(1) == 32) {
57215 SDLoc DL(N);
57216 SDValue Ext =
57217 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
57218 return DAG.getNode(InOpcode, DL, VT, Ext, InVec.getOperand(1));
57219 }
57220
57221 return SDValue();
57222}
57223
57224static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
57225 EVT VT = N->getValueType(0);
57226 SDValue Src = N->getOperand(0);
57227 SDLoc DL(N);
57228
57229 // If this is a scalar to vector to v1i1 from an AND with 1, bypass the and.
57230 // This occurs frequently in our masked scalar intrinsic code and our
57231 // floating point select lowering with AVX512.
57232 // TODO: SimplifyDemandedBits instead?
57233 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse() &&
57234 isOneConstant(Src.getOperand(1)))
57235 return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Src.getOperand(0));
57236
57237 // Combine scalar_to_vector of an extract_vector_elt into an extract_subvec.
57238 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
57239 Src.hasOneUse() && Src.getOperand(0).getValueType().isVector() &&
57240 Src.getOperand(0).getValueType().getVectorElementType() == MVT::i1)
57241 if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
57242 if (C->isZero())
57243 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src.getOperand(0),
57244 Src.getOperand(1));
57245
57246 // Reduce v2i64 to v4i32 if we don't need the upper bits or they are known zero.
57247 // TODO: Move to DAGCombine/SimplifyDemandedBits?
57248 if ((VT == MVT::v2i64 || VT == MVT::v2f64) && Src.hasOneUse()) {
57249 auto IsExt64 = [&DAG](SDValue Op, bool IsZeroExt) {
57250 if (Op.getValueType() != MVT::i64)
57251 return SDValue();
57252 unsigned Opc = IsZeroExt ? ISD::ZERO_EXTEND : ISD::ANY_EXTEND;
57253 if (Op.getOpcode() == Opc &&
57254 Op.getOperand(0).getScalarValueSizeInBits() <= 32)
57255 return Op.getOperand(0);
57256 unsigned Ext = IsZeroExt ? ISD::ZEXTLOAD : ISD::EXTLOAD;
57257 if (auto *Ld = dyn_cast<LoadSDNode>(Op))
57258 if (Ld->getExtensionType() == Ext &&
57259 Ld->getMemoryVT().getScalarSizeInBits() <= 32)
57260 return Op;
57261 if (IsZeroExt) {
57262 KnownBits Known = DAG.computeKnownBits(Op);
57263 if (!Known.isConstant() && Known.countMinLeadingZeros() >= 32)
57264 return Op;
57265 }
57266 return SDValue();
57267 };
57268
57269 if (SDValue AnyExt = IsExt64(peekThroughOneUseBitcasts(Src), false))
57270 return DAG.getBitcast(
57271 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
57272 DAG.getAnyExtOrTrunc(AnyExt, DL, MVT::i32)));
57273
57274 if (SDValue ZeroExt = IsExt64(peekThroughOneUseBitcasts(Src), true))
57275 return DAG.getBitcast(
57276 VT,
57277 DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v4i32,
57278 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
57279 DAG.getZExtOrTrunc(ZeroExt, DL, MVT::i32))));
57280 }
57281
57282 // Combine (v2i64 (scalar_to_vector (i64 (bitconvert (mmx))))) to MOVQ2DQ.
57283 if (VT == MVT::v2i64 && Src.getOpcode() == ISD::BITCAST &&
57284 Src.getOperand(0).getValueType() == MVT::x86mmx)
57285 return DAG.getNode(X86ISD::MOVQ2DQ, DL, VT, Src.getOperand(0));
57286
57287 // See if we're broadcasting the scalar value, in which case just reuse that.
57288 // Ensure the same SDValue from the SDNode use is being used.
57289 if (VT.getScalarType() == Src.getValueType())
57290 for (SDNode *User : Src->uses())
57291 if (User->getOpcode() == X86ISD::VBROADCAST &&
57292 Src == User->getOperand(0)) {
57293 unsigned SizeInBits = VT.getFixedSizeInBits();
57294 unsigned BroadcastSizeInBits =
57295 User->getValueSizeInBits(0).getFixedValue();
57296 if (BroadcastSizeInBits == SizeInBits)
57297 return SDValue(User, 0);
57298 if (BroadcastSizeInBits > SizeInBits)
57299 return extractSubVector(SDValue(User, 0), 0, DAG, DL, SizeInBits);
57300 // TODO: Handle BroadcastSizeInBits < SizeInBits when we have test
57301 // coverage.
57302 }
57303
57304 return SDValue();
57305}
57306
57307// Simplify PMULDQ and PMULUDQ operations.
57308static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
57309 TargetLowering::DAGCombinerInfo &DCI,
57310 const X86Subtarget &Subtarget) {
57311 SDValue LHS = N->getOperand(0);
57312 SDValue RHS = N->getOperand(1);
57313
57314 // Canonicalize constant to RHS.
57315 if (DAG.isConstantIntBuildVectorOrConstantInt(LHS) &&
57316 !DAG.isConstantIntBuildVectorOrConstantInt(RHS))
57317 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), RHS, LHS);
57318
57319 // Multiply by zero.
57320 // Don't return RHS as it may contain UNDEFs.
57321 if (ISD::isBuildVectorAllZeros(RHS.getNode()))
57322 return DAG.getConstant(0, SDLoc(N), N->getValueType(0));
57323
57324 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
57325 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
57326 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(64), DCI))
57327 return SDValue(N, 0);
57328
57329 // If the input is an extend_invec and the SimplifyDemandedBits call didn't
57330 // convert it to any_extend_invec, due to the LegalOperations check, do the
57331 // conversion directly to a vector shuffle manually. This exposes combine
57332 // opportunities missed by combineEXTEND_VECTOR_INREG not calling
57333 // combineX86ShufflesRecursively on SSE4.1 targets.
57334 // FIXME: This is basically a hack around several other issues related to
57335 // ANY_EXTEND_VECTOR_INREG.
57336 if (N->getValueType(0) == MVT::v2i64 && LHS.hasOneUse() &&
57337 (LHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
57338 LHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
57339 LHS.getOperand(0).getValueType() == MVT::v4i32) {
57340 SDLoc dl(N);
57341 LHS = DAG.getVectorShuffle(MVT::v4i32, dl, LHS.getOperand(0),
57342 LHS.getOperand(0), { 0, -1, 1, -1 });
57343 LHS = DAG.getBitcast(MVT::v2i64, LHS);
57344 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
57345 }
57346 if (N->getValueType(0) == MVT::v2i64 && RHS.hasOneUse() &&
57347 (RHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
57348 RHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
57349 RHS.getOperand(0).getValueType() == MVT::v4i32) {
57350 SDLoc dl(N);
57351 RHS = DAG.getVectorShuffle(MVT::v4i32, dl, RHS.getOperand(0),
57352 RHS.getOperand(0), { 0, -1, 1, -1 });
57353 RHS = DAG.getBitcast(MVT::v2i64, RHS);
57354 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
57355 }
57356
57357 return SDValue();
57358}
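The demanded-bits reasoning in combinePMULDQ hinges on PMULDQ/PMULUDQ multiplying only the low 32 bits of each 64-bit lane. A standalone scalar model of one unsigned lane (the helper name pmuludqLane is invented here, not an LLVM API):

#include <cassert>
#include <cstdint>

// Each 64-bit lane of PMULUDQ multiplies only the low 32 bits of its inputs,
// which is why the operands' upper halves are never demanded even though all
// 64 result bits are.
uint64_t pmuludqLane(uint64_t A, uint64_t B) {
  return uint64_t(uint32_t(A)) * uint64_t(uint32_t(B));
}

int main() {
  // High input bits are ignored: only 3 * 5 survives.
  assert(pmuludqLane(0xFFFFFFFF00000003ULL, 0x1234567800000005ULL) == 15);
  return 0;
}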
57359
57360// Simplify VPMADDUBSW/VPMADDWD operations.
57361static SDValue combineVPMADD(SDNode *N, SelectionDAG &DAG,
57362 TargetLowering::DAGCombinerInfo &DCI) {
57363 EVT VT = N->getValueType(0);
57364 SDValue LHS = N->getOperand(0);
57365 SDValue RHS = N->getOperand(1);
57366
57367 // Multiply by zero.
57368 // Don't return LHS/RHS as it may contain UNDEFs.
57369 if (ISD::isBuildVectorAllZeros(LHS.getNode()) ||
57370 ISD::isBuildVectorAllZeros(RHS.getNode()))
57371 return DAG.getConstant(0, SDLoc(N), VT);
57372
57373 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
57374 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
57375 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
57376 return SDValue(N, 0);
57377
57378 return SDValue();
57379}
57380
57381static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG,
57382 TargetLowering::DAGCombinerInfo &DCI,
57383 const X86Subtarget &Subtarget) {
57384 EVT VT = N->getValueType(0);
57385 SDValue In = N->getOperand(0);
57386 unsigned Opcode = N->getOpcode();
57387 unsigned InOpcode = In.getOpcode();
57388 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
57389 SDLoc DL(N);
57390
57391 // Try to merge vector loads and extend_inreg to an extload.
57392 if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) &&
57393 In.hasOneUse()) {
57394 auto *Ld = cast<LoadSDNode>(In);
57395 if (Ld->isSimple()) {
57396 MVT SVT = In.getSimpleValueType().getVectorElementType();
57397 ISD::LoadExtType Ext = Opcode == ISD::SIGN_EXTEND_VECTOR_INREG
57398 ? ISD::SEXTLOAD
57399 : ISD::ZEXTLOAD;
57400 EVT MemVT = VT.changeVectorElementType(SVT);
57401 if (TLI.isLoadExtLegal(Ext, VT, MemVT)) {
57402 SDValue Load = DAG.getExtLoad(
57403 Ext, DL, VT, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(),
57404 MemVT, Ld->getOriginalAlign(), Ld->getMemOperand()->getFlags());
57405 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
57406 return Load;
57407 }
57408 }
57409 }
57410
57411 // Fold EXTEND_VECTOR_INREG(EXTEND_VECTOR_INREG(X)) -> EXTEND_VECTOR_INREG(X).
57412 if (Opcode == InOpcode)
57413 return DAG.getNode(Opcode, DL, VT, In.getOperand(0));
57414
57415 // Fold EXTEND_VECTOR_INREG(EXTRACT_SUBVECTOR(EXTEND(X),0))
57416 // -> EXTEND_VECTOR_INREG(X).
57417 // TODO: Handle non-zero subvector indices.
57418 if (InOpcode == ISD::EXTRACT_SUBVECTOR && In.getConstantOperandVal(1) == 0 &&
57419 In.getOperand(0).getOpcode() == DAG.getOpcode_EXTEND(Opcode) &&
57420 In.getOperand(0).getOperand(0).getValueSizeInBits() ==
57421 In.getValueSizeInBits())
57422 return DAG.getNode(Opcode, DL, VT, In.getOperand(0).getOperand(0));
57423
57424 // Fold EXTEND_VECTOR_INREG(BUILD_VECTOR(X,Y,?,?)) -> BUILD_VECTOR(X,0,Y,0).
57425 // TODO: Move to DAGCombine?
57426 if (!DCI.isBeforeLegalizeOps() && Opcode == ISD::ZERO_EXTEND_VECTOR_INREG &&
57427 In.getOpcode() == ISD::BUILD_VECTOR && In.hasOneUse() &&
57428 In.getValueSizeInBits() == VT.getSizeInBits()) {
57429 unsigned NumElts = VT.getVectorNumElements();
57430 unsigned Scale = VT.getScalarSizeInBits() / In.getScalarValueSizeInBits();
57431 EVT EltVT = In.getOperand(0).getValueType();
57432 SmallVector<SDValue> Elts(Scale * NumElts, DAG.getConstant(0, DL, EltVT));
57433 for (unsigned I = 0; I != NumElts; ++I)
57434 Elts[I * Scale] = In.getOperand(I);
57435 return DAG.getBitcast(VT, DAG.getBuildVector(In.getValueType(), DL, Elts));
57436 }
57437
57438 // Attempt to combine as a shuffle on SSE41+ targets.
57439 if ((Opcode == ISD::ANY_EXTEND_VECTOR_INREG ||
57440 Opcode == ISD::ZERO_EXTEND_VECTOR_INREG) &&
57441 Subtarget.hasSSE41()) {
57442 SDValue Op(N, 0);
57443 if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType()))
57444 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
57445 return Res;
57446 }
57447
57448 return SDValue();
57449}
57450
57451static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG,
57452 TargetLowering::DAGCombinerInfo &DCI) {
57453 EVT VT = N->getValueType(0);
57454
57455 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
57456 return DAG.getConstant(0, SDLoc(N), VT);
57457
57458 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
57459 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
57460 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
57461 return SDValue(N, 0);
57462
57463 return SDValue();
57464}
57465
57466// Optimize (fp16_to_fp (fp_to_fp16 X)) to VCVTPS2PH followed by VCVTPH2PS.
57467 // Done as a combine because the lowering for fp16_to_fp and fp_to_fp16 produces
57468 // extra instructions between the conversions due to going to scalar and back.
57469static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG,
57470 const X86Subtarget &Subtarget) {
57471 if (Subtarget.useSoftFloat() || !Subtarget.hasF16C())
57472 return SDValue();
57473
57474 if (N->getOperand(0).getOpcode() != ISD::FP_TO_FP16)
57475 return SDValue();
57476
57477 if (N->getValueType(0) != MVT::f32 ||
57478 N->getOperand(0).getOperand(0).getValueType() != MVT::f32)
57479 return SDValue();
57480
57481 SDLoc dl(N);
57482 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32,
57483 N->getOperand(0).getOperand(0));
57484 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
57485 DAG.getTargetConstant(4, dl, MVT::i32));
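// Note: assuming the standard VCVTPS2PH immediate encoding, the value 4 above
// sets bit 2, which selects the rounding mode from MXCSR rather than a static
// mode in imm[1:0].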
57486 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
57487 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
57488 DAG.getIntPtrConstant(0, dl));
57489}
57490
57491static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG,
57492 const X86Subtarget &Subtarget) {
57493 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
57494 return SDValue();
57495
57496 if (Subtarget.hasFP16())
57497 return SDValue();
57498
57499 bool IsStrict = N->isStrictFPOpcode();
57500 EVT VT = N->getValueType(0);
57501 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
57502 EVT SrcVT = Src.getValueType();
57503
57504 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::f16)
57505 return SDValue();
57506
57507 if (VT.getVectorElementType() != MVT::f32 &&
57508 VT.getVectorElementType() != MVT::f64)
57509 return SDValue();
57510
57511 unsigned NumElts = VT.getVectorNumElements();
57512 if (NumElts == 1 || !isPowerOf2_32(NumElts))
57513 return SDValue();
57514
57515 SDLoc dl(N);
57516
57517 // Convert the input to vXi16.
57518 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
57519 Src = DAG.getBitcast(IntVT, Src);
57520
57521 // Widen to at least 8 input elements.
57522 if (NumElts < 8) {
57523 unsigned NumConcats = 8 / NumElts;
57524 SDValue Fill = NumElts == 4 ? DAG.getUNDEF(IntVT)
57525 : DAG.getConstant(0, dl, IntVT);
57526 SmallVector<SDValue, 4> Ops(NumConcats, Fill);
57527 Ops[0] = Src;
57528 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, Ops);
57529 }
57530
57531 // Destination is vXf32 with at least 4 elements.
57532 EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32,
57533 std::max(4U, NumElts));
57534 SDValue Cvt, Chain;
57535 if (IsStrict) {
57536 Cvt = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {CvtVT, MVT::Other},
57537 {N->getOperand(0), Src});
57538 Chain = Cvt.getValue(1);
57539 } else {
57540 Cvt = DAG.getNode(X86ISD::CVTPH2PS, dl, CvtVT, Src);
57541 }
57542
57543 if (NumElts < 4) {
57544 assert(NumElts == 2 && "Unexpected size");
57545 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Cvt,
57546 DAG.getIntPtrConstant(0, dl));
57547 }
57548
57549 if (IsStrict) {
57550 // Extend to the original VT if necessary.
57551 if (Cvt.getValueType() != VT) {
57552 Cvt = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {VT, MVT::Other},
57553 {Chain, Cvt});
57554 Chain = Cvt.getValue(1);
57555 }
57556 return DAG.getMergeValues({Cvt, Chain}, dl);
57557 }
57558
57559 // Extend to the original VT if necessary.
57560 return DAG.getNode(ISD::FP_EXTEND, dl, VT, Cvt);
57561}
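// Editor's note: a worked trace (not part of this file) of the path above for
// a v2f16 -> v2f64 fp_extend without AVX512-FP16: bitcast the source to
// v2i16, concatenate with zeros up to v8i16, CVTPH2PS to v4f32, extract the
// low v2f32, then fp_extend the result to v2f64.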
57562
57563// Try to find a larger VBROADCAST_LOAD/SUBV_BROADCAST_LOAD that we can extract
57564// from. Limit this to cases where the loads have the same input chain and the
57565// output chains are unused. This avoids any memory ordering issues.
57566static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG,
57567 TargetLowering::DAGCombinerInfo &DCI) {
57568 assert((N->getOpcode() == X86ISD::VBROADCAST_LOAD ||
57569 N->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) &&
57570 "Unknown broadcast load type");
57571
57572 // Only do this if the chain result is unused.
57573 if (N->hasAnyUseOfValue(1))
57574 return SDValue();
57575
57576 auto *MemIntrin = cast<MemIntrinsicSDNode>(N);
57577
57578 SDValue Ptr = MemIntrin->getBasePtr();
57579 SDValue Chain = MemIntrin->getChain();
57580 EVT VT = N->getSimpleValueType(0);
57581 EVT MemVT = MemIntrin->getMemoryVT();
57582
57583 // Look at other users of our base pointer and try to find a wider broadcast.
57584 // The input chain and the size of the memory VT must match.
57585 for (SDNode *User : Ptr->uses())
57586 if (User != N && User->getOpcode() == N->getOpcode() &&
57587 cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
57588 cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
57589 cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
57590 MemVT.getSizeInBits() &&
57591 !User->hasAnyUseOfValue(1) &&
57592 User->getValueSizeInBits(0).getFixedValue() > VT.getFixedSizeInBits()) {
57593 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
57594 VT.getSizeInBits());
57595 Extract = DAG.getBitcast(VT, Extract);
57596 return DCI.CombineTo(N, Extract, SDValue(User, 1));
57597 }
57598
57599 return SDValue();
57600}
57601
57602static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG,
57603 const X86Subtarget &Subtarget) {
57604 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
57605 return SDValue();
57606
57607 bool IsStrict = N->isStrictFPOpcode();
57608 EVT VT = N->getValueType(0);
57609 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
57610 EVT SrcVT = Src.getValueType();
57611
57612 if (!VT.isVector() || VT.getVectorElementType() != MVT::f16 ||
57613 SrcVT.getVectorElementType() != MVT::f32)
57614 return SDValue();
57615
57616 SDLoc dl(N);
57617
57618 SDValue Cvt, Chain;
57619 unsigned NumElts = VT.getVectorNumElements();
57620 if (Subtarget.hasFP16()) {
57621 // Combine (v8f16 fp_round(concat_vectors(v4f32 (xint_to_fp v4i64), ..)))
57622 // into (v8f16 vector_shuffle(v8f16 (CVTXI2P v4i64), ..))
57623 if (NumElts == 8 && Src.getOpcode() == ISD::CONCAT_VECTORS) {
57624 SDValue Cvt0, Cvt1;
57625 SDValue Op0 = Src.getOperand(0);
57626 SDValue Op1 = Src.getOperand(1);
57627 bool IsOp0Strict = Op0->isStrictFPOpcode();
57628 if (Op0.getOpcode() != Op1.getOpcode() ||
57629 Op0.getOperand(IsOp0Strict ? 1 : 0).getValueType() != MVT::v4i64 ||
57630 Op1.getOperand(IsOp0Strict ? 1 : 0).getValueType() != MVT::v4i64) {
57631 return SDValue();
57632 }
57633 int Mask[8] = {0, 1, 2, 3, 8, 9, 10, 11};
57634 if (IsStrict) {
57635 assert(IsOp0Strict && "Op0 must be strict node");
57636 unsigned Opc = Op0.getOpcode() == ISD::STRICT_SINT_TO_FP
57637 ? X86ISD::STRICT_CVTSI2P
57638 : X86ISD::STRICT_CVTUI2P;
57639 Cvt0 = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
57640 {Op0.getOperand(0), Op0.getOperand(1)});
57641 Cvt1 = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
57642 {Op1.getOperand(0), Op1.getOperand(1)});
57643 Cvt = DAG.getVectorShuffle(MVT::v8f16, dl, Cvt0, Cvt1, Mask);
57644 return DAG.getMergeValues({Cvt, Cvt0.getValue(1)}, dl);
57645 }
57646 unsigned Opc = Op0.getOpcode() == ISD::SINT_TO_FP ? X86ISD::CVTSI2P
57647 : X86ISD::CVTUI2P;
57648 Cvt0 = DAG.getNode(Opc, dl, MVT::v8f16, Op0.getOperand(0));
57649 Cvt1 = DAG.getNode(Opc, dl, MVT::v8f16, Op1.getOperand(0));
57650 return Cvt = DAG.getVectorShuffle(MVT::v8f16, dl, Cvt0, Cvt1, Mask);
57651 }
57652 return SDValue();
57653 }
57654
57655 if (NumElts == 1 || !isPowerOf2_32(NumElts))
57656 return SDValue();
57657
57658 // Widen to at least 4 input elements.
57659 if (NumElts < 4)
57660 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
57661 DAG.getConstantFP(0.0, dl, SrcVT));
57662
57663 // Destination is v8i16 with at least 8 elements.
57664 EVT CvtVT =
57665 EVT::getVectorVT(*DAG.getContext(), MVT::i16, std::max(8U, NumElts));
57666 SDValue Rnd = DAG.getTargetConstant(4, dl, MVT::i32);
57667 if (IsStrict) {
57668 Cvt = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {CvtVT, MVT::Other},
57669 {N->getOperand(0), Src, Rnd});
57670 Chain = Cvt.getValue(1);
57671 } else {
57672 Cvt = DAG.getNode(X86ISD::CVTPS2PH, dl, CvtVT, Src, Rnd);
57673 }
57674
57675 // Extract down to real number of elements.
57676 if (NumElts < 8) {
57677 EVT IntVT = VT.changeVectorElementTypeToInteger();
57678 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, IntVT, Cvt,
57679 DAG.getIntPtrConstant(0, dl));
57680 }
57681
57682 Cvt = DAG.getBitcast(VT, Cvt);
57683
57684 if (IsStrict)
57685 return DAG.getMergeValues({Cvt, Chain}, dl);
57686
57687 return Cvt;
57688}
57689
57690static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG) {
57691 SDValue Src = N->getOperand(0);
57692
57693 // Turn MOVDQ2Q+simple_load into an mmx load.
57694 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
57695 LoadSDNode *LN = cast<LoadSDNode>(Src.getNode());
57696
57697 if (LN->isSimple()) {
57698 SDValue NewLd = DAG.getLoad(MVT::x86mmx, SDLoc(N), LN->getChain(),
57699 LN->getBasePtr(),
57700 LN->getPointerInfo(),
57701 LN->getOriginalAlign(),
57702 LN->getMemOperand()->getFlags());
57703 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), NewLd.getValue(1));
57704 return NewLd;
57705 }
57706 }
57707
57708 return SDValue();
57709}
57710
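// Attempt to simplify PDEP by running SimplifyDemandedBits over its result.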
57711static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG,
57712 TargetLowering::DAGCombinerInfo &DCI) {
57713 unsigned NumBits = N->getSimpleValueType(0).getSizeInBits();
57714 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
57715 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBits), DCI))
57716 return SDValue(N, 0);
57717
57718 return SDValue();
57719}
57720
57721SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
57722 DAGCombinerInfo &DCI) const {
57723 SelectionDAG &DAG = DCI.DAG;
57724 switch (N->getOpcode()) {
57725 default: break;
57726 case ISD::SCALAR_TO_VECTOR:
57727 return combineScalarToVector(N, DAG);
57728 case ISD::EXTRACT_VECTOR_ELT:
57729 case X86ISD::PEXTRW:
57730 case X86ISD::PEXTRB:
57731 return combineExtractVectorElt(N, DAG, DCI, Subtarget);
57732 case ISD::CONCAT_VECTORS:
57733 return combineCONCAT_VECTORS(N, DAG, DCI, Subtarget);
57734 case ISD::INSERT_SUBVECTOR:
57735 return combineINSERT_SUBVECTOR(N, DAG, DCI, Subtarget);
57736 case ISD::EXTRACT_SUBVECTOR:
57737 return combineEXTRACT_SUBVECTOR(N, DAG, DCI, Subtarget);
57738 case ISD::VSELECT:
57739 case ISD::SELECT:
57740 case X86ISD::BLENDV: return combineSelect(N, DAG, DCI, Subtarget);
57741 case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
57742 case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
57743 case X86ISD::CMP: return combineCMP(N, DAG);
57744 case ISD::ADD: return combineAdd(N, DAG, DCI, Subtarget);
57745 case ISD::SUB: return combineSub(N, DAG, DCI, Subtarget);
57746 case X86ISD::ADD:
57747 case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI);
57748 case X86ISD::SBB: return combineSBB(N, DAG);
57749 case X86ISD::ADC: return combineADC(N, DAG, DCI);
57750 case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
57751 case ISD::SHL: return combineShiftLeft(N, DAG);
57752 case ISD::SRA: return combineShiftRightArithmetic(N, DAG, Subtarget);
57753 case ISD::SRL: return combineShiftRightLogical(N, DAG, DCI, Subtarget);
57754 case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
57755 case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
57756 case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
57757 case X86ISD::BEXTR:
57758 case X86ISD::BEXTRI: return combineBEXTR(N, DAG, DCI, Subtarget);
57759 case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
57760 case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
57761 case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget);
57762 case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget);
57763 case X86ISD::VEXTRACT_STORE:
57764 return combineVEXTRACT_STORE(N, DAG, DCI, Subtarget);
57765 case ISD::SINT_TO_FP:
57766 case ISD::STRICT_SINT_TO_FP:
57767 return combineSIntToFP(N, DAG, DCI, Subtarget);
57768 case ISD::UINT_TO_FP:
57769 case ISD::STRICT_UINT_TO_FP:
57770 return combineUIntToFP(N, DAG, Subtarget);
57771 case ISD::FADD:
57772 case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
57773 case X86ISD::VFCMULC:
57774 case X86ISD::VFMULC: return combineFMulcFCMulc(N, DAG, Subtarget);
57775 case ISD::FNEG: return combineFneg(N, DAG, DCI, Subtarget);
57776 case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
57777 case X86ISD::VTRUNC: return combineVTRUNC(N, DAG, DCI);
57778 case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
57779 case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
57780 case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
57781 case X86ISD::FXOR:
57782 case X86ISD::FOR: return combineFOr(N, DAG, DCI, Subtarget);
57783 case X86ISD::FMIN:
57784 case X86ISD::FMAX: return combineFMinFMax(N, DAG);
57785 case ISD::FMINNUM:
57786 case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
57787 case X86ISD::CVTSI2P:
57788 case X86ISD::CVTUI2P: return combineX86INT_TO_FP(N, DAG, DCI);
57789 case X86ISD::CVTP2SI:
57790 case X86ISD::CVTP2UI:
57791 case X86ISD::STRICT_CVTTP2SI:
57792 case X86ISD::CVTTP2SI:
57793 case X86ISD::STRICT_CVTTP2UI:
57794 case X86ISD::CVTTP2UI:
57795 return combineCVTP2I_CVTTP2I(N, DAG, DCI);
57796 case X86ISD::STRICT_CVTPH2PS:
57797 case X86ISD::CVTPH2PS: return combineCVTPH2PS(N, DAG, DCI);
57798 case X86ISD::BT: return combineBT(N, DAG, DCI);
57799 case ISD::ANY_EXTEND:
57800 case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
57801 case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
57802 case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
57803 case ISD::ANY_EXTEND_VECTOR_INREG:
57804 case ISD::SIGN_EXTEND_VECTOR_INREG:
57805 case ISD::ZERO_EXTEND_VECTOR_INREG:
57806 return combineEXTEND_VECTOR_INREG(N, DAG, DCI, Subtarget);
57807 case ISD::SETCC: return combineSetCC(N, DAG, DCI, Subtarget);
57808 case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
57809 case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
57810 case X86ISD::PACKSS:
57811 case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget);
57812 case X86ISD::HADD:
57813 case X86ISD::HSUB:
57814 case X86ISD::FHADD:
57815 case X86ISD::FHSUB: return combineVectorHADDSUB(N, DAG, DCI, Subtarget);
57816 case X86ISD::VSHL:
57817 case X86ISD::VSRA:
57818 case X86ISD::VSRL:
57819 return combineVectorShiftVar(N, DAG, DCI, Subtarget);
57820 case X86ISD::VSHLI:
57821 case X86ISD::VSRAI:
57822 case X86ISD::VSRLI:
57823 return combineVectorShiftImm(N, DAG, DCI, Subtarget);
57824 case ISD::INSERT_VECTOR_ELT:
57825 case X86ISD::PINSRB:
57826 case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
57827 case X86ISD::SHUFP: // Handle all target specific shuffles
57828 case X86ISD::INSERTPS:
57829 case X86ISD::EXTRQI:
57830 case X86ISD::INSERTQI:
57831 case X86ISD::VALIGN:
57832 case X86ISD::PALIGNR:
57833 case X86ISD::VSHLDQ:
57834 case X86ISD::VSRLDQ:
57835 case X86ISD::BLENDI:
57836 case X86ISD::UNPCKH:
57837 case X86ISD::UNPCKL:
57838 case X86ISD::MOVHLPS:
57839 case X86ISD::MOVLHPS:
57840 case X86ISD::PSHUFB:
57841 case X86ISD::PSHUFD:
57842 case X86ISD::PSHUFHW:
57843 case X86ISD::PSHUFLW:
57844 case X86ISD::MOVSHDUP:
57845 case X86ISD::MOVSLDUP:
57846 case X86ISD::MOVDDUP:
57847 case X86ISD::MOVSS:
57848 case X86ISD::MOVSD:
57849 case X86ISD::MOVSH:
57850 case X86ISD::VBROADCAST:
57851 case X86ISD::VPPERM:
57852 case X86ISD::VPERMI:
57853 case X86ISD::VPERMV:
57854 case X86ISD::VPERMV3:
57855 case X86ISD::VPERMIL2:
57856 case X86ISD::VPERMILPI:
57857 case X86ISD::VPERMILPV:
57858 case X86ISD::VPERM2X128:
57859 case X86ISD::SHUF128:
57860 case X86ISD::VZEXT_MOVL:
57861 case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI, Subtarget);
57862 case X86ISD::FMADD_RND:
57863 case X86ISD::FMSUB:
57864 case X86ISD::STRICT_FMSUB:
57865 case X86ISD::FMSUB_RND:
57866 case X86ISD::FNMADD:
57867 case X86ISD::STRICT_FNMADD:
57868 case X86ISD::FNMADD_RND:
57869 case X86ISD::FNMSUB:
57870 case X86ISD::STRICT_FNMSUB:
57871 case X86ISD::FNMSUB_RND:
57872 case ISD::FMA:
57873 case ISD::STRICT_FMA: return combineFMA(N, DAG, DCI, Subtarget);
57874 case X86ISD::FMADDSUB_RND:
57875 case X86ISD::FMSUBADD_RND:
57876 case X86ISD::FMADDSUB:
57877 case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, DCI);
57878 case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI, Subtarget);
57879 case X86ISD::TESTP: return combineTESTP(N, DAG, DCI, Subtarget);
57880 case X86ISD::MGATHER:
57881 case X86ISD::MSCATTER:
57882 return combineX86GatherScatter(N, DAG, DCI, Subtarget);
57883 case ISD::MGATHER:
57884 case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI);
57885 case X86ISD::PCMPEQ:
57886 case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
57887 case X86ISD::PMULDQ:
57888 case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI, Subtarget);
57889 case X86ISD::VPMADDUBSW:
57890 case X86ISD::VPMADDWD: return combineVPMADD(N, DAG, DCI);
57891 case X86ISD::KSHIFTL:
57892 case X86ISD::KSHIFTR: return combineKSHIFT(N, DAG, DCI);
57893 case ISD::FP16_TO_FP: return combineFP16_TO_FP(N, DAG, Subtarget);
57894 case ISD::STRICT_FP_EXTEND:
57895 case ISD::FP_EXTEND: return combineFP_EXTEND(N, DAG, Subtarget);
57896 case ISD::STRICT_FP_ROUND:
57897 case ISD::FP_ROUND: return combineFP_ROUND(N, DAG, Subtarget);
57898 case X86ISD::VBROADCAST_LOAD:
57899 case X86ISD::SUBV_BROADCAST_LOAD: return combineBROADCAST_LOAD(N, DAG, DCI);
57900 case X86ISD::MOVDQ2Q: return combineMOVDQ2Q(N, DAG);
57901 case X86ISD::PDEP: return combinePDEP(N, DAG, DCI);
57902 }
57903
57904 return SDValue();
57905}
57906
57907bool X86TargetLowering::preferABDSToABSWithNSW(EVT VT) const {
57908 return false;
57909}
57910
57911bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
57912 if (!isTypeLegal(VT))
57913 return false;
57914
57915 // There are no vXi8 shifts.
57916 if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
57917 return false;
57918
57919 // TODO: Almost no 8-bit ops are desirable because they have no actual
57920 // size/speed advantages vs. 32-bit ops, but they do have a major
57921 // potential disadvantage by causing partial register stalls.
57922 //
57923 // 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl, and
57924 // we have specializations to turn 32-bit multiply/shl into LEA or other ops.
57925 // Also, see the comment in "IsDesirableToPromoteOp" - where we additionally
57926 // check for a constant operand to the multiply.
57927 if ((Opc == ISD::MUL || Opc == ISD::SHL) && VT == MVT::i8)
57928 return false;
57929
57930 // i16 instruction encodings are longer and some i16 instructions are slow,
57931 // so those are not desirable.
57932 if (VT == MVT::i16) {
57933 switch (Opc) {
57934 default:
57935 break;
57936 case ISD::LOAD:
57937 case ISD::SIGN_EXTEND:
57938 case ISD::ZERO_EXTEND:
57939 case ISD::ANY_EXTEND:
57940 case ISD::SHL:
57941 case ISD::SRA:
57942 case ISD::SRL:
57943 case ISD::SUB:
57944 case ISD::ADD:
57945 case ISD::MUL:
57946 case ISD::AND:
57947 case ISD::OR:
57948 case ISD::XOR:
57949 return false;
57950 }
57951 }
57952
57953 // Any legal type not explicitly accounted for above here is desirable.
57954 return true;
57955}
57956
57957SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc& dl,
57958 SDValue Value, SDValue Addr,
57959 SelectionDAG &DAG) const {
57960 const Module *M = DAG.getMachineFunction().getMMI().getModule();
57961 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
57962 if (IsCFProtectionSupported) {
57963 // In case control-flow branch protection is enabled, we need to add
57964 // notrack prefix to the indirect branch.
57965 // In order to do that we create NT_BRIND SDNode.
57966 // Upon ISEL, the pattern will convert it to jmp with NoTrack prefix.
57967 return DAG.getNode(X86ISD::NT_BRIND, dl, MVT::Other, Value, Addr);
57968 }
57969
57970 return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, DAG);
57971}
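// Editor's note: for illustration (not part of this file), building with
// -fcf-protection=branch sets the "cf-protection-branch" module flag, so a
// switch lowered through a jump table takes the X86ISD::NT_BRIND path above
// and the indirect jump is emitted with the NOTRACK prefix.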
57972
57973TargetLowering::AndOrSETCCFoldKind
57974X86TargetLowering::isDesirableToCombineLogicOpOfSETCC(
57975 const SDNode *LogicOp, const SDNode *SETCC0, const SDNode *SETCC1) const {
57976 using AndOrSETCCFoldKind = TargetLowering::AndOrSETCCFoldKind;
57977 EVT VT = LogicOp->getValueType(0);
57978 EVT OpVT = SETCC0->getOperand(0).getValueType();
57979 if (!VT.isInteger())
57980 return AndOrSETCCFoldKind::None;
57981
57982 if (VT.isVector())
57983 return AndOrSETCCFoldKind(AndOrSETCCFoldKind::NotAnd |
57984 (isOperationLegal(ISD::ABS, OpVT)
57985 ? AndOrSETCCFoldKind::ABS
57986 : AndOrSETCCFoldKind::None));
57987
57988 // Don't use `NotAnd`: even though `not` is generally shorter code size than
57989 // `add`, `add` can lower to LEA, which can save moves / spills. In any case
57990 // where `NotAnd` applies, `AddAnd` does as well.
57991 // TODO: Currently we lower (icmp eq/ne (and ~X, Y), 0) -> `test (not X), Y`;
57992 // if we change that to `andn Y, X` it may be worth preferring `NotAnd` here.
57993 return AndOrSETCCFoldKind::AddAnd;
57994}
57995
57996bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
57997 EVT VT = Op.getValueType();
57998 bool Is8BitMulByConstant = VT == MVT::i8 && Op.getOpcode() == ISD::MUL &&
57999 isa<ConstantSDNode>(Op.getOperand(1));
58000
58001 // i16 is legal, but undesirable since i16 instruction encodings are longer
58002 // and some i16 instructions are slow.
58003 // 8-bit multiply-by-constant can usually be expanded to something cheaper
58004 // using LEA and/or other ALU ops.
58005 if (VT != MVT::i16 && !Is8BitMulByConstant)
58006 return false;
58007
58008 auto IsFoldableRMW = [](SDValue Load, SDValue Op) {
58009 if (!Op.hasOneUse())
58010 return false;
58011 SDNode *User = *Op->use_begin();
58012 if (!ISD::isNormalStore(User))
58013 return false;
58014 auto *Ld = cast<LoadSDNode>(Load);
58015 auto *St = cast<StoreSDNode>(User);
58016 return Ld->getBasePtr() == St->getBasePtr();
58017 };
58018
58019 auto IsFoldableAtomicRMW = [](SDValue Load, SDValue Op) {
58020 if (!Load.hasOneUse() || Load.getOpcode() != ISD::ATOMIC_LOAD)
58021 return false;
58022 if (!Op.hasOneUse())
58023 return false;
58024 SDNode *User = *Op->use_begin();
58025 if (User->getOpcode() != ISD::ATOMIC_STORE)
58026 return false;
58027 auto *Ld = cast<AtomicSDNode>(Load);
58028 auto *St = cast<AtomicSDNode>(User);
58029 return Ld->getBasePtr() == St->getBasePtr();
58030 };
58031
58032 bool Commute = false;
58033 switch (Op.getOpcode()) {
58034 default: return false;
58035 case ISD::SIGN_EXTEND:
58036 case ISD::ZERO_EXTEND:
58037 case ISD::ANY_EXTEND:
58038 break;
58039 case ISD::SHL:
58040 case ISD::SRA:
58041 case ISD::SRL: {
58042 SDValue N0 = Op.getOperand(0);
58043 // Look out for (store (shl (load), x)).
58044 if (X86::mayFoldLoad(N0, Subtarget) && IsFoldableRMW(N0, Op))
58045 return false;
58046 break;
58047 }
58048 case ISD::ADD:
58049 case ISD::MUL:
58050 case ISD::AND:
58051 case ISD::OR:
58052 case ISD::XOR:
58053 Commute = true;
58054 [[fallthrough]];
58055 case ISD::SUB: {
58056 SDValue N0 = Op.getOperand(0);
58057 SDValue N1 = Op.getOperand(1);
58058 // Avoid disabling potential load folding opportunities.
58059 if (X86::mayFoldLoad(N1, Subtarget) &&
58060 (!Commute || !isa<ConstantSDNode>(N0) ||
58061 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
58062 return false;
58063 if (X86::mayFoldLoad(N0, Subtarget) &&
58064 ((Commute && !isa<ConstantSDNode>(N1)) ||
58065 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
58066 return false;
58067 if (IsFoldableAtomicRMW(N0, Op) ||
58068 (Commute && IsFoldableAtomicRMW(N1, Op)))
58069 return false;
58070 }
58071 }
58072
58073 PVT = MVT::i32;
58074 return true;
58075}
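// Editor's note: a hypothetical illustration (not from this file) of the ops
// these two hooks steer toward 32 bits: for
//   short f(short a, short b) { return a + b; }
// the 16-bit add is promoted and performed as a 32-bit add, avoiding the
// operand-size-prefixed i16 encoding, and the result is truncated back.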
58076
58077//===----------------------------------------------------------------------===//
58078// X86 Inline Assembly Support
58079//===----------------------------------------------------------------------===//
58080
58081// Helper to match a string separated by whitespace.
58082static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
58083 S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
58084
58085 for (StringRef Piece : Pieces) {
58086 if (!S.startswith(Piece)) // Check if the piece matches.
58087 return false;
58088
58089 S = S.substr(Piece.size());
58090 StringRef::size_type Pos = S.find_first_not_of(" \t");
58091 if (Pos == 0) // We matched a prefix.
58092 return false;
58093
58094 S = S.substr(Pos);
58095 }
58096
58097 return S.empty();
58098}
58099
58100static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
58101
58102 if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
58103 if (llvm::is_contained(AsmPieces, "~{cc}") &&
58104 llvm::is_contained(AsmPieces, "~{flags}") &&
58105 llvm::is_contained(AsmPieces, "~{fpsr}")) {
58106
58107 if (AsmPieces.size() == 3)
58108 return true;
58109 else if (llvm::is_contained(AsmPieces, "~{dirflag}"))
58110 return true;
58111 }
58112 }
58113 return false;
58114}
58115
58116bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
58117 InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand());
58118
58119 const std::string &AsmStr = IA->getAsmString();
58120
58121 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
58122 if (!Ty || Ty->getBitWidth() % 16 != 0)
58123 return false;
58124
58125 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
58126 SmallVector<StringRef, 4> AsmPieces;
58127 SplitString(AsmStr, AsmPieces, ";\n");
58128
58129 switch (AsmPieces.size()) {
58130 default: return false;
58131 case 1:
58132 // FIXME: this should verify that we are targeting a 486 or better. If not,
58133 // we will turn this bswap into something that will be lowered to logical
58134 // ops instead of emitting the bswap asm. For now, we don't support 486 or
58135 // lower so don't worry about this.
58136 // bswap $0
58137 if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
58138 matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
58139 matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
58140 matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
58141 matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
58142 matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
58143 // No need to check constraints, nothing other than the equivalent of
58144 // "=r,0" would be valid here.
58145 return IntrinsicLowering::LowerToByteSwap(CI);
58146 }
58147
58148 // rorw $$8, ${0:w} --> llvm.bswap.i16
58149 if (CI->getType()->isIntegerTy(16) &&
58150 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
58151 (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
58152 matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
58153 AsmPieces.clear();
58154 StringRef ConstraintsStr = IA->getConstraintString();
58155 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
58156 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
58157 if (clobbersFlagRegisters(AsmPieces))
58158 return IntrinsicLowering::LowerToByteSwap(CI);
58159 }
58160 break;
58161 case 3:
58162 if (CI->getType()->isIntegerTy(32) &&
58163 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
58164 matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
58165 matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
58166 matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
58167 AsmPieces.clear();
58168 StringRef ConstraintsStr = IA->getConstraintString();
58169 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
58170 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
58171 if (clobbersFlagRegisters(AsmPieces))
58172 return IntrinsicLowering::LowerToByteSwap(CI);
58173 }
58174
58175 if (CI->getType()->isIntegerTy(64)) {
58176 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
58177 if (Constraints.size() >= 2 &&
58178 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
58179 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
58180 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
58181 if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
58182 matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
58183 matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
58184 return IntrinsicLowering::LowerToByteSwap(CI);
58185 }
58186 }
58187 break;
58188 }
58189 return false;
58190}
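// Editor's note: an illustrative sketch (not part of this file) of source-level
// inline asm that this hook turns into llvm.bswap. Clang renders "%0" as "$0"
// in the IR asm string, which is what matchAsm() sees above.
//
//   unsigned bswap32(unsigned x) {
//     asm("bswap %0" : "=r"(x) : "0"(x));
//     return x;
//   }
//
// The 16-bit "rorw $8, %w0" / "rolw $8, %w0" forms are handled the same way,
// provided the constraint string begins with "=r,0," and the flag registers
// are clobbered.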
58191
58192static X86::CondCode parseConstraintCode(llvm::StringRef Constraint) {
58193 X86::CondCode Cond = StringSwitch<X86::CondCode>(Constraint)
58194 .Case("{@cca}", X86::COND_A)
58195 .Case("{@ccae}", X86::COND_AE)
58196 .Case("{@ccb}", X86::COND_B)
58197 .Case("{@ccbe}", X86::COND_BE)
58198 .Case("{@ccc}", X86::COND_B)
58199 .Case("{@cce}", X86::COND_E)
58200 .Case("{@ccz}", X86::COND_E)
58201 .Case("{@ccg}", X86::COND_G)
58202 .Case("{@ccge}", X86::COND_GE)
58203 .Case("{@ccl}", X86::COND_L)
58204 .Case("{@ccle}", X86::COND_LE)
58205 .Case("{@ccna}", X86::COND_BE)
58206 .Case("{@ccnae}", X86::COND_B)
58207 .Case("{@ccnb}", X86::COND_AE)
58208 .Case("{@ccnbe}", X86::COND_A)
58209 .Case("{@ccnc}", X86::COND_AE)
58210 .Case("{@ccne}", X86::COND_NE)
58211 .Case("{@ccnz}", X86::COND_NE)
58212 .Case("{@ccng}", X86::COND_LE)
58213 .Case("{@ccnge}", X86::COND_L)
58214 .Case("{@ccnl}", X86::COND_GE)
58215 .Case("{@ccnle}", X86::COND_G)
58216 .Case("{@ccno}", X86::COND_NO)
58217 .Case("{@ccnp}", X86::COND_NP)
58218 .Case("{@ccns}", X86::COND_NS)
58219 .Case("{@cco}", X86::COND_O)
58220 .Case("{@ccp}", X86::COND_P)
58221 .Case("{@ccs}", X86::COND_S)
58222 .Default(X86::COND_INVALID);
58223 return Cond;
58224}
58225
58226/// Given a constraint letter, return the type of constraint for this target.
58227X86TargetLowering::ConstraintType
58228X86TargetLowering::getConstraintType(StringRef Constraint) const {
58229 if (Constraint.size() == 1) {
58230 switch (Constraint[0]) {
58231 case 'R':
58232 case 'q':
58233 case 'Q':
58234 case 'f':
58235 case 't':
58236 case 'u':
58237 case 'y':
58238 case 'x':
58239 case 'v':
58240 case 'l':
58241 case 'k': // AVX512 masking registers.
58242 return C_RegisterClass;
58243 case 'a':
58244 case 'b':
58245 case 'c':
58246 case 'd':
58247 case 'S':
58248 case 'D':
58249 case 'A':
58250 return C_Register;
58251 case 'I':
58252 case 'J':
58253 case 'K':
58254 case 'N':
58255 case 'G':
58256 case 'L':
58257 case 'M':
58258 return C_Immediate;
58259 case 'C':
58260 case 'e':
58261 case 'Z':
58262 return C_Other;
58263 default:
58264 break;
58265 }
58266 }
58267 else if (Constraint.size() == 2) {
58268 switch (Constraint[0]) {
58269 default:
58270 break;
58271 case 'Y':
58272 switch (Constraint[1]) {
58273 default:
58274 break;
58275 case 'z':
58276 return C_Register;
58277 case 'i':
58278 case 'm':
58279 case 'k':
58280 case 't':
58281 case '2':
58282 return C_RegisterClass;
58283 }
58284 }
58285 } else if (parseConstraintCode(Constraint) != X86::COND_INVALID)
58286 return C_Other;
58287 return TargetLowering::getConstraintType(Constraint);
58288}
58289
58290/// Examine constraint type and operand type and determine a weight value.
58291/// This object must already have been set up with the operand type
58292/// and the current alternative constraint selected.
58293TargetLowering::ConstraintWeight
58294 X86TargetLowering::getSingleConstraintMatchWeight(
58295 AsmOperandInfo &info, const char *constraint) const {
58296 ConstraintWeight weight = CW_Invalid;
58297 Value *CallOperandVal = info.CallOperandVal;
58298 // If we don't have a value, we can't do a match,
58299 // but allow it at the lowest weight.
58300 if (!CallOperandVal)
58301 return CW_Default;
58302 Type *type = CallOperandVal->getType();
58303 // Look at the constraint type.
58304 switch (*constraint) {
58305 default:
58306 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
58307 [[fallthrough]];
58308 case 'R':
58309 case 'q':
58310 case 'Q':
58311 case 'a':
58312 case 'b':
58313 case 'c':
58314 case 'd':
58315 case 'S':
58316 case 'D':
58317 case 'A':
58318 if (CallOperandVal->getType()->isIntegerTy())
58319 weight = CW_SpecificReg;
58320 break;
58321 case 'f':
58322 case 't':
58323 case 'u':
58324 if (type->isFloatingPointTy())
58325 weight = CW_SpecificReg;
58326 break;
58327 case 'y':
58328 if (type->isX86_MMXTy() && Subtarget.hasMMX())
58329 weight = CW_SpecificReg;
58330 break;
58331 case 'Y':
58332 if (StringRef(constraint).size() != 2)
58333 break;
58334 switch (constraint[1]) {
58335 default:
58336 return CW_Invalid;
58337 // XMM0
58338 case 'z':
58339 if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
58340 ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()) ||
58341 ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512()))
58342 return CW_SpecificReg;
58343 return CW_Invalid;
58344 // Conditional OpMask regs (AVX512)
58345 case 'k':
58346 if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
58347 return CW_Register;
58348 return CW_Invalid;
58349 // Any MMX reg
58350 case 'm':
58351 if (type->isX86_MMXTy() && Subtarget.hasMMX())
58352 return weight;
58353 return CW_Invalid;
58354 // Any SSE reg when ISA >= SSE2, same as 'x'
58355 case 'i':
58356 case 't':
58357 case '2':
58358 if (!Subtarget.hasSSE2())
58359 return CW_Invalid;
58360 break;
58361 }
58362 break;
58363 case 'v':
58364 if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
58365 weight = CW_Register;
58366 [[fallthrough]];
58367 case 'x':
58368 if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
58369 ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()))
58370 weight = CW_Register;
58371 break;
58372 case 'k':
58373 // Enable conditional vector operations using %k<#> registers.
58374 if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
58375 weight = CW_Register;
58376 break;
58377 case 'I':
58378 if (auto *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
58379 if (C->getZExtValue() <= 31)
58380 weight = CW_Constant;
58381 }
58382 break;
58383 case 'J':
58384 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
58385 if (C->getZExtValue() <= 63)
58386 weight = CW_Constant;
58387 }
58388 break;
58389 case 'K':
58390 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
58391 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
58392 weight = CW_Constant;
58393 }
58394 break;
58395 case 'L':
58396 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
58397 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
58398 weight = CW_Constant;
58399 }
58400 break;
58401 case 'M':
58402 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
58403 if (C->getZExtValue() <= 3)
58404 weight = CW_Constant;
58405 }
58406 break;
58407 case 'N':
58408 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
58409 if (C->getZExtValue() <= 0xff)
58410 weight = CW_Constant;
58411 }
58412 break;
58413 case 'G':
58414 case 'C':
58415 if (isa<ConstantFP>(CallOperandVal)) {
58416 weight = CW_Constant;
58417 }
58418 break;
58419 case 'e':
58420 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
58421 if ((C->getSExtValue() >= -0x80000000LL) &&
58422 (C->getSExtValue() <= 0x7fffffffLL))
58423 weight = CW_Constant;
58424 }
58425 break;
58426 case 'Z':
58427 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
58428 if (C->getZExtValue() <= 0xffffffff)
58429 weight = CW_Constant;
58430 }
58431 break;
58432 }
58433 return weight;
58434}
58435
58436/// Try to replace an X constraint, which matches anything, with another that
58437/// has more specific requirements based on the type of the corresponding
58438/// operand.
58439const char *X86TargetLowering::
58440LowerXConstraint(EVT ConstraintVT) const {
58441 // FP X constraints get lowered to SSE1/2 registers if available, otherwise
58442 // 'f' like normal targets.
58443 if (ConstraintVT.isFloatingPoint()) {
58444 if (Subtarget.hasSSE1())
58445 return "x";
58446 }
58447
58448 return TargetLowering::LowerXConstraint(ConstraintVT);
58449}
58450
58451// Lower @cc targets via setcc.
58452SDValue X86TargetLowering::LowerAsmOutputForConstraint(
58453 SDValue &Chain, SDValue &Glue, const SDLoc &DL,
58454 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
58455 X86::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
58456 if (Cond == X86::COND_INVALID)
58457 return SDValue();
58458 // Check that return type is valid.
58459 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
58460 OpInfo.ConstraintVT.getSizeInBits() < 8)
58461 report_fatal_error("Glue output operand is of invalid type");
58462
58463 // Get EFLAGS register. Only update chain when copyfrom is glued.
58464 if (Glue.getNode()) {
58465 Glue = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32, Glue);
58466 Chain = Glue.getValue(1);
58467 } else
58468 Glue = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32);
58469 // Extract CC code.
58470 SDValue CC = getSETCC(Cond, Glue, DL, DAG);
58471 // Extend to 32-bits
58472 SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
58473
58474 return Result;
58475}
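// Editor's note: an illustrative use (not from this file) of a flag-output
// constraint; "=@ccz" arrives here as the "{@ccz}" constraint code, parses to
// X86::COND_E, and is materialized by the SETCC + ZERO_EXTEND sequence above.
//
//   bool decrement_hit_zero(unsigned *p) {
//     bool z;
//     asm("decl %1" : "=@ccz"(z), "+m"(*p));
//     return z;
//   }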
58476
58477/// Lower the specified operand into the Ops vector.
58478/// If it is invalid, don't add anything to Ops.
58479void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
58480 std::string &Constraint,
58481 std::vector<SDValue>&Ops,
58482 SelectionDAG &DAG) const {
58483 SDValue Result;
58484
58485 // Only support length 1 constraints for now.
58486 if (Constraint.length() > 1) return;
58487
58488 char ConstraintLetter = Constraint[0];
58489 switch (ConstraintLetter) {
58490 default: break;
58491 case 'I':
58492 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58493 if (C->getZExtValue() <= 31) {
58494 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58495 Op.getValueType());
58496 break;
58497 }
58498 }
58499 return;
58500 case 'J':
58501 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58502 if (C->getZExtValue() <= 63) {
58503 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58504 Op.getValueType());
58505 break;
58506 }
58507 }
58508 return;
58509 case 'K':
58510 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58511 if (isInt<8>(C->getSExtValue())) {
58512 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58513 Op.getValueType());
58514 break;
58515 }
58516 }
58517 return;
58518 case 'L':
58519 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58520 if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
58521 (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
58522 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
58523 Op.getValueType());
58524 break;
58525 }
58526 }
58527 return;
58528 case 'M':
58529 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58530 if (C->getZExtValue() <= 3) {
58531 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58532 Op.getValueType());
58533 break;
58534 }
58535 }
58536 return;
58537 case 'N':
58538 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58539 if (C->getZExtValue() <= 255) {
58540 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58541 Op.getValueType());
58542 break;
58543 }
58544 }
58545 return;
58546 case 'O':
58547 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58548 if (C->getZExtValue() <= 127) {
58549 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58550 Op.getValueType());
58551 break;
58552 }
58553 }
58554 return;
58555 case 'e': {
58556 // 32-bit signed value
58557 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58558 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
58559 C->getSExtValue())) {
58560 // Widen to 64 bits here to get it sign extended.
58561 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
58562 break;
58563 }
58564 // FIXME gcc accepts some relocatable values here too, but only in certain
58565 // memory models; it's complicated.
58566 }
58567 return;
58568 }
58569 case 'Z': {
58570 // 32-bit unsigned value
58571 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58572 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
58573 C->getZExtValue())) {
58574 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58575 Op.getValueType());
58576 break;
58577 }
58578 }
58579 // FIXME gcc accepts some relocatable values here too, but only in certain
58580 // memory models; it's complicated.
58581 return;
58582 }
58583 case 'i': {
58584 // Literal immediates are always ok.
58585 if (auto *CST = dyn_cast<ConstantSDNode>(Op)) {
58586 bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1;
58587 BooleanContent BCont = getBooleanContents(MVT::i64);
58588 ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont)
58589 : ISD::SIGN_EXTEND;
58590 int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue()
58591 : CST->getSExtValue();
58592 Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64);
58593 break;
58594 }
58595
58596 // In any sort of PIC mode addresses need to be computed at runtime by
58597 // adding in a register or some sort of table lookup. These can't
58598 // be used as immediates. BlockAddresses and BasicBlocks are fine though.
58599 if ((Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC()) &&
58600 !(isa<BlockAddressSDNode>(Op) || isa<BasicBlockSDNode>(Op)))
58601 return;
58602
58603 // If we are in non-pic codegen mode, we allow the address of a global (with
58604 // an optional displacement) to be used with 'i'.
58605 if (auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
58606 // If we require an extra load to get this address, as in PIC mode, we
58607 // can't accept it.
58608 if (isGlobalStubReference(
58609 Subtarget.classifyGlobalReference(GA->getGlobal())))
58610 return;
58611 break;
58612 }
58613 }
58614
58615 if (Result.getNode()) {
58616 Ops.push_back(Result);
58617 return;
58618 }
58619 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
58620}
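// Editor's note: illustrative uses (not from this file) of immediate
// constraints validated above; each constant must fit the range the
// corresponding case checks ('I' = 0..31, 'N' = 0..255):
//
//   unsigned shl3(unsigned x) {
//     asm("shll %1, %0" : "+r"(x) : "I"(3));              // 'I': shift count
//     return x;
//   }
//   void port_write(unsigned char v) {
//     asm volatile("outb %0, %1" : : "a"(v), "N"(0x64));  // 'N': I/O port number
//   }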
58621
58622/// Check if \p RC is a general purpose register class.
58623/// I.e., GR* or one of their variant.
58624static bool isGRClass(const TargetRegisterClass &RC) {
58625 return RC.hasSuperClassEq(&X86::GR8RegClass) ||
58626 RC.hasSuperClassEq(&X86::GR16RegClass) ||
58627 RC.hasSuperClassEq(&X86::GR32RegClass) ||
58628 RC.hasSuperClassEq(&X86::GR64RegClass) ||
58629 RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
58630}
58631
58632/// Check if \p RC is a vector register class.
58633/// I.e., FR* / VR* or one of their variant.
58634static bool isFRClass(const TargetRegisterClass &RC) {
58635 return RC.hasSuperClassEq(&X86::FR16XRegClass) ||
58636 RC.hasSuperClassEq(&X86::FR32XRegClass) ||
58637 RC.hasSuperClassEq(&X86::FR64XRegClass) ||
58638 RC.hasSuperClassEq(&X86::VR128XRegClass) ||
58639 RC.hasSuperClassEq(&X86::VR256XRegClass) ||
58640 RC.hasSuperClassEq(&X86::VR512RegClass);
58641}
58642
58643/// Check if \p RC is a mask register class.
58644/// I.e., VK* or one of their variant.
58645static bool isVKClass(const TargetRegisterClass &RC) {
58646 return RC.hasSuperClassEq(&X86::VK1RegClass) ||
58647 RC.hasSuperClassEq(&X86::VK2RegClass) ||
58648 RC.hasSuperClassEq(&X86::VK4RegClass) ||
58649 RC.hasSuperClassEq(&X86::VK8RegClass) ||
58650 RC.hasSuperClassEq(&X86::VK16RegClass) ||
58651 RC.hasSuperClassEq(&X86::VK32RegClass) ||
58652 RC.hasSuperClassEq(&X86::VK64RegClass);
58653}
58654
58655std::pair<unsigned, const TargetRegisterClass *>
58656X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
58657 StringRef Constraint,
58658 MVT VT) const {
58659 // First, see if this is a constraint that directly corresponds to an LLVM
58660 // register class.
58661 if (Constraint.size() == 1) {
58662 // GCC Constraint Letters
58663 switch (Constraint[0]) {
58664 default: break;
58665 // 'A' means [ER]AX + [ER]DX.
58666 case 'A':
58667 if (Subtarget.is64Bit())
58668 return std::make_pair(X86::RAX, &X86::GR64_ADRegClass);
58669 assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
58670 "Expecting 64, 32 or 16 bit subtarget");
58671 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
58672
58673 // TODO: Slight differences here in allocation order and leaving
58674 // RIP in the class. Do they matter any more here than they do
58675 // in the normal allocation?
58676 case 'k':
58677 if (Subtarget.hasAVX512()) {
58678 if (VT == MVT::i1)
58679 return std::make_pair(0U, &X86::VK1RegClass);
58680 if (VT == MVT::i8)
58681 return std::make_pair(0U, &X86::VK8RegClass);
58682 if (VT == MVT::i16)
58683 return std::make_pair(0U, &X86::VK16RegClass);
58684 }
58685 if (Subtarget.hasBWI()) {
58686 if (VT == MVT::i32)
58687 return std::make_pair(0U, &X86::VK32RegClass);
58688 if (VT == MVT::i64)
58689 return std::make_pair(0U, &X86::VK64RegClass);
58690 }
58691 break;
58692 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
58693 if (Subtarget.is64Bit()) {
58694 if (VT == MVT::i8 || VT == MVT::i1)
58695 return std::make_pair(0U, &X86::GR8RegClass);
58696 if (VT == MVT::i16)
58697 return std::make_pair(0U, &X86::GR16RegClass);
58698 if (VT == MVT::i32 || VT == MVT::f32)
58699 return std::make_pair(0U, &X86::GR32RegClass);
58700 if (VT != MVT::f80 && !VT.isVector())
58701 return std::make_pair(0U, &X86::GR64RegClass);
58702 break;
58703 }
58704 [[fallthrough]];
58705 // 32-bit fallthrough
58706 case 'Q': // Q_REGS
58707 if (VT == MVT::i8 || VT == MVT::i1)
58708 return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
58709 if (VT == MVT::i16)
58710 return std::make_pair(0U, &X86::GR16_ABCDRegClass);
58711 if (VT == MVT::i32 || VT == MVT::f32 ||
58712 (!VT.isVector() && !Subtarget.is64Bit()))
58713 return std::make_pair(0U, &X86::GR32_ABCDRegClass);
58714 if (VT != MVT::f80 && !VT.isVector())
58715 return std::make_pair(0U, &X86::GR64_ABCDRegClass);
58716 break;
58717 case 'r': // GENERAL_REGS
58718 case 'l': // INDEX_REGS
58719 if (VT == MVT::i8 || VT == MVT::i1)
58720 return std::make_pair(0U, &X86::GR8RegClass);
58721 if (VT == MVT::i16)
58722 return std::make_pair(0U, &X86::GR16RegClass);
58723 if (VT == MVT::i32 || VT == MVT::f32 ||
58724 (!VT.isVector() && !Subtarget.is64Bit()))
58725 return std::make_pair(0U, &X86::GR32RegClass);
58726 if (VT != MVT::f80 && !VT.isVector())
58727 return std::make_pair(0U, &X86::GR64RegClass);
58728 break;
58729 case 'R': // LEGACY_REGS
58730 if (VT == MVT::i8 || VT == MVT::i1)
58731 return std::make_pair(0U, &X86::GR8_NOREXRegClass);
58732 if (VT == MVT::i16)
58733 return std::make_pair(0U, &X86::GR16_NOREXRegClass);
58734 if (VT == MVT::i32 || VT == MVT::f32 ||
58735 (!VT.isVector() && !Subtarget.is64Bit()))
58736 return std::make_pair(0U, &X86::GR32_NOREXRegClass);
58737 if (VT != MVT::f80 && !VT.isVector())
58738 return std::make_pair(0U, &X86::GR64_NOREXRegClass);
58739 break;
58740 case 'f': // FP Stack registers.
58741 // If SSE is enabled for this VT, use f80 to ensure the isel moves the
58742 // value to the correct fpstack register class.
58743 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
58744 return std::make_pair(0U, &X86::RFP32RegClass);
58745 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
58746 return std::make_pair(0U, &X86::RFP64RegClass);
58747 if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80)
58748 return std::make_pair(0U, &X86::RFP80RegClass);
58749 break;
58750 case 'y': // MMX_REGS if MMX allowed.
58751 if (!Subtarget.hasMMX()) break;
58752 return std::make_pair(0U, &X86::VR64RegClass);
58753 case 'v':
58754 case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
58755 if (!Subtarget.hasSSE1()) break;
58756 bool VConstraint = (Constraint[0] == 'v');
58757
58758 switch (VT.SimpleTy) {
58759 default: break;
58760 // Scalar SSE types.
58761 case MVT::f16:
58762 if (VConstraint && Subtarget.hasFP16())
58763 return std::make_pair(0U, &X86::FR16XRegClass);
58764 break;
58765 case MVT::f32:
58766 case MVT::i32:
58767 if (VConstraint && Subtarget.hasVLX())
58768 return std::make_pair(0U, &X86::FR32XRegClass);
58769 return std::make_pair(0U, &X86::FR32RegClass);
58770 case MVT::f64:
58771 case MVT::i64:
58772 if (VConstraint && Subtarget.hasVLX())
58773 return std::make_pair(0U, &X86::FR64XRegClass);
58774 return std::make_pair(0U, &X86::FR64RegClass);
58775 case MVT::i128:
58776 if (Subtarget.is64Bit()) {
58777 if (VConstraint && Subtarget.hasVLX())
58778 return std::make_pair(0U, &X86::VR128XRegClass);
58779 return std::make_pair(0U, &X86::VR128RegClass);
58780 }
58781 break;
58782 // Vector types and fp128.
58783 case MVT::v8f16:
58784 if (!Subtarget.hasFP16())
58785 break;
58786 [[fallthrough]];
58787 case MVT::f128:
58788 case MVT::v16i8:
58789 case MVT::v8i16:
58790 case MVT::v4i32:
58791 case MVT::v2i64:
58792 case MVT::v4f32:
58793 case MVT::v2f64:
58794 if (VConstraint && Subtarget.hasVLX())
58795 return std::make_pair(0U, &X86::VR128XRegClass);
58796 return std::make_pair(0U, &X86::VR128RegClass);
58797 // AVX types.
58798 case MVT::v16f16:
58799 if (!Subtarget.hasFP16())
58800 break;
58801 [[fallthrough]];
58802 case MVT::v32i8:
58803 case MVT::v16i16:
58804 case MVT::v8i32:
58805 case MVT::v4i64:
58806 case MVT::v8f32:
58807 case MVT::v4f64:
58808 if (VConstraint && Subtarget.hasVLX())
58809 return std::make_pair(0U, &X86::VR256XRegClass);
58810 if (Subtarget.hasAVX())
58811 return std::make_pair(0U, &X86::VR256RegClass);
58812 break;
58813 case MVT::v32f16:
58814 if (!Subtarget.hasFP16())
58815 break;
58816 [[fallthrough]];
58817 case MVT::v64i8:
58818 case MVT::v32i16:
58819 case MVT::v8f64:
58820 case MVT::v16f32:
58821 case MVT::v16i32:
58822 case MVT::v8i64:
58823 if (!Subtarget.hasAVX512()) break;
58824 if (VConstraint)
58825 return std::make_pair(0U, &X86::VR512RegClass);
58826 return std::make_pair(0U, &X86::VR512_0_15RegClass);
58827 }
58828 break;
58829 }
58830 } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
58831 switch (Constraint[1]) {
58832 default:
58833 break;
58834 case 'i':
58835 case 't':
58836 case '2':
58837 return getRegForInlineAsmConstraint(TRI, "x", VT);
58838 case 'm':
58839 if (!Subtarget.hasMMX()) break;
58840 return std::make_pair(0U, &X86::VR64RegClass);
58841 case 'z':
58842 if (!Subtarget.hasSSE1()) break;
58843 switch (VT.SimpleTy) {
58844 default: break;
58845 // Scalar SSE types.
58846 case MVT::f16:
58847 if (!Subtarget.hasFP16())
58848 break;
58849 return std::make_pair(X86::XMM0, &X86::FR16XRegClass);
58850 case MVT::f32:
58851 case MVT::i32:
58852 return std::make_pair(X86::XMM0, &X86::FR32RegClass);
58853 case MVT::f64:
58854 case MVT::i64:
58855 return std::make_pair(X86::XMM0, &X86::FR64RegClass);
58856 case MVT::v8f16:
58857 if (!Subtarget.hasFP16())
58858 break;
58859 [[fallthrough]];
58860 case MVT::f128:
58861 case MVT::v16i8:
58862 case MVT::v8i16:
58863 case MVT::v4i32:
58864 case MVT::v2i64:
58865 case MVT::v4f32:
58866 case MVT::v2f64:
58867 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
58868 // AVX types.
58869 case MVT::v16f16:
58870 if (!Subtarget.hasFP16())
58871 break;
58872 [[fallthrough]];
58873 case MVT::v32i8:
58874 case MVT::v16i16:
58875 case MVT::v8i32:
58876 case MVT::v4i64:
58877 case MVT::v8f32:
58878 case MVT::v4f64:
58879 if (Subtarget.hasAVX())
58880 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
58881 break;
58882 case MVT::v32f16:
58883 if (!Subtarget.hasFP16())
58884 break;
58885 [[fallthrough]];
58886 case MVT::v64i8:
58887 case MVT::v32i16:
58888 case MVT::v8f64:
58889 case MVT::v16f32:
58890 case MVT::v16i32:
58891 case MVT::v8i64:
58892 if (Subtarget.hasAVX512())
58893 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
58894 break;
58895 }
58896 break;
58897 case 'k':
58898 // This register class doesn't allocate k0 for masked vector operation.
58899 if (Subtarget.hasAVX512()) {
58900 if (VT == MVT::i1)
58901 return std::make_pair(0U, &X86::VK1WMRegClass);
58902 if (VT == MVT::i8)
58903 return std::make_pair(0U, &X86::VK8WMRegClass);
58904 if (VT == MVT::i16)
58905 return std::make_pair(0U, &X86::VK16WMRegClass);
58906 }
58907 if (Subtarget.hasBWI()) {
58908 if (VT == MVT::i32)
58909 return std::make_pair(0U, &X86::VK32WMRegClass);
58910 if (VT == MVT::i64)
58911 return std::make_pair(0U, &X86::VK64WMRegClass);
58912 }
58913 break;
58914 }
58915 }
58916
58917 if (parseConstraintCode(Constraint) != X86::COND_INVALID)
58918 return std::make_pair(0U, &X86::GR32RegClass);
58919
58920 // Use the default implementation in TargetLowering to convert the register
58921 // constraint into a member of a register class.
58922 std::pair<Register, const TargetRegisterClass*> Res;
58923 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
58924
58925 // Not found as a standard register?
58926 if (!Res.second) {
58927 // Only match x87 registers if the VT is one SelectionDAGBuilder can convert
58928 // to/from f80.
58929 if (VT == MVT::Other || VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80) {
58930 // Map st(0) -> st(7) -> ST0
58931 if (Constraint.size() == 7 && Constraint[0] == '{' &&
58932 tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' &&
58933 Constraint[3] == '(' &&
58934 (Constraint[4] >= '0' && Constraint[4] <= '7') &&
58935 Constraint[5] == ')' && Constraint[6] == '}') {
58936 // st(7) is not allocatable and thus not a member of RFP80. Return
58937 // singleton class in cases where we have a reference to it.
58938 if (Constraint[4] == '7')
58939 return std::make_pair(X86::FP7, &X86::RFP80_7RegClass);
58940 return std::make_pair(X86::FP0 + Constraint[4] - '0',
58941 &X86::RFP80RegClass);
58942 }
58943
58944 // GCC allows "st(0)" to be called just plain "st".
58945 if (StringRef("{st}").equals_insensitive(Constraint))
58946 return std::make_pair(X86::FP0, &X86::RFP80RegClass);
58947 }
58948
58949 // flags -> EFLAGS
58950 if (StringRef("{flags}").equals_insensitive(Constraint))
58951 return std::make_pair(X86::EFLAGS, &X86::CCRRegClass);
58952
58953 // dirflag -> DF
58954 // Only allow for clobber.
58955 if (StringRef("{dirflag}").equals_insensitive(Constraint) &&
58956 VT == MVT::Other)
58957 return std::make_pair(X86::DF, &X86::DFCCRRegClass);
58958
58959 // fpsr -> FPSW
58960 if (StringRef("{fpsr}").equals_insensitive(Constraint))
58961 return std::make_pair(X86::FPSW, &X86::FPCCRRegClass);
58962
58963 return Res;
58964 }
58965
58966 // Make sure it isn't a register that requires 64-bit mode.
58967 if (!Subtarget.is64Bit() &&
58968 (isFRClass(*Res.second) || isGRClass(*Res.second)) &&
58969 TRI->getEncodingValue(Res.first) >= 8) {
58970 // Register requires REX prefix, but we're in 32-bit mode.
58971 return std::make_pair(0, nullptr);
58972 }
58973
58974 // Make sure it isn't a register that requires AVX512.
58975 if (!Subtarget.hasAVX512() && isFRClass(*Res.second) &&
58976 TRI->getEncodingValue(Res.first) & 0x10) {
58977 // Register requires EVEX prefix.
58978 return std::make_pair(0, nullptr);
58979 }
58980
58981 // Otherwise, check to see if this is a register class of the wrong value
58982 // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
58983 // turn into {ax},{dx}.
58984 // MVT::Other is used to specify clobber names.
58985 if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
58986 return Res; // Correct type already, nothing to do.
58987
58988 // Get a matching integer of the correct size. i.e. "ax" with MVT::32 should
58989 // return "eax". This should even work for things like getting 64bit integer
58990 // registers when given an f64 type.
58991 const TargetRegisterClass *Class = Res.second;
58992 // The generic code will match the first register class that contains the
58993 // given register. Thus, based on the ordering of the tablegened file,
58994 // the "plain" GR classes might not come first.
58995 // Therefore, use a helper method.
58996 if (isGRClass(*Class)) {
58997 unsigned Size = VT.getSizeInBits();
58998 if (Size == 1) Size = 8;
58999 if (Size != 8 && Size != 16 && Size != 32 && Size != 64)
59000 return std::make_pair(0, nullptr);
59001 Register DestReg = getX86SubSuperRegister(Res.first, Size);
59002 if (DestReg.isValid()) {
59003 bool is64Bit = Subtarget.is64Bit();
59004 const TargetRegisterClass *RC =
59005 Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
59006 : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
59007 : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
59008 : /*Size == 64*/ (is64Bit ? &X86::GR64RegClass : nullptr);
59009 if (Size == 64 && !is64Bit) {
59010 // Model GCC's behavior here and select a fixed pair of 32-bit
59011 // registers.
59012 switch (DestReg) {
59013 case X86::RAX:
59014 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
59015 case X86::RDX:
59016 return std::make_pair(X86::EDX, &X86::GR32_DCRegClass);
59017 case X86::RCX:
59018 return std::make_pair(X86::ECX, &X86::GR32_CBRegClass);
59019 case X86::RBX:
59020 return std::make_pair(X86::EBX, &X86::GR32_BSIRegClass);
59021 case X86::RSI:
59022 return std::make_pair(X86::ESI, &X86::GR32_SIDIRegClass);
59023 case X86::RDI:
59024 return std::make_pair(X86::EDI, &X86::GR32_DIBPRegClass);
59025 case X86::RBP:
59026 return std::make_pair(X86::EBP, &X86::GR32_BPSPRegClass);
59027 default:
59028 return std::make_pair(0, nullptr);
59029 }
59030 }
59031 if (RC && RC->contains(DestReg))
59032 return std::make_pair(DestReg, RC);
59033 return Res;
59034 }
59035 // No register found/type mismatch.
59036 return std::make_pair(0, nullptr);
59037 } else if (isFRClass(*Class)) {
59038 // Handle references to XMM physical registers that got mapped into the
59039 // wrong class. This can happen with constraints like {xmm0} where the
59040 // target independent register mapper will just pick the first match it can
59041 // find, ignoring the required type.
59042
59043 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
59044 if (VT == MVT::f16)
59045 Res.second = &X86::FR16XRegClass;
59046 else if (VT == MVT::f32 || VT == MVT::i32)
59047 Res.second = &X86::FR32XRegClass;
59048 else if (VT == MVT::f64 || VT == MVT::i64)
59049 Res.second = &X86::FR64XRegClass;
59050 else if (TRI->isTypeLegalForClass(X86::VR128XRegClass, VT))
59051 Res.second = &X86::VR128XRegClass;
59052 else if (TRI->isTypeLegalForClass(X86::VR256XRegClass, VT))
59053 Res.second = &X86::VR256XRegClass;
59054 else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
59055 Res.second = &X86::VR512RegClass;
59056 else {
59057      // Type mismatch and not a clobber: return an error.
59058 Res.first = 0;
59059 Res.second = nullptr;
59060 }
59061 } else if (isVKClass(*Class)) {
59062 if (VT == MVT::i1)
59063 Res.second = &X86::VK1RegClass;
59064 else if (VT == MVT::i8)
59065 Res.second = &X86::VK8RegClass;
59066 else if (VT == MVT::i16)
59067 Res.second = &X86::VK16RegClass;
59068 else if (VT == MVT::i32)
59069 Res.second = &X86::VK32RegClass;
59070 else if (VT == MVT::i64)
59071 Res.second = &X86::VK64RegClass;
59072 else {
59073      // Type mismatch and not a clobber: return an error.
59074 Res.first = 0;
59075 Res.second = nullptr;
59076 }
59077 }
59078
59079 return Res;
59080}
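// Illustrative note (not part of this file): the clobber-name mappings handled
// above ("flags", "dirflag", "fpsr") match the default clobber list Clang
// attaches to x86 inline asm at the IR level, e.g. (hypothetical IR):
//
//   call void asm sideeffect "cli", "~{dirflag},~{fpsr},~{flags}"()
//
// With VT == MVT::Other these resolve to EFLAGS/CCR, DF/DFCCR and FPSW/FPCCR,
// while an explicit "{st(3)}" operand constraint resolves to FP3 in
// X86::RFP80RegClass via the st(N) parsing above.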
59081
59082bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
59083 // Integer division on x86 is expensive. However, when aggressively optimizing
59084 // for code size, we prefer to use a div instruction, as it is usually smaller
59085 // than the alternative sequence.
59086 // The exception to this is vector division. Since x86 doesn't have vector
59087 // integer division, leaving the division as-is is a loss even in terms of
59088 // size, because it will have to be scalarized, while the alternative code
59089 // sequence can be performed in vector form.
59090 bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
59091 return OptSize && !VT.isVector();
59092}
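// Illustrative note (not part of this file): for a minsize function such as
// the hypothetical
//
//   __attribute__((minsize)) unsigned f(unsigned x) { return x / 10; }
//
// returning true here keeps the single scalar div instruction rather than the
// larger multiply-by-magic-constant expansion; vector divides are still
// expanded because x86 has no vector integer divide to fall back on.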
59093
59094void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
59095 if (!Subtarget.is64Bit())
59096 return;
59097
59098 // Update IsSplitCSR in X86MachineFunctionInfo.
59099 X86MachineFunctionInfo *AFI =
59100 Entry->getParent()->getInfo<X86MachineFunctionInfo>();
59101 AFI->setIsSplitCSR(true);
59102}
59103
59104void X86TargetLowering::insertCopiesSplitCSR(
59105 MachineBasicBlock *Entry,
59106 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
59107 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
59108 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
59109 if (!IStart)
59110 return;
59111
59112 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
59113 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
59114 MachineBasicBlock::iterator MBBI = Entry->begin();
59115 for (const MCPhysReg *I = IStart; *I; ++I) {
59116 const TargetRegisterClass *RC = nullptr;
59117 if (X86::GR64RegClass.contains(*I))
59118 RC = &X86::GR64RegClass;
59119 else
59120      llvm_unreachable("Unexpected register class in CSRsViaCopy!");
59121
59122 Register NewVR = MRI->createVirtualRegister(RC);
59123 // Create copy from CSR to a virtual register.
59124 // FIXME: this currently does not emit CFI pseudo-instructions, it works
59125 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
59126 // nounwind. If we want to generalize this later, we may need to emit
59127 // CFI pseudo-instructions.
59128    assert(
59129        Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) &&
59130        "Function should be nounwind in insertCopiesSplitCSR!");
59131 Entry->addLiveIn(*I);
59132 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
59133 .addReg(*I);
59134
59135 // Insert the copy-back instructions right before the terminator.
59136 for (auto *Exit : Exits)
59137 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
59138 TII->get(TargetOpcode::COPY), *I)
59139 .addReg(NewVR);
59140 }
59141}
59142
59143bool X86TargetLowering::supportSwiftError() const {
59144 return Subtarget.is64Bit();
59145}
59146
59147MachineInstr *
59148X86TargetLowering::EmitKCFICheck(MachineBasicBlock &MBB,
59149 MachineBasicBlock::instr_iterator &MBBI,
59150 const TargetInstrInfo *TII) const {
59151  assert(MBBI->isCall() && MBBI->getCFIType() &&
59152         "Invalid call instruction for a KCFI check");
59153
59154 MachineFunction &MF = *MBB.getParent();
59155 // If the call target is a memory operand, unfold it and use R11 for the
59156 // call, so KCFI_CHECK won't have to recompute the address.
59157 switch (MBBI->getOpcode()) {
59158 case X86::CALL64m:
59159 case X86::CALL64m_NT:
59160 case X86::TAILJMPm64:
59161 case X86::TAILJMPm64_REX: {
59162 MachineBasicBlock::instr_iterator OrigCall = MBBI;
59163 SmallVector<MachineInstr *, 2> NewMIs;
59164 if (!TII->unfoldMemoryOperand(MF, *OrigCall, X86::R11, /*UnfoldLoad=*/true,
59165 /*UnfoldStore=*/false, NewMIs))
59166 report_fatal_error("Failed to unfold memory operand for a KCFI check");
59167 for (auto *NewMI : NewMIs)
59168 MBBI = MBB.insert(OrigCall, NewMI);
59169    assert(MBBI->isCall() &&
59170           "Unexpected instruction after memory operand unfolding");
59171 if (OrigCall->shouldUpdateCallSiteInfo())
59172 MF.moveCallSiteInfo(&*OrigCall, &*MBBI);
59173 MBBI->setCFIType(MF, OrigCall->getCFIType());
59174 OrigCall->eraseFromParent();
59175 break;
59176 }
59177 default:
59178 break;
59179 }
59180
59181 MachineOperand &Target = MBBI->getOperand(0);
59182 Register TargetReg;
59183 switch (MBBI->getOpcode()) {
59184 case X86::CALL64r:
59185 case X86::CALL64r_NT:
59186 case X86::TAILJMPr64:
59187 case X86::TAILJMPr64_REX:
59188    assert(Target.isReg() && "Unexpected target operand for an indirect call");
59189 Target.setIsRenamable(false);
59190 TargetReg = Target.getReg();
59191 break;
59192 case X86::CALL64pcrel32:
59193 case X86::TAILJMPd64:
59194    assert(Target.isSymbol() && "Unexpected target operand for a direct call");
59195 // X86TargetLowering::EmitLoweredIndirectThunk always uses r11 for
59196 // 64-bit indirect thunk calls.
59197    assert(StringRef(Target.getSymbolName()).endswith("_r11") &&
59198           "Unexpected register for an indirect thunk call");
59199 TargetReg = X86::R11;
59200 break;
59201 default:
59202    llvm_unreachable("Unexpected CFI call opcode");
59203 break;
59204 }
59205
59206 return BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(X86::KCFI_CHECK))
59207 .addReg(TargetReg)
59208 .addImm(MBBI->getCFIType())
59209 .getInstr();
59210}
59211
59212/// Returns true if stack probing through a function call is requested.
59213bool X86TargetLowering::hasStackProbeSymbol(const MachineFunction &MF) const {
59214 return !getStackProbeSymbolName(MF).empty();
59215}
59216
59217/// Returns true if stack probing through inline assembly is requested.
59218bool X86TargetLowering::hasInlineStackProbe(const MachineFunction &MF) const {
59219
59220  // No inline stack probes for Windows; it has its own mechanism.
59221 if (Subtarget.isOSWindows() ||
59222 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
59223 return false;
59224
59225 // If the function specifically requests inline stack probes, emit them.
59226 if (MF.getFunction().hasFnAttribute("probe-stack"))
59227 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
59228 "inline-asm";
59229
59230 return false;
59231}
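// Illustrative note (not part of this file): "probe-stack" is a plain string
// function attribute, e.g. in IR (hypothetical functions and symbol name):
//
//   define void @f() "probe-stack"="inline-asm" { ... }   ; inline probing here
//   define void @g() "probe-stack"="my_probe"  { ... }    ; call @my_probe
//
// The "inline-asm" value is handled by this hook; any other value is returned
// verbatim by getStackProbeSymbolName() below.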
59232
59233/// Returns the name of the symbol used to emit stack probes or the empty
59234/// string if not applicable.
59235StringRef
59236X86TargetLowering::getStackProbeSymbolName(const MachineFunction &MF) const {
59237  // Inline stack probes disable the stack probe call.
59238 if (hasInlineStackProbe(MF))
59239 return "";
59240
59241 // If the function specifically requests stack probes, emit them.
59242 if (MF.getFunction().hasFnAttribute("probe-stack"))
59243 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();
59244
59245 // Generally, if we aren't on Windows, the platform ABI does not include
59246 // support for stack probes, so don't emit them.
59247 if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO() ||
59248 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
59249 return "";
59250
59251 // We need a stack probe to conform to the Windows ABI. Choose the right
59252 // symbol.
59253 if (Subtarget.is64Bit())
59254 return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
59255 return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
59256}
59257
59258unsigned
59259X86TargetLowering::getStackProbeSize(const MachineFunction &MF) const {
59260  // The default stack probe size is 4096 if the function has no
59261  // "stack-probe-size" attribute.
59262 return MF.getFunction().getFnAttributeAsParsedInteger("stack-probe-size",
59263 4096);
59264}
59265
59266Align X86TargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
59267 if (ML->isInnermost() &&
59268 ExperimentalPrefInnermostLoopAlignment.getNumOccurrences())
59269 return Align(1ULL << ExperimentalPrefInnermostLoopAlignment);
59270 return TargetLowering::getPrefLoopAlignment();
59271}
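// Illustrative note (not part of this file): the experimental option value is a
// log2 exponent, so a value of 4 yields Align(1ULL << 4), i.e. 16-byte
// alignment for innermost loops; without the option the generic preferred loop
// alignment is used.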

/build/source/llvm/include/llvm/CodeGen/SelectionDAGNodes.h

1//===- llvm/CodeGen/SelectionDAGNodes.h - SelectionDAG Nodes ----*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file declares the SDNode class and derived classes, which are used to
10// represent the nodes and operations present in a SelectionDAG. These nodes
11// and operations are machine code level operations, with some similarities to
12// the GCC RTL representation.
13//
14// Clients should include the SelectionDAG.h file instead of this file directly.
15//
16//===----------------------------------------------------------------------===//
17
18#ifndef LLVM_CODEGEN_SELECTIONDAGNODES_H
19#define LLVM_CODEGEN_SELECTIONDAGNODES_H
20
21#include "llvm/ADT/APFloat.h"
22#include "llvm/ADT/ArrayRef.h"
23#include "llvm/ADT/BitVector.h"
24#include "llvm/ADT/FoldingSet.h"
25#include "llvm/ADT/GraphTraits.h"
26#include "llvm/ADT/SmallPtrSet.h"
27#include "llvm/ADT/SmallVector.h"
28#include "llvm/ADT/ilist_node.h"
29#include "llvm/ADT/iterator.h"
30#include "llvm/ADT/iterator_range.h"
31#include "llvm/CodeGen/ISDOpcodes.h"
32#include "llvm/CodeGen/MachineMemOperand.h"
33#include "llvm/CodeGen/MachineValueType.h"
34#include "llvm/CodeGen/Register.h"
35#include "llvm/CodeGen/ValueTypes.h"
36#include "llvm/IR/Constants.h"
37#include "llvm/IR/DebugLoc.h"
38#include "llvm/IR/Instruction.h"
39#include "llvm/IR/Instructions.h"
40#include "llvm/IR/Metadata.h"
41#include "llvm/IR/Operator.h"
42#include "llvm/Support/AlignOf.h"
43#include "llvm/Support/AtomicOrdering.h"
44#include "llvm/Support/Casting.h"
45#include "llvm/Support/ErrorHandling.h"
46#include "llvm/Support/TypeSize.h"
47#include <algorithm>
48#include <cassert>
49#include <climits>
50#include <cstddef>
51#include <cstdint>
52#include <cstring>
53#include <iterator>
54#include <string>
55#include <tuple>
56#include <utility>
57
58namespace llvm {
59
60class APInt;
61class Constant;
62class GlobalValue;
63class MachineBasicBlock;
64class MachineConstantPoolValue;
65class MCSymbol;
66class raw_ostream;
67class SDNode;
68class SelectionDAG;
69class Type;
70class Value;
71
72void checkForCycles(const SDNode *N, const SelectionDAG *DAG = nullptr,
73 bool force = false);
74
75/// This represents a list of ValueType's that has been intern'd by
76/// a SelectionDAG. Instances of this simple value class are returned by
77/// SelectionDAG::getVTList(...).
78///
79struct SDVTList {
80 const EVT *VTs;
81 unsigned int NumVTs;
82};
83
84namespace ISD {
85
86 /// Node predicates
87
88/// If N is a BUILD_VECTOR or SPLAT_VECTOR node whose elements are all the
89/// same constant or undefined, return true and return the constant value in
90/// \p SplatValue.
91bool isConstantSplatVector(const SDNode *N, APInt &SplatValue);
92
93/// Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where
94/// all of the elements are ~0 or undef. If \p BuildVectorOnly is set to
95/// true, it only checks BUILD_VECTOR.
96bool isConstantSplatVectorAllOnes(const SDNode *N,
97 bool BuildVectorOnly = false);
98
99/// Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where
100/// all of the elements are 0 or undef. If \p BuildVectorOnly is set to true, it
101/// only checks BUILD_VECTOR.
102bool isConstantSplatVectorAllZeros(const SDNode *N,
103 bool BuildVectorOnly = false);
104
105/// Return true if the specified node is a BUILD_VECTOR where all of the
106/// elements are ~0 or undef.
107bool isBuildVectorAllOnes(const SDNode *N);
108
109/// Return true if the specified node is a BUILD_VECTOR where all of the
110/// elements are 0 or undef.
111bool isBuildVectorAllZeros(const SDNode *N);
112
113/// Return true if the specified node is a BUILD_VECTOR node of all
114/// ConstantSDNode or undef.
115bool isBuildVectorOfConstantSDNodes(const SDNode *N);
116
117/// Return true if the specified node is a BUILD_VECTOR node of all
118/// ConstantFPSDNode or undef.
119bool isBuildVectorOfConstantFPSDNodes(const SDNode *N);
120
121/// Returns true if the specified node is a vector where all elements can
122/// be truncated to the specified element size without a loss in meaning.
123bool isVectorShrinkable(const SDNode *N, unsigned NewEltSize, bool Signed);
124
125/// Return true if the node has at least one operand and all operands of the
126/// specified node are ISD::UNDEF.
127bool allOperandsUndef(const SDNode *N);
128
129/// Return true if the specified node is FREEZE(UNDEF).
130bool isFreezeUndef(const SDNode *N);
131
132} // end namespace ISD
133
134//===----------------------------------------------------------------------===//
135/// Unlike LLVM values, Selection DAG nodes may return multiple
136/// values as the result of a computation. Many nodes return multiple values,
137/// from loads (which define a token and a return value) to ADDC (which returns
138/// a result and a carry value), to calls (which may return an arbitrary number
139/// of values).
140///
141/// As such, each use of a SelectionDAG computation must indicate the node that
142/// computes it as well as which return value to use from that node. This pair
143/// of information is represented with the SDValue value type.
144///
145class SDValue {
146 friend struct DenseMapInfo<SDValue>;
147
148 SDNode *Node = nullptr; // The node defining the value we are using.
4. Null pointer value stored to 'PreservedSrc.Node'
149 unsigned ResNo = 0; // Which return value of the node we are using.
150
151public:
152 SDValue() = default;
5. Returning without writing to 'this->Node'
153 SDValue(SDNode *node, unsigned resno);
154
155 /// get the index which selects a specific result in the SDNode
156 unsigned getResNo() const { return ResNo; }
157
158 /// get the SDNode which holds the desired result
159 SDNode *getNode() const { return Node; }
160
161 /// set the SDNode
162 void setNode(SDNode *N) { Node = N; }
163
164 inline SDNode *operator->() const { return Node; }
165
166 bool operator==(const SDValue &O) const {
167 return Node == O.Node && ResNo == O.ResNo;
168 }
169 bool operator!=(const SDValue &O) const {
170 return !operator==(O);
171 }
172 bool operator<(const SDValue &O) const {
173 return std::tie(Node, ResNo) < std::tie(O.Node, O.ResNo);
174 }
175 explicit operator bool() const {
176 return Node != nullptr;
177 }
178
179 SDValue getValue(unsigned R) const {
180 return SDValue(Node, R);
181 }
182
183 /// Return true if this node is an operand of N.
184 bool isOperandOf(const SDNode *N) const;
185
186 /// Return the ValueType of the referenced return value.
187 inline EVT getValueType() const;
188
189 /// Return the simple ValueType of the referenced return value.
190 MVT getSimpleValueType() const {
191 return getValueType().getSimpleVT();
192 }
193
194 /// Returns the size of the value in bits.
195 ///
196 /// If the value type is a scalable vector type, the scalable property will
197 /// be set and the runtime size will be a positive integer multiple of the
198 /// base size.
199 TypeSize getValueSizeInBits() const {
200 return getValueType().getSizeInBits();
201 }
202
203 uint64_t getScalarValueSizeInBits() const {
204 return getValueType().getScalarType().getFixedSizeInBits();
205 }
206
207 // Forwarding methods - These forward to the corresponding methods in SDNode.
208 inline unsigned getOpcode() const;
209 inline unsigned getNumOperands() const;
210 inline const SDValue &getOperand(unsigned i) const;
211 inline uint64_t getConstantOperandVal(unsigned i) const;
212 inline const APInt &getConstantOperandAPInt(unsigned i) const;
213 inline bool isTargetMemoryOpcode() const;
214 inline bool isTargetOpcode() const;
215 inline bool isMachineOpcode() const;
216 inline bool isUndef() const;
217 inline unsigned getMachineOpcode() const;
218 inline const DebugLoc &getDebugLoc() const;
219 inline void dump() const;
220 inline void dump(const SelectionDAG *G) const;
221 inline void dumpr() const;
222 inline void dumpr(const SelectionDAG *G) const;
223
224 /// Return true if this operand (which must be a chain) reaches the
225 /// specified operand without crossing any side-effecting instructions.
226 /// In practice, this looks through token factors and non-volatile loads.
227 /// In order to remain efficient, this only
228 /// looks a couple of nodes in, it does not do an exhaustive search.
229 bool reachesChainWithoutSideEffects(SDValue Dest,
230 unsigned Depth = 2) const;
231
232 /// Return true if there are no nodes using value ResNo of Node.
233 inline bool use_empty() const;
234
235 /// Return true if there is exactly one node using value ResNo of Node.
236 inline bool hasOneUse() const;
237};
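// Illustrative sketch (not part of this header): client code treats an SDValue
// as a (node, result number) pair. The names LoadVal and Chain below are
// hypothetical.
//
//   SDValue LoadVal = ...;                  // result 0 of some load node
//   SDValue Chain   = LoadVal.getValue(1);  // result 1: the output chain
//   if (!LoadVal)                           // operator bool: a null Node means
//     return SDValue();                     //   "no value" / failed combine
//   SDNode *N  = LoadVal.getNode();         // the defining node
//   unsigned R = LoadVal.getResNo();        // which of N's results this is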
238
239template<> struct DenseMapInfo<SDValue> {
240 static inline SDValue getEmptyKey() {
241 SDValue V;
242 V.ResNo = -1U;
243 return V;
244 }
245
246 static inline SDValue getTombstoneKey() {
247 SDValue V;
248 V.ResNo = -2U;
249 return V;
250 }
251
252 static unsigned getHashValue(const SDValue &Val) {
253 return ((unsigned)((uintptr_t)Val.getNode() >> 4) ^
254 (unsigned)((uintptr_t)Val.getNode() >> 9)) + Val.getResNo();
255 }
256
257 static bool isEqual(const SDValue &LHS, const SDValue &RHS) {
258 return LHS == RHS;
259 }
260};
261
262/// Allow casting operators to work directly on
263/// SDValues as if they were SDNode*'s.
264template<> struct simplify_type<SDValue> {
265 using SimpleType = SDNode *;
266
267 static SimpleType getSimplifiedValue(SDValue &Val) {
268 return Val.getNode();
269 }
270};
271template<> struct simplify_type<const SDValue> {
272 using SimpleType = /*const*/ SDNode *;
273
274 static SimpleType getSimplifiedValue(const SDValue &Val) {
275 return Val.getNode();
276 }
277};
278
279/// Represents a use of a SDNode. This class holds an SDValue,
280/// which records the SDNode being used and the result number, a
281/// pointer to the SDNode using the value, and Next and Prev pointers,
282/// which link together all the uses of an SDNode.
283///
284class SDUse {
285 /// Val - The value being used.
286 SDValue Val;
287 /// User - The user of this value.
288 SDNode *User = nullptr;
289 /// Prev, Next - Pointers to the uses list of the SDNode referred by
290 /// this operand.
291 SDUse **Prev = nullptr;
292 SDUse *Next = nullptr;
293
294public:
295 SDUse() = default;
296 SDUse(const SDUse &U) = delete;
297 SDUse &operator=(const SDUse &) = delete;
298
299 /// Normally SDUse will just implicitly convert to an SDValue that it holds.
300 operator const SDValue&() const { return Val; }
301
302 /// If implicit conversion to SDValue doesn't work, the get() method returns
303 /// the SDValue.
304 const SDValue &get() const { return Val; }
305
306 /// This returns the SDNode that contains this Use.
307 SDNode *getUser() { return User; }
308 const SDNode *getUser() const { return User; }
309
310 /// Get the next SDUse in the use list.
311 SDUse *getNext() const { return Next; }
312
313 /// Convenience function for get().getNode().
314 SDNode *getNode() const { return Val.getNode(); }
315 /// Convenience function for get().getResNo().
316 unsigned getResNo() const { return Val.getResNo(); }
317 /// Convenience function for get().getValueType().
318 EVT getValueType() const { return Val.getValueType(); }
319
320 /// Convenience function for get().operator==
321 bool operator==(const SDValue &V) const {
322 return Val == V;
323 }
324
325 /// Convenience function for get().operator!=
326 bool operator!=(const SDValue &V) const {
327 return Val != V;
328 }
329
330 /// Convenience function for get().operator<
331 bool operator<(const SDValue &V) const {
332 return Val < V;
333 }
334
335private:
336 friend class SelectionDAG;
337 friend class SDNode;
338 // TODO: unfriend HandleSDNode once we fix its operand handling.
339 friend class HandleSDNode;
340
341 void setUser(SDNode *p) { User = p; }
342
343 /// Remove this use from its existing use list, assign it the
344 /// given value, and add it to the new value's node's use list.
345 inline void set(const SDValue &V);
346 /// Like set, but only supports initializing a newly-allocated
347 /// SDUse with a non-null value.
348 inline void setInitial(const SDValue &V);
349 /// Like set, but only sets the Node portion of the value,
350 /// leaving the ResNo portion unmodified.
351 inline void setNode(SDNode *N);
352
353 void addToList(SDUse **List) {
354 Next = *List;
355 if (Next) Next->Prev = &Next;
356 Prev = List;
357 *List = this;
358 }
359
360 void removeFromList() {
361 *Prev = Next;
362 if (Next) Next->Prev = Prev;
363 }
364};
365
366/// simplify_type specializations - Allow casting operators to work directly on
367/// SDValues as if they were SDNode*'s.
368template<> struct simplify_type<SDUse> {
369 using SimpleType = SDNode *;
370
371 static SimpleType getSimplifiedValue(SDUse &Val) {
372 return Val.getNode();
373 }
374};
375
376/// These are IR-level optimization flags that may be propagated to SDNodes.
377/// TODO: This data structure should be shared by the IR optimizer and
378/// the backend.
379struct SDNodeFlags {
380private:
381 bool NoUnsignedWrap : 1;
382 bool NoSignedWrap : 1;
383 bool Exact : 1;
384 bool NoNaNs : 1;
385 bool NoInfs : 1;
386 bool NoSignedZeros : 1;
387 bool AllowReciprocal : 1;
388 bool AllowContract : 1;
389 bool ApproximateFuncs : 1;
390 bool AllowReassociation : 1;
391
392 // We assume instructions do not raise floating-point exceptions by default,
393 // and only those marked explicitly may do so. We could choose to represent
394  // this via a positive "FPExcept" flag like on the MI level, but having a
395 // negative "NoFPExcept" flag here (that defaults to true) makes the flag
396 // intersection logic more straightforward.
397 bool NoFPExcept : 1;
398
399public:
400 /// Default constructor turns off all optimization flags.
401 SDNodeFlags()
402 : NoUnsignedWrap(false), NoSignedWrap(false), Exact(false), NoNaNs(false),
403 NoInfs(false), NoSignedZeros(false), AllowReciprocal(false),
404 AllowContract(false), ApproximateFuncs(false),
405 AllowReassociation(false), NoFPExcept(false) {}
406
407 /// Propagate the fast-math-flags from an IR FPMathOperator.
408 void copyFMF(const FPMathOperator &FPMO) {
409 setNoNaNs(FPMO.hasNoNaNs());
410 setNoInfs(FPMO.hasNoInfs());
411 setNoSignedZeros(FPMO.hasNoSignedZeros());
412 setAllowReciprocal(FPMO.hasAllowReciprocal());
413 setAllowContract(FPMO.hasAllowContract());
414 setApproximateFuncs(FPMO.hasApproxFunc());
415 setAllowReassociation(FPMO.hasAllowReassoc());
416 }
417
418 // These are mutators for each flag.
419 void setNoUnsignedWrap(bool b) { NoUnsignedWrap = b; }
420 void setNoSignedWrap(bool b) { NoSignedWrap = b; }
421 void setExact(bool b) { Exact = b; }
422 void setNoNaNs(bool b) { NoNaNs = b; }
423 void setNoInfs(bool b) { NoInfs = b; }
424 void setNoSignedZeros(bool b) { NoSignedZeros = b; }
425 void setAllowReciprocal(bool b) { AllowReciprocal = b; }
426 void setAllowContract(bool b) { AllowContract = b; }
427 void setApproximateFuncs(bool b) { ApproximateFuncs = b; }
428 void setAllowReassociation(bool b) { AllowReassociation = b; }
429 void setNoFPExcept(bool b) { NoFPExcept = b; }
430
431 // These are accessors for each flag.
432 bool hasNoUnsignedWrap() const { return NoUnsignedWrap; }
433 bool hasNoSignedWrap() const { return NoSignedWrap; }
434 bool hasExact() const { return Exact; }
435 bool hasNoNaNs() const { return NoNaNs; }
436 bool hasNoInfs() const { return NoInfs; }
437 bool hasNoSignedZeros() const { return NoSignedZeros; }
438 bool hasAllowReciprocal() const { return AllowReciprocal; }
439 bool hasAllowContract() const { return AllowContract; }
440 bool hasApproximateFuncs() const { return ApproximateFuncs; }
441 bool hasAllowReassociation() const { return AllowReassociation; }
442 bool hasNoFPExcept() const { return NoFPExcept; }
443
444 /// Clear any flags in this flag set that aren't also set in Flags. All
445 /// flags will be cleared if Flags are undefined.
446 void intersectWith(const SDNodeFlags Flags) {
447 NoUnsignedWrap &= Flags.NoUnsignedWrap;
448 NoSignedWrap &= Flags.NoSignedWrap;
449 Exact &= Flags.Exact;
450 NoNaNs &= Flags.NoNaNs;
451 NoInfs &= Flags.NoInfs;
452 NoSignedZeros &= Flags.NoSignedZeros;
453 AllowReciprocal &= Flags.AllowReciprocal;
454 AllowContract &= Flags.AllowContract;
455 ApproximateFuncs &= Flags.ApproximateFuncs;
456 AllowReassociation &= Flags.AllowReassociation;
457 NoFPExcept &= Flags.NoFPExcept;
458 }
459};
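// Illustrative sketch (not part of this header): typical flag propagation when
// building a node from an IR instruction. The instruction reference I and the
// node ExistingNode are hypothetical.
//
//   SDNodeFlags Flags;
//   if (const auto *FPOp = dyn_cast<FPMathOperator>(&I))
//     Flags.copyFMF(*FPOp);                         // import fast-math flags
//   if (const auto *OBO = dyn_cast<OverflowingBinaryOperator>(&I)) {
//     Flags.setNoUnsignedWrap(OBO->hasNoUnsignedWrap());
//     Flags.setNoSignedWrap(OBO->hasNoSignedWrap());
//   }
//   Flags.intersectWith(ExistingNode->getFlags());  // keep only common flags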
460
461/// Represents one node in the SelectionDAG.
462///
463class SDNode : public FoldingSetNode, public ilist_node<SDNode> {
464private:
465 /// The operation that this node performs.
466 int32_t NodeType;
467
468public:
469 /// Unique and persistent id per SDNode in the DAG. Used for debug printing.
470 /// We do not place that under `#if LLVM_ENABLE_ABI_BREAKING_CHECKS`
471 /// intentionally because it adds unneeded complexity without noticeable
472 /// benefits (see discussion with @thakis in D120714).
473 uint16_t PersistentId = 0xffff;
474
475protected:
476 // We define a set of mini-helper classes to help us interpret the bits in our
477 // SubclassData. These are designed to fit within a uint16_t so they pack
478 // with PersistentId.
479
480#if defined(_AIX) && (!defined(__GNUC__) || defined(__clang__))
481// Except for GCC; by default, AIX compilers store bit-fields in 4-byte words
482// and give the `pack` pragma push semantics.
483#define BEGIN_TWO_BYTE_PACK() _Pragma("pack(2)")
484#define END_TWO_BYTE_PACK() _Pragma("pack(pop)")
485#else
486#define BEGIN_TWO_BYTE_PACK()
487#define END_TWO_BYTE_PACK()
488#endif
489
490BEGIN_TWO_BYTE_PACK()
491 class SDNodeBitfields {
492 friend class SDNode;
493 friend class MemIntrinsicSDNode;
494 friend class MemSDNode;
495 friend class SelectionDAG;
496
497 uint16_t HasDebugValue : 1;
498 uint16_t IsMemIntrinsic : 1;
499 uint16_t IsDivergent : 1;
500 };
501 enum { NumSDNodeBits = 3 };
502
503 class ConstantSDNodeBitfields {
504 friend class ConstantSDNode;
505
506 uint16_t : NumSDNodeBits;
507
508 uint16_t IsOpaque : 1;
509 };
510
511 class MemSDNodeBitfields {
512 friend class MemSDNode;
513 friend class MemIntrinsicSDNode;
514 friend class AtomicSDNode;
515
516 uint16_t : NumSDNodeBits;
517
518 uint16_t IsVolatile : 1;
519 uint16_t IsNonTemporal : 1;
520 uint16_t IsDereferenceable : 1;
521 uint16_t IsInvariant : 1;
522 };
523 enum { NumMemSDNodeBits = NumSDNodeBits + 4 };
524
525 class LSBaseSDNodeBitfields {
526 friend class LSBaseSDNode;
527 friend class VPBaseLoadStoreSDNode;
528 friend class MaskedLoadStoreSDNode;
529 friend class MaskedGatherScatterSDNode;
530 friend class VPGatherScatterSDNode;
531
532 uint16_t : NumMemSDNodeBits;
533
534 // This storage is shared between disparate class hierarchies to hold an
535 // enumeration specific to the class hierarchy in use.
536 // LSBaseSDNode => enum ISD::MemIndexedMode
537 // VPBaseLoadStoreSDNode => enum ISD::MemIndexedMode
538 // MaskedLoadStoreSDNode => enum ISD::MemIndexedMode
539 // VPGatherScatterSDNode => enum ISD::MemIndexType
540 // MaskedGatherScatterSDNode => enum ISD::MemIndexType
541 uint16_t AddressingMode : 3;
542 };
543 enum { NumLSBaseSDNodeBits = NumMemSDNodeBits + 3 };
544
545 class LoadSDNodeBitfields {
546 friend class LoadSDNode;
547 friend class VPLoadSDNode;
548 friend class VPStridedLoadSDNode;
549 friend class MaskedLoadSDNode;
550 friend class MaskedGatherSDNode;
551 friend class VPGatherSDNode;
552
553 uint16_t : NumLSBaseSDNodeBits;
554
555 uint16_t ExtTy : 2; // enum ISD::LoadExtType
556 uint16_t IsExpanding : 1;
557 };
558
559 class StoreSDNodeBitfields {
560 friend class StoreSDNode;
561 friend class VPStoreSDNode;
562 friend class VPStridedStoreSDNode;
563 friend class MaskedStoreSDNode;
564 friend class MaskedScatterSDNode;
565 friend class VPScatterSDNode;
566
567 uint16_t : NumLSBaseSDNodeBits;
568
569 uint16_t IsTruncating : 1;
570 uint16_t IsCompressing : 1;
571 };
572
573 union {
574 char RawSDNodeBits[sizeof(uint16_t)];
575 SDNodeBitfields SDNodeBits;
576 ConstantSDNodeBitfields ConstantSDNodeBits;
577 MemSDNodeBitfields MemSDNodeBits;
578 LSBaseSDNodeBitfields LSBaseSDNodeBits;
579 LoadSDNodeBitfields LoadSDNodeBits;
580 StoreSDNodeBitfields StoreSDNodeBits;
581 };
582END_TWO_BYTE_PACK()
583#undef BEGIN_TWO_BYTE_PACK
584#undef END_TWO_BYTE_PACK
585
586 // RawSDNodeBits must cover the entirety of the union. This means that all of
587 // the union's members must have size <= RawSDNodeBits. We write the RHS as
588 // "2" instead of sizeof(RawSDNodeBits) because MSVC can't handle the latter.
589 static_assert(sizeof(SDNodeBitfields) <= 2, "field too wide");
590 static_assert(sizeof(ConstantSDNodeBitfields) <= 2, "field too wide");
591 static_assert(sizeof(MemSDNodeBitfields) <= 2, "field too wide");
592 static_assert(sizeof(LSBaseSDNodeBitfields) <= 2, "field too wide");
593 static_assert(sizeof(LoadSDNodeBitfields) <= 2, "field too wide");
594 static_assert(sizeof(StoreSDNodeBitfields) <= 2, "field too wide");
595
596private:
597 friend class SelectionDAG;
598 // TODO: unfriend HandleSDNode once we fix its operand handling.
599 friend class HandleSDNode;
600
601 /// Unique id per SDNode in the DAG.
602 int NodeId = -1;
603
604 /// The values that are used by this operation.
605 SDUse *OperandList = nullptr;
606
607 /// The types of the values this node defines. SDNode's may
608 /// define multiple values simultaneously.
609 const EVT *ValueList;
610
611 /// List of uses for this SDNode.
612 SDUse *UseList = nullptr;
613
614 /// The number of entries in the Operand/Value list.
615 unsigned short NumOperands = 0;
616 unsigned short NumValues;
617
618 // The ordering of the SDNodes. It roughly corresponds to the ordering of the
619 // original LLVM instructions.
620 // This is used for turning off scheduling, because we'll forgo
621 // the normal scheduling algorithms and output the instructions according to
622 // this ordering.
623 unsigned IROrder;
624
625 /// Source line information.
626 DebugLoc debugLoc;
627
628 /// Return a pointer to the specified value type.
629 static const EVT *getValueTypeList(EVT VT);
630
631 SDNodeFlags Flags;
632
633 uint32_t CFIType = 0;
634
635public:
636 //===--------------------------------------------------------------------===//
637 // Accessors
638 //
639
640 /// Return the SelectionDAG opcode value for this node. For
641 /// pre-isel nodes (those for which isMachineOpcode returns false), these
642 /// are the opcode values in the ISD and <target>ISD namespaces. For
643 /// post-isel opcodes, see getMachineOpcode.
644 unsigned getOpcode() const { return (unsigned)NodeType; }
645
646 /// Test if this node has a target-specific opcode (in the
647 /// \<target\>ISD namespace).
648 bool isTargetOpcode() const { return NodeType >= ISD::BUILTIN_OP_END; }
649
650 /// Test if this node has a target-specific opcode that may raise
651 /// FP exceptions (in the \<target\>ISD namespace and greater than
652 /// FIRST_TARGET_STRICTFP_OPCODE). Note that all target memory
653 /// opcodes are currently automatically considered to possibly raise
654 /// FP exceptions as well.
655 bool isTargetStrictFPOpcode() const {
656 return NodeType >= ISD::FIRST_TARGET_STRICTFP_OPCODE;
657 }
658
659 /// Test if this node has a target-specific
660 /// memory-referencing opcode (in the \<target\>ISD namespace and
661 /// greater than FIRST_TARGET_MEMORY_OPCODE).
662 bool isTargetMemoryOpcode() const {
663 return NodeType >= ISD::FIRST_TARGET_MEMORY_OPCODE;
664 }
665
666 /// Return true if this node is an ISD::UNDEF node.
667 bool isUndef() const { return NodeType == ISD::UNDEF; }
668
669 /// Test if this node is a memory intrinsic (with valid pointer information).
670 /// INTRINSIC_W_CHAIN and INTRINSIC_VOID nodes are sometimes created for
671 /// non-memory intrinsics (with chains) that are not really instances of
672 /// MemSDNode. For such nodes, we need some extra state to determine the
673 /// proper classof relationship.
674 bool isMemIntrinsic() const {
675 return (NodeType == ISD::INTRINSIC_W_CHAIN ||
676 NodeType == ISD::INTRINSIC_VOID) &&
677 SDNodeBits.IsMemIntrinsic;
678 }
679
680 /// Test if this node is a strict floating point pseudo-op.
681 bool isStrictFPOpcode() {
682 switch (NodeType) {
683 default:
684 return false;
685 case ISD::STRICT_FP16_TO_FP:
686 case ISD::STRICT_FP_TO_FP16:
687#define DAG_INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \
688 case ISD::STRICT_##DAGN:
689#include "llvm/IR/ConstrainedOps.def"
690 return true;
691 }
692 }
693
694 /// Test if this node is a vector predication operation.
695 bool isVPOpcode() const { return ISD::isVPOpcode(getOpcode()); }
696
697 /// Test if this node has a post-isel opcode, directly
698 /// corresponding to a MachineInstr opcode.
699 bool isMachineOpcode() const { return NodeType < 0; }
700
701 /// This may only be called if isMachineOpcode returns
702 /// true. It returns the MachineInstr opcode value that the node's opcode
703 /// corresponds to.
704 unsigned getMachineOpcode() const {
705    assert(isMachineOpcode() && "Not a MachineInstr opcode!");
706 return ~NodeType;
707 }
708
709 bool getHasDebugValue() const { return SDNodeBits.HasDebugValue; }
710 void setHasDebugValue(bool b) { SDNodeBits.HasDebugValue = b; }
711
712 bool isDivergent() const { return SDNodeBits.IsDivergent; }
713
714 /// Return true if there are no uses of this node.
715 bool use_empty() const { return UseList == nullptr; }
716
717 /// Return true if there is exactly one use of this node.
718 bool hasOneUse() const { return hasSingleElement(uses()); }
719
720 /// Return the number of uses of this node. This method takes
721 /// time proportional to the number of uses.
722 size_t use_size() const { return std::distance(use_begin(), use_end()); }
723
724 /// Return the unique node id.
725 int getNodeId() const { return NodeId; }
726
727 /// Set unique node id.
728 void setNodeId(int Id) { NodeId = Id; }
729
730 /// Return the node ordering.
731 unsigned getIROrder() const { return IROrder; }
732
733 /// Set the node ordering.
734 void setIROrder(unsigned Order) { IROrder = Order; }
735
736 /// Return the source location info.
737 const DebugLoc &getDebugLoc() const { return debugLoc; }
738
739 /// Set source location info. Try to avoid this, putting
740 /// it in the constructor is preferable.
741 void setDebugLoc(DebugLoc dl) { debugLoc = std::move(dl); }
742
743 /// This class provides iterator support for SDUse
744 /// operands that use a specific SDNode.
745 class use_iterator {
746 friend class SDNode;
747
748 SDUse *Op = nullptr;
749
750 explicit use_iterator(SDUse *op) : Op(op) {}
751
752 public:
753 using iterator_category = std::forward_iterator_tag;
754 using value_type = SDUse;
755 using difference_type = std::ptrdiff_t;
756 using pointer = value_type *;
757 using reference = value_type &;
758
759 use_iterator() = default;
760 use_iterator(const use_iterator &I) = default;
761 use_iterator &operator=(const use_iterator &) = default;
762
763 bool operator==(const use_iterator &x) const { return Op == x.Op; }
764 bool operator!=(const use_iterator &x) const {
765 return !operator==(x);
766 }
767
768 /// Return true if this iterator is at the end of uses list.
769 bool atEnd() const { return Op == nullptr; }
770
771 // Iterator traversal: forward iteration only.
772 use_iterator &operator++() { // Preincrement
773      assert(Op && "Cannot increment end iterator!");
774 Op = Op->getNext();
775 return *this;
776 }
777
778 use_iterator operator++(int) { // Postincrement
779 use_iterator tmp = *this; ++*this; return tmp;
780 }
781
782 /// Retrieve a pointer to the current user node.
783 SDNode *operator*() const {
784      assert(Op && "Cannot dereference end iterator!");
785 return Op->getUser();
786 }
787
788 SDNode *operator->() const { return operator*(); }
789
790 SDUse &getUse() const { return *Op; }
791
792 /// Retrieve the operand # of this use in its user.
793 unsigned getOperandNo() const {
794      assert(Op && "Cannot dereference end iterator!");
795 return (unsigned)(Op - Op->getUser()->OperandList);
796 }
797 };
798
799 /// Provide iteration support to walk over all uses of an SDNode.
800 use_iterator use_begin() const {
801 return use_iterator(UseList);
802 }
803
804 static use_iterator use_end() { return use_iterator(nullptr); }
805
806 inline iterator_range<use_iterator> uses() {
807 return make_range(use_begin(), use_end());
808 }
809 inline iterator_range<use_iterator> uses() const {
810 return make_range(use_begin(), use_end());
811 }
812
813 /// Return true if there are exactly NUSES uses of the indicated value.
814 /// This method ignores uses of other values defined by this operation.
815 bool hasNUsesOfValue(unsigned NUses, unsigned Value) const;
816
817 /// Return true if there are any uses of the indicated value.
818 /// This method ignores uses of other values defined by this operation.
819 bool hasAnyUseOfValue(unsigned Value) const;
820
821 /// Return true if this node is the only use of N.
822 bool isOnlyUserOf(const SDNode *N) const;
823
824 /// Return true if this node is an operand of N.
825 bool isOperandOf(const SDNode *N) const;
826
827 /// Return true if this node is a predecessor of N.
828 /// NOTE: Implemented on top of hasPredecessor and every bit as
829 /// expensive. Use carefully.
830 bool isPredecessorOf(const SDNode *N) const {
831 return N->hasPredecessor(this);
832 }
833
834 /// Return true if N is a predecessor of this node.
835 /// N is either an operand of this node, or can be reached by recursively
836 /// traversing up the operands.
837 /// NOTE: This is an expensive method. Use it carefully.
838 bool hasPredecessor(const SDNode *N) const;
839
840 /// Returns true if N is a predecessor of any node in Worklist. This
841 /// helper keeps the Visited and Worklist sets external so that unions of
842 /// searches can be performed in parallel, results can be cached across
843 /// queries, and nodes can be added to Worklist incrementally. Stops early
844 /// if N is found, but the search can be resumed. Remember to clear Visited
845 /// and Worklist if the DAG changes. MaxSteps gives a maximum number of nodes to visit before
846 /// giving up. The TopologicalPrune flag signals that positive NodeIds are
847 /// topologically ordered (Operands have strictly smaller node id) and search
848 /// can be pruned leveraging this.
849 static bool hasPredecessorHelper(const SDNode *N,
850 SmallPtrSetImpl<const SDNode *> &Visited,
851 SmallVectorImpl<const SDNode *> &Worklist,
852 unsigned int MaxSteps = 0,
853 bool TopologicalPrune = false) {
854 SmallVector<const SDNode *, 8> DeferredNodes;
855 if (Visited.count(N))
856 return true;
857
858 // Node Id's are assigned in three places: As a topological
859 // ordering (> 0), during legalization (results in values set to
860 // 0), new nodes (set to -1). If N has a topological id then we
861 // know that all nodes with ids smaller than it cannot be
862 // successors and we need not check them. Filter out all nodes
863 // that can't be matches. We add them to the worklist before exit
864 // in case of multiple calls. Note that during selection the topological id
865 // may be violated if a node's predecessor is selected before it. We mark
866 // this at selection by negating the id of unselected successors and
867 // restricting topological pruning to positive ids.
868
869 int NId = N->getNodeId();
870 // If we Invalidated the Id, reconstruct original NId.
871 if (NId < -1)
872 NId = -(NId + 1);
873
874 bool Found = false;
875 while (!Worklist.empty()) {
876 const SDNode *M = Worklist.pop_back_val();
877 int MId = M->getNodeId();
878 if (TopologicalPrune && M->getOpcode() != ISD::TokenFactor && (NId > 0) &&
879 (MId > 0) && (MId < NId)) {
880 DeferredNodes.push_back(M);
881 continue;
882 }
883 for (const SDValue &OpV : M->op_values()) {
884 SDNode *Op = OpV.getNode();
885 if (Visited.insert(Op).second)
886 Worklist.push_back(Op);
887 if (Op == N)
888 Found = true;
889 }
890 if (Found)
891 break;
892 if (MaxSteps != 0 && Visited.size() >= MaxSteps)
893 break;
894 }
895 // Push deferred nodes back on worklist.
896 Worklist.append(DeferredNodes.begin(), DeferredNodes.end());
897 // If we bailed early, conservatively return found.
898 if (MaxSteps != 0 && Visited.size() >= MaxSteps)
899 return true;
900 return Found;
901 }
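  // Illustrative usage (not part of this header), in the style of DAG combines
  // that must avoid creating a cycle. N and Candidate are hypothetical.
  //
  //   SmallPtrSet<const SDNode *, 16> Visited;
  //   SmallVector<const SDNode *, 16> Worklist;
  //   Worklist.push_back(Candidate);
  //   if (SDNode::hasPredecessorHelper(N, Visited, Worklist, /*MaxSteps=*/8192))
  //     return SDValue();  // N reachable from Candidate (or limit hit): bail out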
902
903 /// Return true if all the users of N are contained in Nodes.
904 /// NOTE: Requires at least one match, but doesn't require them all.
905 static bool areOnlyUsersOf(ArrayRef<const SDNode *> Nodes, const SDNode *N);
906
907 /// Return the number of values used by this operation.
908 unsigned getNumOperands() const { return NumOperands; }
909
910 /// Return the maximum number of operands that a SDNode can hold.
911 static constexpr size_t getMaxNumOperands() {
912 return std::numeric_limits<decltype(SDNode::NumOperands)>::max();
913 }
914
915 /// Helper method returns the integer value of a ConstantSDNode operand.
916 inline uint64_t getConstantOperandVal(unsigned Num) const;
917
918 /// Helper method returns the APInt of a ConstantSDNode operand.
919 inline const APInt &getConstantOperandAPInt(unsigned Num) const;
920
921 const SDValue &getOperand(unsigned Num) const {
922    assert(Num < NumOperands && "Invalid child # of SDNode!");
923 return OperandList[Num];
924 }
925
926 using op_iterator = SDUse *;
927
928 op_iterator op_begin() const { return OperandList; }
929 op_iterator op_end() const { return OperandList+NumOperands; }
930 ArrayRef<SDUse> ops() const { return ArrayRef(op_begin(), op_end()); }
931
932 /// Iterator for directly iterating over the operand SDValue's.
933 struct value_op_iterator
934 : iterator_adaptor_base<value_op_iterator, op_iterator,
935 std::random_access_iterator_tag, SDValue,
936 ptrdiff_t, value_op_iterator *,
937 value_op_iterator *> {
938 explicit value_op_iterator(SDUse *U = nullptr)
939 : iterator_adaptor_base(U) {}
940
941 const SDValue &operator*() const { return I->get(); }
942 };
943
944 iterator_range<value_op_iterator> op_values() const {
945 return make_range(value_op_iterator(op_begin()),
946 value_op_iterator(op_end()));
947 }
948
949 SDVTList getVTList() const {
950 SDVTList X = { ValueList, NumValues };
951 return X;
952 }
953
954 /// If this node has a glue operand, return the node
955 /// to which the glue operand points. Otherwise return NULL.
956 SDNode *getGluedNode() const {
957 if (getNumOperands() != 0 &&
958 getOperand(getNumOperands()-1).getValueType() == MVT::Glue)
959 return getOperand(getNumOperands()-1).getNode();
960 return nullptr;
961 }
962
963 /// If this node has a glue value with a user, return
964 /// the user (there is at most one). Otherwise return NULL.
965 SDNode *getGluedUser() const {
966 for (use_iterator UI = use_begin(), UE = use_end(); UI != UE; ++UI)
967 if (UI.getUse().get().getValueType() == MVT::Glue)
968 return *UI;
969 return nullptr;
970 }
971
972 SDNodeFlags getFlags() const { return Flags; }
973 void setFlags(SDNodeFlags NewFlags) { Flags = NewFlags; }
974
975 /// Clear any flags in this node that aren't also set in Flags.
976 /// If Flags is not in a defined state then this has no effect.
977 void intersectFlagsWith(const SDNodeFlags Flags);
978
979 void setCFIType(uint32_t Type) { CFIType = Type; }
980 uint32_t getCFIType() const { return CFIType; }
981
982 /// Return the number of values defined/returned by this operator.
983 unsigned getNumValues() const { return NumValues; }
984
985 /// Return the type of a specified result.
986 EVT getValueType(unsigned ResNo) const {
987    assert(ResNo < NumValues && "Illegal result number!");
988 return ValueList[ResNo];
989 }
990
991 /// Return the type of a specified result as a simple type.
992 MVT getSimpleValueType(unsigned ResNo) const {
993 return getValueType(ResNo).getSimpleVT();
994 }
995
996 /// Returns MVT::getSizeInBits(getValueType(ResNo)).
997 ///
998 /// If the value type is a scalable vector type, the scalable property will
999 /// be set and the runtime size will be a positive integer multiple of the
1000 /// base size.
1001 TypeSize getValueSizeInBits(unsigned ResNo) const {
1002 return getValueType(ResNo).getSizeInBits();
1003 }
1004
1005 using value_iterator = const EVT *;
1006
1007 value_iterator value_begin() const { return ValueList; }
1008 value_iterator value_end() const { return ValueList+NumValues; }
1009 iterator_range<value_iterator> values() const {
1010 return llvm::make_range(value_begin(), value_end());
1011 }
1012
1013 /// Return the opcode of this operation for printing.
1014 std::string getOperationName(const SelectionDAG *G = nullptr) const;
1015 static const char* getIndexedModeName(ISD::MemIndexedMode AM);
1016 void print_types(raw_ostream &OS, const SelectionDAG *G) const;
1017 void print_details(raw_ostream &OS, const SelectionDAG *G) const;
1018 void print(raw_ostream &OS, const SelectionDAG *G = nullptr) const;
1019 void printr(raw_ostream &OS, const SelectionDAG *G = nullptr) const;
1020
1021 /// Print a SelectionDAG node and all children down to
1022 /// the leaves. The given SelectionDAG allows target-specific nodes
1023 /// to be printed in human-readable form. Unlike printr, this will
1024 /// print the whole DAG, including children that appear multiple
1025 /// times.
1026 ///
1027 void printrFull(raw_ostream &O, const SelectionDAG *G = nullptr) const;
1028
1029 /// Print a SelectionDAG node and children up to
1030 /// depth "depth." The given SelectionDAG allows target-specific
1031 /// nodes to be printed in human-readable form. Unlike printr, this
1032 /// will print children that appear multiple times wherever they are
1033 /// used.
1034 ///
1035 void printrWithDepth(raw_ostream &O, const SelectionDAG *G = nullptr,
1036 unsigned depth = 100) const;
1037
1038 /// Dump this node, for debugging.
1039 void dump() const;
1040
1041 /// Dump (recursively) this node and its use-def subgraph.
1042 void dumpr() const;
1043
1044 /// Dump this node, for debugging.
1045 /// The given SelectionDAG allows target-specific nodes to be printed
1046 /// in human-readable form.
1047 void dump(const SelectionDAG *G) const;
1048
1049 /// Dump (recursively) this node and its use-def subgraph.
1050 /// The given SelectionDAG allows target-specific nodes to be printed
1051 /// in human-readable form.
1052 void dumpr(const SelectionDAG *G) const;
1053
1054 /// printrFull to dbgs(). The given SelectionDAG allows
1055 /// target-specific nodes to be printed in human-readable form.
1056 /// Unlike dumpr, this will print the whole DAG, including children
1057 /// that appear multiple times.
1058 void dumprFull(const SelectionDAG *G = nullptr) const;
1059
1060 /// printrWithDepth to dbgs(). The given
1061 /// SelectionDAG allows target-specific nodes to be printed in
1062 /// human-readable form. Unlike dumpr, this will print children
1063 /// that appear multiple times wherever they are used.
1064 ///
1065 void dumprWithDepth(const SelectionDAG *G = nullptr,
1066 unsigned depth = 100) const;
1067
1068 /// Gather unique data for the node.
1069 void Profile(FoldingSetNodeID &ID) const;
1070
1071 /// This method should only be used by the SDUse class.
1072 void addUse(SDUse &U) { U.addToList(&UseList); }
1073
1074protected:
1075 static SDVTList getSDVTList(EVT VT) {
1076 SDVTList Ret = { getValueTypeList(VT), 1 };
1077 return Ret;
1078 }
1079
1080 /// Create an SDNode.
1081 ///
1082 /// SDNodes are created without any operands, and never own the operand
1083 /// storage. To add operands, see SelectionDAG::createOperands.
1084 SDNode(unsigned Opc, unsigned Order, DebugLoc dl, SDVTList VTs)
1085 : NodeType(Opc), ValueList(VTs.VTs), NumValues(VTs.NumVTs),
1086 IROrder(Order), debugLoc(std::move(dl)) {
1087 memset(&RawSDNodeBits, 0, sizeof(RawSDNodeBits));
1088    assert(debugLoc.hasTrivialDestructor() && "Expected trivial destructor");
1089    assert(NumValues == VTs.NumVTs &&
1090           "NumValues wasn't wide enough for its operands!");
1091 }
1092
1093 /// Release the operands and set this node to have zero operands.
1094 void DropOperands();
1095};
1096
1097/// Wrapper class for IR location info (IR ordering and DebugLoc) to be passed
1098/// into SDNode creation functions.
1099/// When an SDNode is created from the DAGBuilder, the DebugLoc is extracted
1100/// from the original Instruction, and IROrder is the ordinal position of
1101/// the instruction.
1102/// When an SDNode is created after the DAG has been built, both the DebugLoc
1103/// and the IROrder are propagated from the original SDNode.
1104/// The SDLoc class therefore provides two constructors besides the default
1105/// one: one for use by the DAGBuilder and one for everyone else.
1106class SDLoc {
1107private:
1108 DebugLoc DL;
1109 int IROrder = 0;
1110
1111public:
1112 SDLoc() = default;
1113 SDLoc(const SDNode *N) : DL(N->getDebugLoc()), IROrder(N->getIROrder()) {}
1114 SDLoc(const SDValue V) : SDLoc(V.getNode()) {}
1115 SDLoc(const Instruction *I, int Order) : IROrder(Order) {
1116 assert(Order >= 0 && "bad IROrder");
1117 if (I)
1118 DL = I->getDebugLoc();
1119 }
1120
1121 unsigned getIROrder() const { return IROrder; }
1122 const DebugLoc &getDebugLoc() const { return DL; }
1123};
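As a rough illustration of the second constructor's use (a minimal sketch, assuming DAG, N, LHS and RHS are a SelectionDAG and nodes already in scope; the helper name is hypothetical), a combine that builds a replacement node typically forwards the location like this:

// Hypothetical helper: propagate N's DebugLoc and IROrder into a new node.
static SDValue buildReplacementAdd(SelectionDAG &DAG, SDNode *N,
                                   SDValue LHS, SDValue RHS) {
  SDLoc DL(N);                       // copies both DebugLoc and IROrder from N
  return DAG.getNode(ISD::ADD, DL, N->getValueType(0), LHS, RHS);
}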
1124
1125// Define inline functions from the SDValue class.
1126
1127inline SDValue::SDValue(SDNode *node, unsigned resno)
1128 : Node(node), ResNo(resno) {
1129 // Explicitly check for !ResNo to avoid use-after-free, because there are
1130 // callers that use SDValue(N, 0) with a deleted N to indicate successful
1131 // combines.
1132 assert((!Node || !ResNo || ResNo < Node->getNumValues()) &&
1133 "Invalid result number for the given node!");
1134 assert(ResNo < -2U && "Cannot use result numbers reserved for DenseMaps.");
1135}
1136
1137inline unsigned SDValue::getOpcode() const {
1138 return Node->getOpcode();
1139}
1140
1141inline EVT SDValue::getValueType() const {
1142 return Node->getValueType(ResNo);
1143}
1144
1145inline unsigned SDValue::getNumOperands() const {
1146 return Node->getNumOperands();
1147}
1148
1149inline const SDValue &SDValue::getOperand(unsigned i) const {
1150 return Node->getOperand(i);
1151}
1152
1153inline uint64_t SDValue::getConstantOperandVal(unsigned i) const {
1154 return Node->getConstantOperandVal(i);
1155}
1156
1157inline const APInt &SDValue::getConstantOperandAPInt(unsigned i) const {
1158 return Node->getConstantOperandAPInt(i);
1159}
1160
1161inline bool SDValue::isTargetOpcode() const {
1162 return Node->isTargetOpcode();
1163}
1164
1165inline bool SDValue::isTargetMemoryOpcode() const {
1166 return Node->isTargetMemoryOpcode();
1167}
1168
1169inline bool SDValue::isMachineOpcode() const {
1170 return Node->isMachineOpcode();
1171}
1172
1173inline unsigned SDValue::getMachineOpcode() const {
1174 return Node->getMachineOpcode();
1175}
1176
1177inline bool SDValue::isUndef() const {
1178 return Node->isUndef();
17. Called C++ object pointer is null
1179}
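The analyzer note above flags a call through a null Node pointer: this accessor dereferences Node unconditionally, so it must not be reached with a default-constructed or null-node SDValue. A minimal defensive sketch (the helper name is illustrative, not part of this header):

// Illustrative guard: check getNode() before calling members that
// dereference Node, such as isUndef().
static bool isDefinedUndef(SDValue V) {
  return V.getNode() && V.isUndef();
}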
1180
1181inline bool SDValue::use_empty() const {
1182 return !Node->hasAnyUseOfValue(ResNo);
1183}
1184
1185inline bool SDValue::hasOneUse() const {
1186 return Node->hasNUsesOfValue(1, ResNo);
1187}
1188
1189inline const DebugLoc &SDValue::getDebugLoc() const {
1190 return Node->getDebugLoc();
1191}
1192
1193inline void SDValue::dump() const {
1194 return Node->dump();
1195}
1196
1197inline void SDValue::dump(const SelectionDAG *G) const {
1198 return Node->dump(G);
1199}
1200
1201inline void SDValue::dumpr() const {
1202 return Node->dumpr();
1203}
1204
1205inline void SDValue::dumpr(const SelectionDAG *G) const {
1206 return Node->dumpr(G);
1207}
1208
1209// Define inline functions from the SDUse class.
1210
1211inline void SDUse::set(const SDValue &V) {
1212 if (Val.getNode()) removeFromList();
1213 Val = V;
1214 if (V.getNode())
1215 V->addUse(*this);
1216}
1217
1218inline void SDUse::setInitial(const SDValue &V) {
1219 Val = V;
1220 V->addUse(*this);
1221}
1222
1223inline void SDUse::setNode(SDNode *N) {
1224 if (Val.getNode()) removeFromList();
1225 Val.setNode(N);
1226 if (N) N->addUse(*this);
1227}
1228
1229/// This class is used to form a handle around another node that
1230/// is persistent and is updated across invocations of replaceAllUsesWith on its
1231/// operand. This node should be directly created by end-users and not added to
1232/// the AllNodes list.
1233class HandleSDNode : public SDNode {
1234 SDUse Op;
1235
1236public:
1237 explicit HandleSDNode(SDValue X)
1238 : SDNode(ISD::HANDLENODE, 0, DebugLoc(), getSDVTList(MVT::Other)) {
1239 // HandleSDNodes are never inserted into the DAG, so they won't be
1240 // auto-numbered. Use ID 65535 as a sentinel.
1241 PersistentId = 0xffff;
1242
1243 // Manually set up the operand list. This node type is special in that it's
1244 // always stack allocated and SelectionDAG does not manage its operands.
1245 // TODO: This should either (a) not be in the SDNode hierarchy, or (b) not
1246 // be so special.
1247 Op.setUser(this);
1248 Op.setInitial(X);
1249 NumOperands = 1;
1250 OperandList = &Op;
1251 }
1252 ~HandleSDNode();
1253
1254 const SDValue &getValue() const { return Op; }
1255};
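A sketch of the typical usage pattern, under the assumption that DAG, Kept, From and To are an in-scope SelectionDAG and existing values (the helper is hypothetical): a stack-allocated HandleSDNode keeps a value valid and up to date across a replaceAllUsesWith call.

// Illustrative fragment: Handle tracks Kept while other uses are rewritten.
static SDValue replaceAndTrack(SelectionDAG &DAG, SDValue Kept,
                               SDValue From, SDValue To) {
  HandleSDNode Handle(Kept);          // stack allocated, never in AllNodes
  DAG.ReplaceAllUsesWith(From, To);
  return Handle.getValue();           // reflects any replacement of Kept
}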
1256
1257class AddrSpaceCastSDNode : public SDNode {
1258private:
1259 unsigned SrcAddrSpace;
1260 unsigned DestAddrSpace;
1261
1262public:
1263 AddrSpaceCastSDNode(unsigned Order, const DebugLoc &dl, EVT VT,
1264 unsigned SrcAS, unsigned DestAS);
1265
1266 unsigned getSrcAddressSpace() const { return SrcAddrSpace; }
1267 unsigned getDestAddressSpace() const { return DestAddrSpace; }
1268
1269 static bool classof(const SDNode *N) {
1270 return N->getOpcode() == ISD::ADDRSPACECAST;
1271 }
1272};
1273
1274/// This is an abstract virtual class for memory operations.
1275class MemSDNode : public SDNode {
1276private:
1277 // VT of in-memory value.
1278 EVT MemoryVT;
1279
1280protected:
1281 /// Memory reference information.
1282 MachineMemOperand *MMO;
1283
1284public:
1285 MemSDNode(unsigned Opc, unsigned Order, const DebugLoc &dl, SDVTList VTs,
1286 EVT memvt, MachineMemOperand *MMO);
1287
1288 bool readMem() const { return MMO->isLoad(); }
1289 bool writeMem() const { return MMO->isStore(); }
1290
1291 /// Returns the alignment of the memory access.
1292 Align getOriginalAlign() const { return MMO->getBaseAlign(); }
1293 Align getAlign() const { return MMO->getAlign(); }
1294
1295 /// Return the SubclassData value, without HasDebugValue. This contains an
1296 /// encoding of the volatile flag, as well as bits used by subclasses. This
1297 /// function should only be used to compute a FoldingSetNodeID value.
1298 /// The HasDebugValue bit is masked out because the CSE map needs to match
1299 /// nodes with debug info to nodes without debug info. The same applies to
1300 /// the isDivergent bit.
1301 unsigned getRawSubclassData() const {
1302 uint16_t Data;
1303 union {
1304 char RawSDNodeBits[sizeof(uint16_t)];
1305 SDNodeBitfields SDNodeBits;
1306 };
1307 memcpy(&RawSDNodeBits, &this->RawSDNodeBits, sizeof(this->RawSDNodeBits));
1308 SDNodeBits.HasDebugValue = 0;
1309 SDNodeBits.IsDivergent = false;
1310 memcpy(&Data, &RawSDNodeBits, sizeof(RawSDNodeBits));
1311 return Data;
1312 }
1313
1314 bool isVolatile() const { return MemSDNodeBits.IsVolatile; }
1315 bool isNonTemporal() const { return MemSDNodeBits.IsNonTemporal; }
1316 bool isDereferenceable() const { return MemSDNodeBits.IsDereferenceable; }
1317 bool isInvariant() const { return MemSDNodeBits.IsInvariant; }
1318
1319 // Returns the offset from the location of the access.
1320 int64_t getSrcValueOffset() const { return MMO->getOffset(); }
1321
1322 /// Returns the AA info that describes the dereference.
1323 AAMDNodes getAAInfo() const { return MMO->getAAInfo(); }
1324
1325 /// Returns the Ranges that describes the dereference.
1326 const MDNode *getRanges() const { return MMO->getRanges(); }
1327
1328 /// Returns the synchronization scope ID for this memory operation.
1329 SyncScope::ID getSyncScopeID() const { return MMO->getSyncScopeID(); }
1330
1331 /// Return the atomic ordering requirements for this memory operation. For
1332 /// cmpxchg atomic operations, return the atomic ordering requirements when
1333 /// store occurs.
1334 AtomicOrdering getSuccessOrdering() const {
1335 return MMO->getSuccessOrdering();
1336 }
1337
1338 /// Return a single atomic ordering that is at least as strong as both the
1339 /// success and failure orderings for an atomic operation. (For operations
1340 /// other than cmpxchg, this is equivalent to getSuccessOrdering().)
1341 AtomicOrdering getMergedOrdering() const { return MMO->getMergedOrdering(); }
1342
1343 /// Return true if the memory operation ordering is Unordered or higher.
1344 bool isAtomic() const { return MMO->isAtomic(); }
1345
1346 /// Returns true if the memory operation doesn't imply any ordering
1347 /// constraints on surrounding memory operations beyond the normal memory
1348 /// aliasing rules.
1349 bool isUnordered() const { return MMO->isUnordered(); }
1350
1351 /// Returns true if the memory operation is neither atomic nor volatile.
1352 bool isSimple() const { return !isAtomic() && !isVolatile(); }
1353
1354 /// Return the type of the in-memory value.
1355 EVT getMemoryVT() const { return MemoryVT; }
1356
1357 /// Return a MachineMemOperand object describing the memory
1358 /// reference performed by operation.
1359 MachineMemOperand *getMemOperand() const { return MMO; }
1360
1361 const MachinePointerInfo &getPointerInfo() const {
1362 return MMO->getPointerInfo();
1363 }
1364
1365 /// Return the address space for the associated pointer
1366 unsigned getAddressSpace() const {
1367 return getPointerInfo().getAddrSpace();
1368 }
1369
1370 /// Update this MemSDNode's MachineMemOperand information
1371 /// to reflect the alignment of NewMMO, if it has a greater alignment.
1372 /// This must only be used when the new alignment applies to all users of
1373 /// this MachineMemOperand.
1374 void refineAlignment(const MachineMemOperand *NewMMO) {
1375 MMO->refineAlignment(NewMMO);
1376 }
1377
1378 const SDValue &getChain() const { return getOperand(0); }
1379
1380 const SDValue &getBasePtr() const {
1381 switch (getOpcode()) {
1382 case ISD::STORE:
1383 case ISD::VP_STORE:
1384 case ISD::MSTORE:
1385 case ISD::VP_SCATTER:
1386 case ISD::EXPERIMENTAL_VP_STRIDED_STORE:
1387 return getOperand(2);
1388 case ISD::MGATHER:
1389 case ISD::MSCATTER:
1390 return getOperand(3);
1391 default:
1392 return getOperand(1);
1393 }
1394 }
1395
1396 // Methods to support isa and dyn_cast
1397 static bool classof(const SDNode *N) {
1398 // For some targets, we lower some target intrinsics to a MemIntrinsicNode
1399 // with either an intrinsic or a target opcode.
1400 switch (N->getOpcode()) {
1401 case ISD::LOAD:
1402 case ISD::STORE:
1403 case ISD::PREFETCH:
1404 case ISD::ATOMIC_CMP_SWAP:
1405 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
1406 case ISD::ATOMIC_SWAP:
1407 case ISD::ATOMIC_LOAD_ADD:
1408 case ISD::ATOMIC_LOAD_SUB:
1409 case ISD::ATOMIC_LOAD_AND:
1410 case ISD::ATOMIC_LOAD_CLR:
1411 case ISD::ATOMIC_LOAD_OR:
1412 case ISD::ATOMIC_LOAD_XOR:
1413 case ISD::ATOMIC_LOAD_NAND:
1414 case ISD::ATOMIC_LOAD_MIN:
1415 case ISD::ATOMIC_LOAD_MAX:
1416 case ISD::ATOMIC_LOAD_UMIN:
1417 case ISD::ATOMIC_LOAD_UMAX:
1418 case ISD::ATOMIC_LOAD_FADD:
1419 case ISD::ATOMIC_LOAD_FSUB:
1420 case ISD::ATOMIC_LOAD_FMAX:
1421 case ISD::ATOMIC_LOAD_FMIN:
1422 case ISD::ATOMIC_LOAD_UINC_WRAP:
1423 case ISD::ATOMIC_LOAD_UDEC_WRAP:
1424 case ISD::ATOMIC_LOAD:
1425 case ISD::ATOMIC_STORE:
1426 case ISD::MLOAD:
1427 case ISD::MSTORE:
1428 case ISD::MGATHER:
1429 case ISD::MSCATTER:
1430 case ISD::VP_LOAD:
1431 case ISD::VP_STORE:
1432 case ISD::VP_GATHER:
1433 case ISD::VP_SCATTER:
1434 case ISD::EXPERIMENTAL_VP_STRIDED_LOAD:
1435 case ISD::EXPERIMENTAL_VP_STRIDED_STORE:
1436 return true;
1437 default:
1438 return N->isMemIntrinsic() || N->isTargetMemoryOpcode();
1439 }
1440 }
1441};
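A hedged sketch of how this interface is usually queried from target code (N is assumed to be a node under inspection; the helper name is made up for illustration):

// Illustrative fragment: accept only memory accesses that are neither
// atomic, volatile, nor non-temporal.
static bool isPlainMemoryAccess(const SDNode *N) {
  const auto *Mem = dyn_cast<MemSDNode>(N);
  if (!Mem)
    return false;
  return Mem->isSimple() && !Mem->isNonTemporal();
}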
1442
1443/// This is an SDNode representing atomic operations.
1444class AtomicSDNode : public MemSDNode {
1445public:
1446 AtomicSDNode(unsigned Opc, unsigned Order, const DebugLoc &dl, SDVTList VTL,
1447 EVT MemVT, MachineMemOperand *MMO)
1448 : MemSDNode(Opc, Order, dl, VTL, MemVT, MMO) {
1449 assert(((Opc != ISD::ATOMIC_LOAD && Opc != ISD::ATOMIC_STORE) ||
1450 MMO->isAtomic()) && "then why are we using an AtomicSDNode?");
1451 }
1452
1453 const SDValue &getBasePtr() const { return getOperand(1); }
1454 const SDValue &getVal() const { return getOperand(2); }
1455
1456 /// Returns true if this SDNode represents cmpxchg atomic operation, false
1457 /// otherwise.
1458 bool isCompareAndSwap() const {
1459 unsigned Op = getOpcode();
1460 return Op == ISD::ATOMIC_CMP_SWAP ||
1461 Op == ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS;
1462 }
1463
1464 /// For cmpxchg atomic operations, return the atomic ordering requirements
1465 /// when store does not occur.
1466 AtomicOrdering getFailureOrdering() const {
1467 assert(isCompareAndSwap() && "Must be cmpxchg operation");
1468 return MMO->getFailureOrdering();
1469 }
1470
1471 // Methods to support isa and dyn_cast
1472 static bool classof(const SDNode *N) {
1473 return N->getOpcode() == ISD::ATOMIC_CMP_SWAP ||
1474 N->getOpcode() == ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS ||
1475 N->getOpcode() == ISD::ATOMIC_SWAP ||
1476 N->getOpcode() == ISD::ATOMIC_LOAD_ADD ||
1477 N->getOpcode() == ISD::ATOMIC_LOAD_SUB ||
1478 N->getOpcode() == ISD::ATOMIC_LOAD_AND ||
1479 N->getOpcode() == ISD::ATOMIC_LOAD_CLR ||
1480 N->getOpcode() == ISD::ATOMIC_LOAD_OR ||
1481 N->getOpcode() == ISD::ATOMIC_LOAD_XOR ||
1482 N->getOpcode() == ISD::ATOMIC_LOAD_NAND ||
1483 N->getOpcode() == ISD::ATOMIC_LOAD_MIN ||
1484 N->getOpcode() == ISD::ATOMIC_LOAD_MAX ||
1485 N->getOpcode() == ISD::ATOMIC_LOAD_UMIN ||
1486 N->getOpcode() == ISD::ATOMIC_LOAD_UMAX ||
1487 N->getOpcode() == ISD::ATOMIC_LOAD_FADD ||
1488 N->getOpcode() == ISD::ATOMIC_LOAD_FSUB ||
1489 N->getOpcode() == ISD::ATOMIC_LOAD_FMAX ||
1490 N->getOpcode() == ISD::ATOMIC_LOAD_FMIN ||
1491 N->getOpcode() == ISD::ATOMIC_LOAD_UINC_WRAP ||
1492 N->getOpcode() == ISD::ATOMIC_LOAD_UDEC_WRAP ||
1493 N->getOpcode() == ISD::ATOMIC_LOAD ||
1494 N->getOpcode() == ISD::ATOMIC_STORE;
1495 }
1496};
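A small sketch built only from the accessors above (the helper name is illustrative): getFailureOrdering() asserts isCompareAndSwap(), so callers that handle arbitrary atomics guard it.

// Illustrative fragment: pick the relevant ordering without tripping the
// cmpxchg-only assertion in getFailureOrdering().
static AtomicOrdering getFailureOrSuccessOrdering(const AtomicSDNode *A) {
  return A->isCompareAndSwap() ? A->getFailureOrdering()
                               : A->getSuccessOrdering();
}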
1497
1498/// This SDNode is used for target intrinsics that touch
1499/// memory and need an associated MachineMemOperand. Its opcode may be
1500/// INTRINSIC_VOID, INTRINSIC_W_CHAIN, PREFETCH, or a target-specific opcode
1501/// with a value not less than FIRST_TARGET_MEMORY_OPCODE.
1502class MemIntrinsicSDNode : public MemSDNode {
1503public:
1504 MemIntrinsicSDNode(unsigned Opc, unsigned Order, const DebugLoc &dl,
1505 SDVTList VTs, EVT MemoryVT, MachineMemOperand *MMO)
1506 : MemSDNode(Opc, Order, dl, VTs, MemoryVT, MMO) {
1507 SDNodeBits.IsMemIntrinsic = true;
1508 }
1509
1510 // Methods to support isa and dyn_cast
1511 static bool classof(const SDNode *N) {
1512 // We lower some target intrinsics to their target opcode early, so a node
1513 // with a target opcode can be of this class.
1514 return N->isMemIntrinsic() ||
1515 N->getOpcode() == ISD::PREFETCH ||
1516 N->isTargetMemoryOpcode();
1517 }
1518};
1519
1520/// This SDNode is used to implement the code generator
1521/// support for the llvm IR shufflevector instruction. It combines elements
1522/// from two input vectors into a new input vector, with the selection and
1523/// ordering of elements determined by an array of integers, referred to as
1524/// the shuffle mask. For input vectors of width N, mask indices of 0..N-1
1525/// refer to elements from the LHS input, and indices from N to 2N-1 the RHS.
1526/// An index of -1 is treated as undef, such that the code generator may put
1527/// any value in the corresponding element of the result.
1528class ShuffleVectorSDNode : public SDNode {
1529 // The memory for Mask is owned by the SelectionDAG's OperandAllocator, and
1530 // is freed when the SelectionDAG object is destroyed.
1531 const int *Mask;
1532
1533protected:
1534 friend class SelectionDAG;
1535
1536 ShuffleVectorSDNode(EVT VT, unsigned Order, const DebugLoc &dl, const int *M)
1537 : SDNode(ISD::VECTOR_SHUFFLE, Order, dl, getSDVTList(VT)), Mask(M) {}
1538
1539public:
1540 ArrayRef<int> getMask() const {
1541 EVT VT = getValueType(0);
1542 return ArrayRef(Mask, VT.getVectorNumElements());
1543 }
1544
1545 int getMaskElt(unsigned Idx) const {
1546 assert(Idx < getValueType(0).getVectorNumElements() && "Idx out of range!");
1547 return Mask[Idx];
1548 }
1549
1550 bool isSplat() const { return isSplatMask(Mask, getValueType(0)); }
1551
1552 int getSplatIndex() const {
1553 assert(isSplat() && "Cannot get splat index for non-splat!");
1554 EVT VT = getValueType(0);
1555 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
1556 if (Mask[i] >= 0)
1557 return Mask[i];
1558
1559 // We can choose any index value here and be correct because all elements
1560 // are undefined. Return 0 for better potential for callers to simplify.
1561 return 0;
1562 }
1563
1564 static bool isSplatMask(const int *Mask, EVT VT);
1565
1566 /// Change values in a shuffle permute mask assuming
1567 /// the two vector operands have swapped position.
1568 static void commuteMask(MutableArrayRef<int> Mask) {
1569 unsigned NumElems = Mask.size();
1570 for (unsigned i = 0; i != NumElems; ++i) {
1571 int idx = Mask[i];
1572 if (idx < 0)
1573 continue;
1574 else if (idx < (int)NumElems)
1575 Mask[i] = idx + NumElems;
1576 else
1577 Mask[i] = idx - NumElems;
1578 }
1579 }
1580
1581 static bool classof(const SDNode *N) {
1582 return N->getOpcode() == ISD::VECTOR_SHUFFLE;
1583 }
1584};
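A minimal sketch of commuteMask() in use (V0, V1 and Mask are assumed to describe a shuffle being rebuilt; the helper is hypothetical): swapping the two inputs requires the mask remapping done here.

// Illustrative fragment: swap the shuffle inputs and remap the mask so that
// indices referring to the old LHS now refer to the new RHS and vice versa.
static void commuteShuffle(SDValue &V0, SDValue &V1,
                           SmallVectorImpl<int> &Mask) {
  std::swap(V0, V1);
  ShuffleVectorSDNode::commuteMask(Mask);
}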
1585
1586class ConstantSDNode : public SDNode {
1587 friend class SelectionDAG;
1588
1589 const ConstantInt *Value;
1590
1591 ConstantSDNode(bool isTarget, bool isOpaque, const ConstantInt *val, EVT VT)
1592 : SDNode(isTarget ? ISD::TargetConstant : ISD::Constant, 0, DebugLoc(),
1593 getSDVTList(VT)),
1594 Value(val) {
1595 ConstantSDNodeBits.IsOpaque = isOpaque;
1596 }
1597
1598public:
1599 const ConstantInt *getConstantIntValue() const { return Value; }
1600 const APInt &getAPIntValue() const { return Value->getValue(); }
1601 uint64_t getZExtValue() const { return Value->getZExtValue(); }
1602 int64_t getSExtValue() const { return Value->getSExtValue(); }
1603 uint64_t getLimitedValue(uint64_t Limit = UINT64_MAX) {
1604 return Value->getLimitedValue(Limit);
1605 }
1606 MaybeAlign getMaybeAlignValue() const { return Value->getMaybeAlignValue(); }
1607 Align getAlignValue() const { return Value->getAlignValue(); }
1608
1609 bool isOne() const { return Value->isOne(); }
1610 bool isZero() const { return Value->isZero(); }
1611 LLVM_DEPRECATED("use isZero instead", "isZero")
1612 bool isNullValue() const { return isZero(); }
1613 bool isAllOnes() const { return Value->isMinusOne(); }
1614 LLVM_DEPRECATED("use isAllOnes instead", "isAllOnes")
1615 bool isAllOnesValue() const { return isAllOnes(); }
1616 bool isMaxSignedValue() const { return Value->isMaxValue(true); }
1617 bool isMinSignedValue() const { return Value->isMinValue(true); }
1618
1619 bool isOpaque() const { return ConstantSDNodeBits.IsOpaque; }
1620
1621 static bool classof(const SDNode *N) {
1622 return N->getOpcode() == ISD::Constant ||
1623 N->getOpcode() == ISD::TargetConstant;
1624 }
1625};
1626
1627uint64_t SDNode::getConstantOperandVal(unsigned Num) const {
1628 return cast<ConstantSDNode>(getOperand(Num))->getZExtValue();
1629}
1630
1631const APInt &SDNode::getConstantOperandAPInt(unsigned Num) const {
1632 return cast<ConstantSDNode>(getOperand(Num))->getAPIntValue();
1633}
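These accessors cast<> unconditionally, so they assert when the operand is not a ConstantSDNode. A hedged sketch of the safer pattern for operands that merely might be constant (the helper name is made up):

// Illustrative fragment: probe an operand for a constant immediate.
static bool matchImmOperand(const SDNode *N, unsigned OpNo, uint64_t &Imm) {
  if (const auto *C = dyn_cast<ConstantSDNode>(N->getOperand(OpNo))) {
    Imm = C->getZExtValue();
    return true;
  }
  return false;
}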
1634
1635class ConstantFPSDNode : public SDNode {
1636 friend class SelectionDAG;
1637
1638 const ConstantFP *Value;
1639
1640 ConstantFPSDNode(bool isTarget, const ConstantFP *val, EVT VT)
1641 : SDNode(isTarget ? ISD::TargetConstantFP : ISD::ConstantFP, 0,
1642 DebugLoc(), getSDVTList(VT)),
1643 Value(val) {}
1644
1645public:
1646 const APFloat& getValueAPF() const { return Value->getValueAPF(); }
1647 const ConstantFP *getConstantFPValue() const { return Value; }
1648
1649 /// Return true if the value is positive or negative zero.
1650 bool isZero() const { return Value->isZero(); }
1651
1652 /// Return true if the value is a NaN.
1653 bool isNaN() const { return Value->isNaN(); }
1654
1655 /// Return true if the value is an infinity
1656 bool isInfinity() const { return Value->isInfinity(); }
1657
1658 /// Return true if the value is negative.
1659 bool isNegative() const { return Value->isNegative(); }
1660
1661 /// We don't rely on operator== working on double values, as
1662 /// it returns true for things that are clearly not equal, like -0.0 and 0.0.
1663 /// As such, this method can be used to do an exact bit-for-bit comparison of
1664 /// two floating point values.
1665
1666 /// We leave the version with the double argument here because it's just so
1667 /// convenient to write "2.0" and the like. Without this function we'd
1668 /// have to duplicate its logic everywhere it's called.
1669 bool isExactlyValue(double V) const {
1670 return Value->getValueAPF().isExactlyValue(V);
1671 }
1672 bool isExactlyValue(const APFloat& V) const;
1673
1674 static bool isValueValidForType(EVT VT, const APFloat& Val);
1675
1676 static bool classof(const SDNode *N) {
1677 return N->getOpcode() == ISD::ConstantFP ||
1678 N->getOpcode() == ISD::TargetConstantFP;
1679 }
1680};
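A brief sketch of the double overload of isExactlyValue() described above (the helper is illustrative):

// Illustrative fragment: recognize the FP constant 2.0 bit-exactly.
static bool isFPConstantTwo(SDValue V) {
  const auto *CFP = dyn_cast<ConstantFPSDNode>(V);
  return CFP && CFP->isExactlyValue(2.0);
}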
1681
1682/// Returns true if \p V is a constant integer zero.
1683bool isNullConstant(SDValue V);
1684
1685/// Returns true if \p V is an FP constant with a value of positive zero.
1686bool isNullFPConstant(SDValue V);
1687
1688/// Returns true if \p V is an integer constant with all bits set.
1689bool isAllOnesConstant(SDValue V);
1690
1691/// Returns true if \p V is a constant integer one.
1692bool isOneConstant(SDValue V);
1693
1694/// Returns true if \p V is a constant min signed integer value.
1695bool isMinSignedConstant(SDValue V);
1696
1697/// Returns true if \p V is a neutral element of Opc with Flags.
1698/// When OperandNo is 0, it checks that V is a left identity. Otherwise, it
1699/// checks that V is a right identity.
1700bool isNeutralConstant(unsigned Opc, SDNodeFlags Flags, SDValue V,
1701 unsigned OperandNo);
1702
1703/// Return the non-bitcasted source operand of \p V if it exists.
1704/// If \p V is not a bitcasted value, it is returned as-is.
1705SDValue peekThroughBitcasts(SDValue V);
1706
1707/// Return the non-bitcasted and one-use source operand of \p V if it exists.
1708/// If \p V is not a bitcasted one-use value, it is returned as-is.
1709SDValue peekThroughOneUseBitcasts(SDValue V);
1710
1711/// Return the non-extracted vector source operand of \p V if it exists.
1712/// If \p V is not an extracted subvector, it is returned as-is.
1713SDValue peekThroughExtractSubvectors(SDValue V);
1714
1715/// Return the non-truncated source operand of \p V if it exists.
1716/// If \p V is not a truncation, it is returned as-is.
1717SDValue peekThroughTruncates(SDValue V);
1718
1719/// Returns true if \p V is a bitwise not operation. Assumes that an all ones
1720/// constant is canonicalized to be operand 1.
1721bool isBitwiseNot(SDValue V, bool AllowUndefs = false);
1722
1723/// If \p V is a bitwise not, returns the inverted operand. Otherwise returns
1724/// an empty SDValue. Only bits set in \p Mask are required to be inverted,
1725/// other bits may be arbitrary.
1726SDValue getBitwiseNotOperand(SDValue V, SDValue Mask, bool AllowUndefs);
1727
1728/// Returns the SDNode if it is a constant splat BuildVector or constant int.
1729ConstantSDNode *isConstOrConstSplat(SDValue N, bool AllowUndefs = false,
1730 bool AllowTruncation = false);
1731
1732/// Returns the SDNode if it is a demanded constant splat BuildVector or
1733/// constant int.
1734ConstantSDNode *isConstOrConstSplat(SDValue N, const APInt &DemandedElts,
1735 bool AllowUndefs = false,
1736 bool AllowTruncation = false);
1737
1738/// Returns the SDNode if it is a constant splat BuildVector or constant float.
1739ConstantFPSDNode *isConstOrConstSplatFP(SDValue N, bool AllowUndefs = false);
1740
1741/// Returns the SDNode if it is a demanded constant splat BuildVector or
1742/// constant float.
1743ConstantFPSDNode *isConstOrConstSplatFP(SDValue N, const APInt &DemandedElts,
1744 bool AllowUndefs = false);
1745
1746/// Return true if the value is a constant 0 integer or a splatted vector of
1747/// a constant 0 integer (with no undefs by default).
1748/// Build vector implicit truncation is not an issue for null values.
1749bool isNullOrNullSplat(SDValue V, bool AllowUndefs = false);
1750
1751/// Return true if the value is a constant 1 integer or a splatted vector of a
1752/// constant 1 integer (with no undefs).
1753/// Build vector implicit truncation is allowed, but the truncated bits need to
1754/// be zero.
1755bool isOneOrOneSplat(SDValue V, bool AllowUndefs = false);
1756
1757/// Return true if the value is a constant -1 integer or a splatted vector of a
1758/// constant -1 integer (with no undefs).
1759/// Does not permit build vector implicit truncation.
1760bool isAllOnesOrAllOnesSplat(SDValue V, bool AllowUndefs = false);
1761
1762/// Return true if \p V is either a integer or FP constant.
1763inline bool isIntOrFPConstant(SDValue V) {
1764 return isa<ConstantSDNode>(V) || isa<ConstantFPSDNode>(V);
1765}
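As a small sketch of how these predicates compose in practice (the helper name is hypothetical), an additive-identity check might combine the scalar and splat forms:

// Illustrative fragment: treat both a scalar zero and a splat-of-zero vector
// as an additive identity.
static bool isAdditiveIdentity(SDValue Op) {
  return isNullConstant(Op) || isNullOrNullSplat(Op);
}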
1766
1767class GlobalAddressSDNode : public SDNode {
1768 friend class SelectionDAG;
1769
1770 const GlobalValue *TheGlobal;
1771 int64_t Offset;
1772 unsigned TargetFlags;
1773
1774 GlobalAddressSDNode(unsigned Opc, unsigned Order, const DebugLoc &DL,
1775 const GlobalValue *GA, EVT VT, int64_t o,
1776 unsigned TF);
1777
1778public:
1779 const GlobalValue *getGlobal() const { return TheGlobal; }
1780 int64_t getOffset() const { return Offset; }
1781 unsigned getTargetFlags() const { return TargetFlags; }
1782 // Return the address space this GlobalAddress belongs to.
1783 unsigned getAddressSpace() const;
1784
1785 static bool classof(const SDNode *N) {
1786 return N->getOpcode() == ISD::GlobalAddress ||
1787 N->getOpcode() == ISD::TargetGlobalAddress ||
1788 N->getOpcode() == ISD::GlobalTLSAddress ||
1789 N->getOpcode() == ISD::TargetGlobalTLSAddress;
1790 }
1791};
1792
1793class FrameIndexSDNode : public SDNode {
1794 friend class SelectionDAG;
1795
1796 int FI;
1797
1798 FrameIndexSDNode(int fi, EVT VT, bool isTarg)
1799 : SDNode(isTarg ? ISD::TargetFrameIndex : ISD::FrameIndex,
1800 0, DebugLoc(), getSDVTList(VT)), FI(fi) {
1801 }
1802
1803public:
1804 int getIndex() const { return FI; }
1805
1806 static bool classof(const SDNode *N) {
1807 return N->getOpcode() == ISD::FrameIndex ||
1808 N->getOpcode() == ISD::TargetFrameIndex;
1809 }
1810};
1811
1812/// This SDNode is used for LIFETIME_START/LIFETIME_END values, which indicate
1813/// the offset and size that are started/ended in the underlying FrameIndex.
1814class LifetimeSDNode : public SDNode {
1815 friend class SelectionDAG;
1816 int64_t Size;
1817 int64_t Offset; // -1 if offset is unknown.
1818
1819 LifetimeSDNode(unsigned Opcode, unsigned Order, const DebugLoc &dl,
1820 SDVTList VTs, int64_t Size, int64_t Offset)
1821 : SDNode(Opcode, Order, dl, VTs), Size(Size), Offset(Offset) {}
1822public:
1823 int64_t getFrameIndex() const {
1824 return cast<FrameIndexSDNode>(getOperand(1))->getIndex();
1825 }
1826
1827 bool hasOffset() const { return Offset >= 0; }
1828 int64_t getOffset() const {
1829 assert(hasOffset() && "offset is unknown");
1830 return Offset;
1831 }
1832 int64_t getSize() const {
1833 assert(hasOffset() && "offset is unknown");
1834 return Size;
1835 }
1836
1837 // Methods to support isa and dyn_cast
1838 static bool classof(const SDNode *N) {
1839 return N->getOpcode() == ISD::LIFETIME_START ||
1840 N->getOpcode() == ISD::LIFETIME_END;
1841 }
1842};
1843
1844/// This SDNode is used for PSEUDO_PROBE values, which are the function guid and
1845/// the index of the basic block being probed. A pseudo probe serves as a
1846/// placeholder and will be removed at the end of compilation. It has no
1847/// operands because we do not want instruction selection to deal with them.
1848class PseudoProbeSDNode : public SDNode {
1849 friend class SelectionDAG;
1850 uint64_t Guid;
1851 uint64_t Index;
1852 uint32_t Attributes;
1853
1854 PseudoProbeSDNode(unsigned Opcode, unsigned Order, const DebugLoc &Dl,
1855 SDVTList VTs, uint64_t Guid, uint64_t Index, uint32_t Attr)
1856 : SDNode(Opcode, Order, Dl, VTs), Guid(Guid), Index(Index),
1857 Attributes(Attr) {}
1858
1859public:
1860 uint64_t getGuid() const { return Guid; }
1861 uint64_t getIndex() const { return Index; }
1862 uint32_t getAttributes() const { return Attributes; }
1863
1864 // Methods to support isa and dyn_cast
1865 static bool classof(const SDNode *N) {
1866 return N->getOpcode() == ISD::PSEUDO_PROBE;
1867 }
1868};
1869
1870class JumpTableSDNode : public SDNode {
1871 friend class SelectionDAG;
1872
1873 int JTI;
1874 unsigned TargetFlags;
1875
1876 JumpTableSDNode(int jti, EVT VT, bool isTarg, unsigned TF)
1877 : SDNode(isTarg ? ISD::TargetJumpTable : ISD::JumpTable,
1878 0, DebugLoc(), getSDVTList(VT)), JTI(jti), TargetFlags(TF) {
1879 }
1880
1881public:
1882 int getIndex() const { return JTI; }
1883 unsigned getTargetFlags() const { return TargetFlags; }
1884
1885 static bool classof(const SDNode *N) {
1886 return N->getOpcode() == ISD::JumpTable ||
1887 N->getOpcode() == ISD::TargetJumpTable;
1888 }
1889};
1890
1891class ConstantPoolSDNode : public SDNode {
1892 friend class SelectionDAG;
1893
1894 union {
1895 const Constant *ConstVal;
1896 MachineConstantPoolValue *MachineCPVal;
1897 } Val;
1898 int Offset; // It's a MachineConstantPoolValue if top bit is set.
1899 Align Alignment; // Minimum alignment requirement of CP.
1900 unsigned TargetFlags;
1901
1902 ConstantPoolSDNode(bool isTarget, const Constant *c, EVT VT, int o,
1903 Align Alignment, unsigned TF)
1904 : SDNode(isTarget ? ISD::TargetConstantPool : ISD::ConstantPool, 0,
1905 DebugLoc(), getSDVTList(VT)),
1906 Offset(o), Alignment(Alignment), TargetFlags(TF) {
1907 assert(Offset >= 0 && "Offset is too large");
1908 Val.ConstVal = c;
1909 }
1910
1911 ConstantPoolSDNode(bool isTarget, MachineConstantPoolValue *v, EVT VT, int o,
1912 Align Alignment, unsigned TF)
1913 : SDNode(isTarget ? ISD::TargetConstantPool : ISD::ConstantPool, 0,
1914 DebugLoc(), getSDVTList(VT)),
1915 Offset(o), Alignment(Alignment), TargetFlags(TF) {
1916 assert(Offset >= 0 && "Offset is too large");
1917 Val.MachineCPVal = v;
1918 Offset |= 1 << (sizeof(unsigned)*CHAR_BIT-1);
1919 }
1920
1921public:
1922 bool isMachineConstantPoolEntry() const {
1923 return Offset < 0;
1924 }
1925
1926 const Constant *getConstVal() const {
1927 assert(!isMachineConstantPoolEntry() && "Wrong constantpool type");
1928 return Val.ConstVal;
1929 }
1930
1931 MachineConstantPoolValue *getMachineCPVal() const {
1932 assert(isMachineConstantPoolEntry() && "Wrong constantpool type");
1933 return Val.MachineCPVal;
1934 }
1935
1936 int getOffset() const {
1937 return Offset & ~(1 << (sizeof(unsigned)*CHAR_BIT-1));
1938 }
1939
1940 // Return the alignment of this constant pool object, which is either 0 (for
1941 // default alignment) or the desired value.
1942 Align getAlign() const { return Alignment; }
1943 unsigned getTargetFlags() const { return TargetFlags; }
1944
1945 Type *getType() const;
1946
1947 static bool classof(const SDNode *N) {
1948 return N->getOpcode() == ISD::ConstantPool ||
1949 N->getOpcode() == ISD::TargetConstantPool;
1950 }
1951};
1952
1953/// Completely target-dependent object reference.
1954class TargetIndexSDNode : public SDNode {
1955 friend class SelectionDAG;
1956
1957 unsigned TargetFlags;
1958 int Index;
1959 int64_t Offset;
1960
1961public:
1962 TargetIndexSDNode(int Idx, EVT VT, int64_t Ofs, unsigned TF)
1963 : SDNode(ISD::TargetIndex, 0, DebugLoc(), getSDVTList(VT)),
1964 TargetFlags(TF), Index(Idx), Offset(Ofs) {}
1965
1966 unsigned getTargetFlags() const { return TargetFlags; }
1967 int getIndex() const { return Index; }
1968 int64_t getOffset() const { return Offset; }
1969
1970 static bool classof(const SDNode *N) {
1971 return N->getOpcode() == ISD::TargetIndex;
1972 }
1973};
1974
1975class BasicBlockSDNode : public SDNode {
1976 friend class SelectionDAG;
1977
1978 MachineBasicBlock *MBB;
1979
1980 /// Debug info is meaningful and potentially useful here, but we create
1981 /// blocks out of order when they're jumped to, which makes it a bit
1982 /// harder. Let's see if we need it first.
1983 explicit BasicBlockSDNode(MachineBasicBlock *mbb)
1984 : SDNode(ISD::BasicBlock, 0, DebugLoc(), getSDVTList(MVT::Other)), MBB(mbb)
1985 {}
1986
1987public:
1988 MachineBasicBlock *getBasicBlock() const { return MBB; }
1989
1990 static bool classof(const SDNode *N) {
1991 return N->getOpcode() == ISD::BasicBlock;
1992 }
1993};
1994
1995/// A "pseudo-class" with methods for operating on BUILD_VECTORs.
1996class BuildVectorSDNode : public SDNode {
1997public:
1998 // These are constructed as SDNodes and then cast to BuildVectorSDNodes.
1999 explicit BuildVectorSDNode() = delete;
2000
2001 /// Check if this is a constant splat, and if so, find the
2002 /// smallest element size that splats the vector. If MinSplatBits is
2003 /// nonzero, the element size must be at least that large. Note that the
2004 /// splat element may be the entire vector (i.e., a one element vector).
2005 /// Returns the splat element value in SplatValue. Any undefined bits in
2006 /// that value are zero, and the corresponding bits in the SplatUndef mask
2007 /// are set. The SplatBitSize value is set to the splat element size in
2008 /// bits. HasAnyUndefs is set to true if any bits in the vector are
2009 /// undefined. isBigEndian describes the endianness of the target.
2010 bool isConstantSplat(APInt &SplatValue, APInt &SplatUndef,
2011 unsigned &SplatBitSize, bool &HasAnyUndefs,
2012 unsigned MinSplatBits = 0,
2013 bool isBigEndian = false) const;
2014
2015 /// Returns the demanded splatted value or a null value if this is not a
2016 /// splat.
2017 ///
2018 /// The DemandedElts mask indicates the elements that must be in the splat.
2019 /// If passed a non-null UndefElements bitvector, it will resize it to match
2020 /// the vector width and set the bits where elements are undef.
2021 SDValue getSplatValue(const APInt &DemandedElts,
2022 BitVector *UndefElements = nullptr) const;
2023
2024 /// Returns the splatted value or a null value if this is not a splat.
2025 ///
2026 /// If passed a non-null UndefElements bitvector, it will resize it to match
2027 /// the vector width and set the bits where elements are undef.
2028 SDValue getSplatValue(BitVector *UndefElements = nullptr) const;
2029
2030 /// Find the shortest repeating sequence of values in the build vector.
2031 ///
2032 /// e.g. { u, X, u, X, u, u, X, u } -> { X }
2033 /// { X, Y, u, Y, u, u, X, u } -> { X, Y }
2034 ///
2035 /// Currently this must be a power-of-2 build vector.
2036 /// The DemandedElts mask indicates the elements that must be present,
2037 /// undemanded elements in Sequence may be null (SDValue()). If passed a
2038 /// non-null UndefElements bitvector, it will resize it to match the original
2039 /// vector width and set the bits where elements are undef. If result is
2040 /// false, Sequence will be empty.
2041 bool getRepeatedSequence(const APInt &DemandedElts,
2042 SmallVectorImpl<SDValue> &Sequence,
2043 BitVector *UndefElements = nullptr) const;
2044
2045 /// Find the shortest repeating sequence of values in the build vector.
2046 ///
2047 /// e.g. { u, X, u, X, u, u, X, u } -> { X }
2048 /// { X, Y, u, Y, u, u, X, u } -> { X, Y }
2049 ///
2050 /// Currently this must be a power-of-2 build vector.
2051 /// If passed a non-null UndefElements bitvector, it will resize it to match
2052 /// the original vector width and set the bits where elements are undef.
2053 /// If result is false, Sequence will be empty.
2054 bool getRepeatedSequence(SmallVectorImpl<SDValue> &Sequence,
2055 BitVector *UndefElements = nullptr) const;
2056
2057 /// Returns the demanded splatted constant or null if this is not a constant
2058 /// splat.
2059 ///
2060 /// The DemandedElts mask indicates the elements that must be in the splat.
2061 /// If passed a non-null UndefElements bitvector, it will resize it to match
2062 /// the vector width and set the bits where elements are undef.
2063 ConstantSDNode *
2064 getConstantSplatNode(const APInt &DemandedElts,
2065 BitVector *UndefElements = nullptr) const;
2066
2067 /// Returns the splatted constant or null if this is not a constant
2068 /// splat.
2069 ///
2070 /// If passed a non-null UndefElements bitvector, it will resize it to match
2071 /// the vector width and set the bits where elements are undef.
2072 ConstantSDNode *
2073 getConstantSplatNode(BitVector *UndefElements = nullptr) const;
2074
2075 /// Returns the demanded splatted constant FP or null if this is not a
2076 /// constant FP splat.
2077 ///
2078 /// The DemandedElts mask indicates the elements that must be in the splat.
2079 /// If passed a non-null UndefElements bitvector, it will resize it to match
2080 /// the vector width and set the bits where elements are undef.
2081 ConstantFPSDNode *
2082 getConstantFPSplatNode(const APInt &DemandedElts,
2083 BitVector *UndefElements = nullptr) const;
2084
2085 /// Returns the splatted constant FP or null if this is not a constant
2086 /// FP splat.
2087 ///
2088 /// If passed a non-null UndefElements bitvector, it will resize it to match
2089 /// the vector width and set the bits where elements are undef.
2090 ConstantFPSDNode *
2091 getConstantFPSplatNode(BitVector *UndefElements = nullptr) const;
2092
2093 /// If this is a constant FP splat and the splatted constant FP is an
2094 /// exact power or 2, return the log base 2 integer value. Otherwise,
2095 /// return -1.
2096 ///
2097 /// The BitWidth specifies the necessary bit precision.
2098 int32_t getConstantFPSplatPow2ToLog2Int(BitVector *UndefElements,
2099 uint32_t BitWidth) const;
2100
2101 /// Extract the raw bit data from a build vector of Undef, Constant or
2102 /// ConstantFP node elements. Each raw bit element will be \p
2103 /// DstEltSizeInBits wide, undef elements are treated as zero, and entirely
2104 /// undefined elements are flagged in \p UndefElements.
2105 bool getConstantRawBits(bool IsLittleEndian, unsigned DstEltSizeInBits,
2106 SmallVectorImpl<APInt> &RawBitElements,
2107 BitVector &UndefElements) const;
2108
2109 bool isConstant() const;
2110
2111 /// If this BuildVector is constant and represents the numerical series
2112 /// "<a, a+n, a+2n, a+3n, ...>" where a is integer and n is a non-zero integer,
2113 /// the value "<a,n>" is returned.
2114 std::optional<std::pair<APInt, APInt>> isConstantSequence() const;
2115
2116 /// Recast bit data \p SrcBitElements to \p DstEltSizeInBits wide elements.
2117 /// Undef elements are treated as zero, and entirely undefined elements are
2118 /// flagged in \p DstUndefElements.
2119 static void recastRawBits(bool IsLittleEndian, unsigned DstEltSizeInBits,
2120 SmallVectorImpl<APInt> &DstBitElements,
2121 ArrayRef<APInt> SrcBitElements,
2122 BitVector &DstUndefElements,
2123 const BitVector &SrcUndefElements);
2124
2125 static bool classof(const SDNode *N) {
2126 return N->getOpcode() == ISD::BUILD_VECTOR;
2127 }
2128};
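A hedged sketch of the isConstantSplat() contract documented above (the helper is illustrative and deliberately conservative about undef lanes):

// Illustrative fragment: recover the splatted bits of a constant splat.
static bool getConstantSplatBits(const BuildVectorSDNode *BV,
                                 APInt &SplatBits) {
  APInt SplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  if (!BV->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs))
    return false;
  return !HasAnyUndefs;               // reject splats with undefined bits
}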
2129
2130/// An SDNode that holds an arbitrary LLVM IR Value. This is
2131/// used when the SelectionDAG needs to make a simple reference to something
2132/// in the LLVM IR representation.
2133///
2134class SrcValueSDNode : public SDNode {
2135 friend class SelectionDAG;
2136
2137 const Value *V;
2138
2139 /// Create a SrcValue for a general value.
2140 explicit SrcValueSDNode(const Value *v)
2141 : SDNode(ISD::SRCVALUE, 0, DebugLoc(), getSDVTList(MVT::Other)), V(v) {}
2142
2143public:
2144 /// Return the contained Value.
2145 const Value *getValue() const { return V; }
2146
2147 static bool classof(const SDNode *N) {
2148 return N->getOpcode() == ISD::SRCVALUE;
2149 }
2150};
2151
2152class MDNodeSDNode : public SDNode {
2153 friend class SelectionDAG;
2154
2155 const MDNode *MD;
2156
2157 explicit MDNodeSDNode(const MDNode *md)
2158 : SDNode(ISD::MDNODE_SDNODE, 0, DebugLoc(), getSDVTList(MVT::Other)), MD(md)
2159 {}
2160
2161public:
2162 const MDNode *getMD() const { return MD; }
2163
2164 static bool classof(const SDNode *N) {
2165 return N->getOpcode() == ISD::MDNODE_SDNODE;
2166 }
2167};
2168
2169class RegisterSDNode : public SDNode {
2170 friend class SelectionDAG;
2171
2172 Register Reg;
2173
2174 RegisterSDNode(Register reg, EVT VT)
2175 : SDNode(ISD::Register, 0, DebugLoc(), getSDVTList(VT)), Reg(reg) {}
2176
2177public:
2178 Register getReg() const { return Reg; }
2179
2180 static bool classof(const SDNode *N) {
2181 return N->getOpcode() == ISD::Register;
2182 }
2183};
2184
2185class RegisterMaskSDNode : public SDNode {
2186 friend class SelectionDAG;
2187
2188 // The memory for RegMask is not owned by the node.
2189 const uint32_t *RegMask;
2190
2191 RegisterMaskSDNode(const uint32_t *mask)
2192 : SDNode(ISD::RegisterMask, 0, DebugLoc(), getSDVTList(MVT::Untyped)),
2193 RegMask(mask) {}
2194
2195public:
2196 const uint32_t *getRegMask() const { return RegMask; }
2197
2198 static bool classof(const SDNode *N) {
2199 return N->getOpcode() == ISD::RegisterMask;
2200 }
2201};
2202
2203class BlockAddressSDNode : public SDNode {
2204 friend class SelectionDAG;
2205
2206 const BlockAddress *BA;
2207 int64_t Offset;
2208 unsigned TargetFlags;
2209
2210 BlockAddressSDNode(unsigned NodeTy, EVT VT, const BlockAddress *ba,
2211 int64_t o, unsigned Flags)
2212 : SDNode(NodeTy, 0, DebugLoc(), getSDVTList(VT)),
2213 BA(ba), Offset(o), TargetFlags(Flags) {}
2214
2215public:
2216 const BlockAddress *getBlockAddress() const { return BA; }
2217 int64_t getOffset() const { return Offset; }
2218 unsigned getTargetFlags() const { return TargetFlags; }
2219
2220 static bool classof(const SDNode *N) {
2221 return N->getOpcode() == ISD::BlockAddress ||
2222 N->getOpcode() == ISD::TargetBlockAddress;
2223 }
2224};
2225
2226class LabelSDNode : public SDNode {
2227 friend class SelectionDAG;
2228
2229 MCSymbol *Label;
2230
2231 LabelSDNode(unsigned Opcode, unsigned Order, const DebugLoc &dl, MCSymbol *L)
2232 : SDNode(Opcode, Order, dl, getSDVTList(MVT::Other)), Label(L) {
2233 assert(LabelSDNode::classof(this) && "not a label opcode");
2234 }
2235
2236public:
2237 MCSymbol *getLabel() const { return Label; }
2238
2239 static bool classof(const SDNode *N) {
2240 return N->getOpcode() == ISD::EH_LABEL ||
2241 N->getOpcode() == ISD::ANNOTATION_LABEL;
2242 }
2243};
2244
2245class ExternalSymbolSDNode : public SDNode {
2246 friend class SelectionDAG;
2247
2248 const char *Symbol;
2249 unsigned TargetFlags;
2250
2251 ExternalSymbolSDNode(bool isTarget, const char *Sym, unsigned TF, EVT VT)
2252 : SDNode(isTarget ? ISD::TargetExternalSymbol : ISD::ExternalSymbol, 0,
2253 DebugLoc(), getSDVTList(VT)),
2254 Symbol(Sym), TargetFlags(TF) {}
2255
2256public:
2257 const char *getSymbol() const { return Symbol; }
2258 unsigned getTargetFlags() const { return TargetFlags; }
2259
2260 static bool classof(const SDNode *N) {
2261 return N->getOpcode() == ISD::ExternalSymbol ||
2262 N->getOpcode() == ISD::TargetExternalSymbol;
2263 }
2264};
2265
2266class MCSymbolSDNode : public SDNode {
2267 friend class SelectionDAG;
2268
2269 MCSymbol *Symbol;
2270
2271 MCSymbolSDNode(MCSymbol *Symbol, EVT VT)
2272 : SDNode(ISD::MCSymbol, 0, DebugLoc(), getSDVTList(VT)), Symbol(Symbol) {}
2273
2274public:
2275 MCSymbol *getMCSymbol() const { return Symbol; }
2276
2277 static bool classof(const SDNode *N) {
2278 return N->getOpcode() == ISD::MCSymbol;
2279 }
2280};
2281
2282class CondCodeSDNode : public SDNode {
2283 friend class SelectionDAG;
2284
2285 ISD::CondCode Condition;
2286
2287 explicit CondCodeSDNode(ISD::CondCode Cond)
2288 : SDNode(ISD::CONDCODE, 0, DebugLoc(), getSDVTList(MVT::Other)),
2289 Condition(Cond) {}
2290
2291public:
2292 ISD::CondCode get() const { return Condition; }
2293
2294 static bool classof(const SDNode *N) {
2295 return N->getOpcode() == ISD::CONDCODE;
2296 }
2297};
2298
2299/// This class is used to represent EVT's, which are used
2300/// to parameterize some operations.
2301class VTSDNode : public SDNode {
2302 friend class SelectionDAG;
2303
2304 EVT ValueType;
2305
2306 explicit VTSDNode(EVT VT)
2307 : SDNode(ISD::VALUETYPE, 0, DebugLoc(), getSDVTList(MVT::Other)),
2308 ValueType(VT) {}
2309
2310public:
2311 EVT getVT() const { return ValueType; }
2312
2313 static bool classof(const SDNode *N) {
2314 return N->getOpcode() == ISD::VALUETYPE;
2315 }
2316};
2317
2318/// Base class for LoadSDNode and StoreSDNode
2319class LSBaseSDNode : public MemSDNode {
2320public:
2321 LSBaseSDNode(ISD::NodeType NodeTy, unsigned Order, const DebugLoc &dl,
2322 SDVTList VTs, ISD::MemIndexedMode AM, EVT MemVT,
2323 MachineMemOperand *MMO)
2324 : MemSDNode(NodeTy, Order, dl, VTs, MemVT, MMO) {
2325 LSBaseSDNodeBits.AddressingMode = AM;
2326 assert(getAddressingMode() == AM && "Value truncated");
2327 }
2328
2329 const SDValue &getOffset() const {
2330 return getOperand(getOpcode() == ISD::LOAD ? 2 : 3);
2331 }
2332
2333 /// Return the addressing mode for this load or store:
2334 /// unindexed, pre-inc, pre-dec, post-inc, or post-dec.
2335 ISD::MemIndexedMode getAddressingMode() const {
2336 return static_cast<ISD::MemIndexedMode>(LSBaseSDNodeBits.AddressingMode);
2337 }
2338
2339 /// Return true if this is a pre/post inc/dec load/store.
2340 bool isIndexed() const { return getAddressingMode() != ISD::UNINDEXED; }
2341
2342 /// Return true if this is NOT a pre/post inc/dec load/store.
2343 bool isUnindexed() const { return getAddressingMode() == ISD::UNINDEXED; }
2344
2345 static bool classof(const SDNode *N) {
2346 return N->getOpcode() == ISD::LOAD ||
2347 N->getOpcode() == ISD::STORE;
2348 }
2349};
2350
2351/// This class is used to represent ISD::LOAD nodes.
2352class LoadSDNode : public LSBaseSDNode {
2353 friend class SelectionDAG;
2354
2355 LoadSDNode(unsigned Order, const DebugLoc &dl, SDVTList VTs,
2356 ISD::MemIndexedMode AM, ISD::LoadExtType ETy, EVT MemVT,
2357 MachineMemOperand *MMO)
2358 : LSBaseSDNode(ISD::LOAD, Order, dl, VTs, AM, MemVT, MMO) {
2359 LoadSDNodeBits.ExtTy = ETy;
2360 assert(readMem() && "Load MachineMemOperand is not a load!");
2361 assert(!writeMem() && "Load MachineMemOperand is a store!");
2362 }
2363
2364public:
2365 /// Return whether this is a plain node,
2366 /// or one of the varieties of value-extending loads.
2367 ISD::LoadExtType getExtensionType() const {
2368 return static_cast<ISD::LoadExtType>(LoadSDNodeBits.ExtTy);
2369 }
2370
2371 const SDValue &getBasePtr() const { return getOperand(1); }
2372 const SDValue &getOffset() const { return getOperand(2); }
2373
2374 static bool classof(const SDNode *N) {
2375 return N->getOpcode() == ISD::LOAD;
2376 }
2377};
2378
2379/// This class is used to represent ISD::STORE nodes.
2380class StoreSDNode : public LSBaseSDNode {
2381 friend class SelectionDAG;
2382
2383 StoreSDNode(unsigned Order, const DebugLoc &dl, SDVTList VTs,
2384 ISD::MemIndexedMode AM, bool isTrunc, EVT MemVT,
2385 MachineMemOperand *MMO)
2386 : LSBaseSDNode(ISD::STORE, Order, dl, VTs, AM, MemVT, MMO) {
2387 StoreSDNodeBits.IsTruncating = isTrunc;
2388 assert(!readMem() && "Store MachineMemOperand is a load!");
2389 assert(writeMem() && "Store MachineMemOperand is not a store!");
2390 }
2391
2392public:
2393 /// Return true if the op does a truncation before store.
2394 /// For integers this is the same as doing a TRUNCATE and storing the result.
2395 /// For floats, it is the same as doing an FP_ROUND and storing the result.
2396 bool isTruncatingStore() const { return StoreSDNodeBits.IsTruncating; }
2397 void setTruncatingStore(bool Truncating) {
2398 StoreSDNodeBits.IsTruncating = Truncating;
2399 }
2400
2401 const SDValue &getValue() const { return getOperand(1); }
2402 const SDValue &getBasePtr() const { return getOperand(2); }
2403 const SDValue &getOffset() const { return getOperand(3); }
2404
2405 static bool classof(const SDNode *N) {
2406 return N->getOpcode() == ISD::STORE;
2407 }
2408};
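A short sketch of the usual precondition checks before rewriting these nodes (the helper names are made up for illustration):

// Illustrative fragments: the common "plain, unindexed" guards.
static bool isPlainLoad(const SDNode *N) {
  const auto *Ld = dyn_cast<LoadSDNode>(N);
  return Ld && Ld->isSimple() && Ld->isUnindexed() &&
         Ld->getExtensionType() == ISD::NON_EXTLOAD;
}

static bool isPlainStore(const SDNode *N) {
  const auto *St = dyn_cast<StoreSDNode>(N);
  return St && St->isSimple() && St->isUnindexed() && !St->isTruncatingStore();
}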
2409
2410/// This base class is used to represent VP_LOAD, VP_STORE,
2411/// EXPERIMENTAL_VP_STRIDED_LOAD and EXPERIMENTAL_VP_STRIDED_STORE nodes
2412class VPBaseLoadStoreSDNode : public MemSDNode {
2413public:
2414 friend class SelectionDAG;
2415
2416 VPBaseLoadStoreSDNode(ISD::NodeType NodeTy, unsigned Order,
2417 const DebugLoc &DL, SDVTList VTs,
2418 ISD::MemIndexedMode AM, EVT MemVT,
2419 MachineMemOperand *MMO)
2420 : MemSDNode(NodeTy, Order, DL, VTs, MemVT, MMO) {
2421 LSBaseSDNodeBits.AddressingMode = AM;
2422 assert(getAddressingMode() == AM && "Value truncated");
2423 }
2424
2425 // VPStridedStoreSDNode (Chain, Data, Ptr, Offset, Stride, Mask, EVL)
2426 // VPStoreSDNode (Chain, Data, Ptr, Offset, Mask, EVL)
2427 // VPStridedLoadSDNode (Chain, Ptr, Offset, Stride, Mask, EVL)
2428 // VPLoadSDNode (Chain, Ptr, Offset, Mask, EVL)
2429 // Mask is a vector of i1 elements;
2430 // the type of EVL is TLI.getVPExplicitVectorLengthTy().
2431 const SDValue &getOffset() const {
2432 return getOperand((getOpcode() == ISD::EXPERIMENTAL_VP_STRIDED_LOAD ||
2433 getOpcode() == ISD::VP_LOAD)
2434 ? 2
2435 : 3);
2436 }
2437 const SDValue &getBasePtr() const {
2438 return getOperand((getOpcode() == ISD::EXPERIMENTAL_VP_STRIDED_LOAD ||
2439 getOpcode() == ISD::VP_LOAD)
2440 ? 1
2441 : 2);
2442 }
2443 const SDValue &getMask() const {
2444 switch (getOpcode()) {
2445 default:
2446 llvm_unreachable("Invalid opcode")::llvm::llvm_unreachable_internal("Invalid opcode", "llvm/include/llvm/CodeGen/SelectionDAGNodes.h"
, 2446)
;
2447 case ISD::VP_LOAD:
2448 return getOperand(3);
2449 case ISD::VP_STORE:
2450 case ISD::EXPERIMENTAL_VP_STRIDED_LOAD:
2451 return getOperand(4);
2452 case ISD::EXPERIMENTAL_VP_STRIDED_STORE:
2453 return getOperand(5);
2454 }
2455 }
2456 const SDValue &getVectorLength() const {
2457 switch (getOpcode()) {
2458 default:
2459 llvm_unreachable("Invalid opcode")::llvm::llvm_unreachable_internal("Invalid opcode", "llvm/include/llvm/CodeGen/SelectionDAGNodes.h"
, 2459)
;
2460 case ISD::VP_LOAD:
2461 return getOperand(4);
2462 case ISD::VP_STORE:
2463 case ISD::EXPERIMENTAL_VP_STRIDED_LOAD:
2464 return getOperand(5);
2465 case ISD::EXPERIMENTAL_VP_STRIDED_STORE:
2466 return getOperand(6);
2467 }
2468 }
2469
2470 /// Return the addressing mode for this load or store:
2471 /// unindexed, pre-inc, pre-dec, post-inc, or post-dec.
2472 ISD::MemIndexedMode getAddressingMode() const {
2473 return static_cast<ISD::MemIndexedMode>(LSBaseSDNodeBits.AddressingMode);
2474 }
2475
2476 /// Return true if this is a pre/post inc/dec load/store.
2477 bool isIndexed() const { return getAddressingMode() != ISD::UNINDEXED; }
2478
2479 /// Return true if this is NOT a pre/post inc/dec load/store.
2480 bool isUnindexed() const { return getAddressingMode() == ISD::UNINDEXED; }
2481
2482 static bool classof(const SDNode *N) {
2483 return N->getOpcode() == ISD::EXPERIMENTAL_VP_STRIDED_LOAD ||
2484 N->getOpcode() == ISD::EXPERIMENTAL_VP_STRIDED_STORE ||
2485 N->getOpcode() == ISD::VP_LOAD || N->getOpcode() == ISD::VP_STORE;
2486 }
2487};
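// Illustrative sketch (not part of SelectionDAGNodes.h): the accessors above hide
// the per-opcode operand numbering, so client code can stay generic across the
// four VP load/store opcodes. The helper below is assumed code for the example
// only.
static void inspectVPMemOpSketch(const llvm::VPBaseLoadStoreSDNode *N) {
  using namespace llvm;
  // VP_LOAD keeps the pointer in operand 1 and the offset in operand 2;
  // VP_STORE shifts them to 2 and 3 because operand 1 carries the stored data.
  SDValue Ptr = N->getBasePtr();
  SDValue Off = N->getOffset();
  // Mask and EVL follow the same pattern: operands 3/4 for VP_LOAD, 4/5 for
  // VP_STORE and the strided load, and 5/6 for the strided store.
  SDValue Mask = N->getMask();
  SDValue EVL = N->getVectorLength();
  (void)Ptr; (void)Off; (void)Mask; (void)EVL;
}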
2488
2489/// This class is used to represent a VP_LOAD node
2490class VPLoadSDNode : public VPBaseLoadStoreSDNode {
2491public:
2492 friend class SelectionDAG;
2493
2494 VPLoadSDNode(unsigned Order, const DebugLoc &dl, SDVTList VTs,
2495 ISD::MemIndexedMode AM, ISD::LoadExtType ETy, bool isExpanding,
2496 EVT MemVT, MachineMemOperand *MMO)
2497 : VPBaseLoadStoreSDNode(ISD::VP_LOAD, Order, dl, VTs, AM, MemVT, MMO) {
2498 LoadSDNodeBits.ExtTy = ETy;
2499 LoadSDNodeBits.IsExpanding = isExpanding;
2500 }
2501
2502 ISD::LoadExtType getExtensionType() const {
2503 return static_cast<ISD::LoadExtType>(LoadSDNodeBits.ExtTy);
2504 }
2505
2506 const SDValue &getBasePtr() const { return getOperand(1); }
2507 const SDValue &getOffset() const { return getOperand(2); }
2508 const SDValue &getMask() const { return getOperand(3); }
2509 const SDValue &getVectorLength() const { return getOperand(4); }
2510
2511 static bool classof(const SDNode *N) {
2512 return N->getOpcode() == ISD::VP_LOAD;
2513 }
2514 bool isExpandingLoad() const { return LoadSDNodeBits.IsExpanding; }
2515};
2516
2517/// This class is used to represent an EXPERIMENTAL_VP_STRIDED_LOAD node.
2518class VPStridedLoadSDNode : public VPBaseLoadStoreSDNode {
2519public:
2520 friend class SelectionDAG;
2521
2522 VPStridedLoadSDNode(unsigned Order, const DebugLoc &DL, SDVTList VTs,
2523 ISD::MemIndexedMode AM, ISD::LoadExtType ETy,
2524 bool IsExpanding, EVT MemVT, MachineMemOperand *MMO)
2525 : VPBaseLoadStoreSDNode(ISD::EXPERIMENTAL_VP_STRIDED_LOAD, Order, DL, VTs,
2526 AM, MemVT, MMO) {
2527 LoadSDNodeBits.ExtTy = ETy;
2528 LoadSDNodeBits.IsExpanding = IsExpanding;
2529 }
2530
2531 ISD::LoadExtType getExtensionType() const {
2532 return static_cast<ISD::LoadExtType>(LoadSDNodeBits.ExtTy);
2533 }
2534
2535 const SDValue &getBasePtr() const { return getOperand(1); }
2536 const SDValue &getOffset() const { return getOperand(2); }
2537 const SDValue &getStride() const { return getOperand(3); }
2538 const SDValue &getMask() const { return getOperand(4); }
2539 const SDValue &getVectorLength() const { return getOperand(5); }
2540
2541 static bool classof(const SDNode *N) {
2542 return N->getOpcode() == ISD::EXPERIMENTAL_VP_STRIDED_LOAD;
2543 }
2544 bool isExpandingLoad() const { return LoadSDNodeBits.IsExpanding; }
2545};
2546
2547/// This class is used to represent a VP_STORE node
2548class VPStoreSDNode : public VPBaseLoadStoreSDNode {
2549public:
2550 friend class SelectionDAG;
2551
2552 VPStoreSDNode(unsigned Order, const DebugLoc &dl, SDVTList VTs,
2553 ISD::MemIndexedMode AM, bool isTrunc, bool isCompressing,
2554 EVT MemVT, MachineMemOperand *MMO)
2555 : VPBaseLoadStoreSDNode(ISD::VP_STORE, Order, dl, VTs, AM, MemVT, MMO) {
2556 StoreSDNodeBits.IsTruncating = isTrunc;
2557 StoreSDNodeBits.IsCompressing = isCompressing;
2558 }
2559
2560 /// Return true if this is a truncating store.
2561 /// For integers this is the same as doing a TRUNCATE and storing the result.
2562 /// For floats, it is the same as doing an FP_ROUND and storing the result.
2563 bool isTruncatingStore() const { return StoreSDNodeBits.IsTruncating; }
2564
2565 /// Returns true if the op does a compression to the vector before storing.
2566 /// The node contiguously stores the active elements (integers or floats)
2567 /// in src (those with their respective bit set in writemask k) to unaligned
2568 /// memory at base_addr.
2569 bool isCompressingStore() const { return StoreSDNodeBits.IsCompressing; }
2570
2571 const SDValue &getValue() const { return getOperand(1); }
2572 const SDValue &getBasePtr() const { return getOperand(2); }
2573 const SDValue &getOffset() const { return getOperand(3); }
2574 const SDValue &getMask() const { return getOperand(4); }
2575 const SDValue &getVectorLength() const { return getOperand(5); }
2576
2577 static bool classof(const SDNode *N) {
2578 return N->getOpcode() == ISD::VP_STORE;
2579 }
2580};
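// Illustrative sketch (not part of SelectionDAGNodes.h): the scalar meaning of a
// compressing store, written as a plain loop. Src, Mask, Base and NumElts are
// names assumed for the example only; for the VP form, lanes at or beyond EVL
// would also count as inactive.
template <typename T>
static void compressingStoreSemanticsSketch(const T *Src, const bool *Mask,
                                            T *Base, unsigned NumElts) {
  // Active elements are packed contiguously at the destination; inactive lanes
  // are skipped entirely rather than leaving gaps in memory.
  unsigned Out = 0;
  for (unsigned I = 0; I != NumElts; ++I)
    if (Mask[I])
      Base[Out++] = Src[I];
}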
2581
2582/// This class is used to represent an EXPERIMENTAL_VP_STRIDED_STORE node.
2583class VPStridedStoreSDNode : public VPBaseLoadStoreSDNode {
2584public:
2585 friend class SelectionDAG;
2586
2587 VPStridedStoreSDNode(unsigned Order, const DebugLoc &DL, SDVTList VTs,
2588 ISD::MemIndexedMode AM, bool IsTrunc, bool IsCompressing,
2589 EVT MemVT, MachineMemOperand *MMO)
2590 : VPBaseLoadStoreSDNode(ISD::EXPERIMENTAL_VP_STRIDED_STORE, Order, DL,
2591 VTs, AM, MemVT, MMO) {
2592 StoreSDNodeBits.IsTruncating = IsTrunc;
2593 StoreSDNodeBits.IsCompressing = IsCompressing;
2594 }
2595
2596 /// Return true if this is a truncating store.
2597 /// For integers this is the same as doing a TRUNCATE and storing the result.
2598 /// For floats, it is the same as doing an FP_ROUND and storing the result.
2599 bool isTruncatingStore() const { return StoreSDNodeBits.IsTruncating; }
2600
2601 /// Returns true if the op does a compression to the vector before storing.
2602 /// The node contiguously stores the active elements (integers or floats)
2603 /// in src (those with their respective bit set in writemask k) to unaligned
2604 /// memory at base_addr.
2605 bool isCompressingStore() const { return StoreSDNodeBits.IsCompressing; }
2606
2607 const SDValue &getValue() const { return getOperand(1); }
2608 const SDValue &getBasePtr() const { return getOperand(2); }
2609 const SDValue &getOffset() const { return getOperand(3); }
2610 const SDValue &getStride() const { return getOperand(4); }
2611 const SDValue &getMask() const { return getOperand(5); }
2612 const SDValue &getVectorLength() const { return getOperand(6); }
2613
2614 static bool classof(const SDNode *N) {
2615 return N->getOpcode() == ISD::EXPERIMENTAL_VP_STRIDED_STORE;
2616 }
2617};
2618
2619/// This base class is used to represent MLOAD and MSTORE nodes
2620class MaskedLoadStoreSDNode : public MemSDNode {
2621public:
2622 friend class SelectionDAG;
2623
2624 MaskedLoadStoreSDNode(ISD::NodeType NodeTy, unsigned Order,
2625 const DebugLoc &dl, SDVTList VTs,
2626 ISD::MemIndexedMode AM, EVT MemVT,
2627 MachineMemOperand *MMO)
2628 : MemSDNode(NodeTy, Order, dl, VTs, MemVT, MMO) {
2629 LSBaseSDNodeBits.AddressingMode = AM;
2630 assert(getAddressingMode() == AM && "Value truncated");
2631 }
2632
2633 // MaskedLoadSDNode (Chain, ptr, offset, mask, passthru)
2634 // MaskedStoreSDNode (Chain, data, ptr, offset, mask)
2635 // Mask is a vector of i1 elements
2636 const SDValue &getOffset() const {
2637 return getOperand(getOpcode() == ISD::MLOAD ? 2 : 3);
2638 }
2639 const SDValue &getMask() const {
2640 return getOperand(getOpcode() == ISD::MLOAD ? 3 : 4);
2641 }
2642
2643 /// Return the addressing mode for this load or store:
2644 /// unindexed, pre-inc, pre-dec, post-inc, or post-dec.
2645 ISD::MemIndexedMode getAddressingMode() const {
2646 return static_cast<ISD::MemIndexedMode>(LSBaseSDNodeBits.AddressingMode);
2647 }
2648
2649 /// Return true if this is a pre/post inc/dec load/store.
2650 bool isIndexed() const { return getAddressingMode() != ISD::UNINDEXED; }
2651
2652 /// Return true if this is NOT a pre/post inc/dec load/store.
2653 bool isUnindexed() const { return getAddressingMode() == ISD::UNINDEXED; }
2654
2655 static bool classof(const SDNode *N) {
2656 return N->getOpcode() == ISD::MLOAD ||
2657 N->getOpcode() == ISD::MSTORE;
2658 }
2659};
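// Illustrative sketch (not part of SelectionDAGNodes.h): the base-class accessors
// above cover offset and mask; the remaining operands differ per opcode, so a
// caller working on the base class dispatches the same way. The helper is
// assumed code for the example only.
static llvm::SDValue maskedBasePtrSketch(const llvm::MaskedLoadStoreSDNode *N) {
  using namespace llvm;
  // MaskedLoadSDNode  (Chain, ptr, offset, mask, passthru) -> ptr is operand 1.
  // MaskedStoreSDNode (Chain, data, ptr, offset, mask)     -> ptr is operand 2.
  return N->getOperand(N->getOpcode() == ISD::MLOAD ? 1 : 2);
}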
2660
2661/// This class is used to represent an MLOAD node
2662class MaskedLoadSDNode : public MaskedLoadStoreSDNode {
2663public:
2664 friend class SelectionDAG;
2665
2666 MaskedLoadSDNode(unsigned Order, const DebugLoc &dl, SDVTList VTs,
2667 ISD::MemIndexedMode AM, ISD::LoadExtType ETy,
2668 bool IsExpanding, EVT MemVT, MachineMemOperand *MMO)
2669 : MaskedLoadStoreSDNode(ISD::MLOAD, Order, dl, VTs, AM, MemVT, MMO) {
2670 LoadSDNodeBits.ExtTy = ETy;
2671 LoadSDNodeBits.IsExpanding = IsExpanding;
2672 }
2673
2674 ISD::LoadExtType getExtensionType() const {
2675 return static_cast<ISD::LoadExtType>(LoadSDNodeBits.ExtTy);
2676 }
2677
2678 const SDValue &getBasePtr() const { return getOperand(1); }
2679 const SDValue &getOffset() const { return getOperand(2); }
2680 const SDValue &getMask() const { return getOperand(3); }
2681 const SDValue &getPassThru() const { return getOperand(4); }
2682
2683 static bool classof(const SDNode *N) {
2684 return N->getOpcode() == ISD::MLOAD;
2685 }
2686
2687 bool isExpandingLoad() const { return LoadSDNodeBits.IsExpanding; }
2688};
2689
2690/// This class is used to represent an MSTORE node
2691class MaskedStoreSDNode : public MaskedLoadStoreSDNode {
2692public:
2693 friend class SelectionDAG;
2694
2695 MaskedStoreSDNode(unsigned Order, const DebugLoc &dl, SDVTList VTs,
2696 ISD::MemIndexedMode AM, bool isTrunc, bool isCompressing,
2697 EVT MemVT, MachineMemOperand *MMO)
2698 : MaskedLoadStoreSDNode(ISD::MSTORE, Order, dl, VTs, AM, MemVT, MMO) {
2699 StoreSDNodeBits.IsTruncating = isTrunc;
2700 StoreSDNodeBits.IsCompressing = isCompressing;
2701 }
2702
2703 /// Return true if the op does a truncation before store.
2704 /// For integers this is the same as doing a TRUNCATE and storing the result.
2705 /// For floats, it is the same as doing an FP_ROUND and storing the result.
2706 bool isTruncatingStore() const { return StoreSDNodeBits.IsTruncating; }
2707
2708 /// Returns true if the op does a compression to the vector before storing.
2709 /// The node contiguously stores the active elements (integers or floats)
2710 /// in src (those with their respective bit set in writemask k) to unaligned
2711 /// memory at base_addr.
2712 bool isCompressingStore() const { return StoreSDNodeBits.IsCompressing; }
2713
2714 const SDValue &getValue() const { return getOperand(1); }
2715 const SDValue &getBasePtr() const { return getOperand(2); }
2716 const SDValue &getOffset() const { return getOperand(3); }
2717 const SDValue &getMask() const { return getOperand(4); }
2718
2719 static bool classof(const SDNode *N) {
2720 return N->getOpcode() == ISD::MSTORE;
2721 }
2722};
2723
2724/// This is a base class used to represent
2725/// VP_GATHER and VP_SCATTER nodes
2726///
2727class VPGatherScatterSDNode : public MemSDNode {
2728public:
2729 friend class SelectionDAG;
2730
2731 VPGatherScatterSDNode(ISD::NodeType NodeTy, unsigned Order,
2732 const DebugLoc &dl, SDVTList VTs, EVT MemVT,
2733 MachineMemOperand *MMO, ISD::MemIndexType IndexType)
2734 : MemSDNode(NodeTy, Order, dl, VTs, MemVT, MMO) {
2735 LSBaseSDNodeBits.AddressingMode = IndexType;
2736 assert(getIndexType() == IndexType && "Value truncated");
2737 }
2738
2739 /// How is Index applied to BasePtr when computing addresses.
2740 ISD::MemIndexType getIndexType() const {
2741 return static_cast<ISD::MemIndexType>(LSBaseSDNodeBits.AddressingMode);
2742 }
2743 bool isIndexScaled() const {
2744 return !cast<ConstantSDNode>(getScale())->isOne();
2745 }
2746 bool isIndexSigned() const { return isIndexTypeSigned(getIndexType()); }
2747
2748 // Operand layout of the two nodes (the accessors below resolve the indices):
2749 // VPGatherSDNode (Chain, base, index, scale, mask, vlen)
2750 // VPScatterSDNode (Chain, value, base, index, scale, mask, vlen)
2751 // Mask is a vector of i1 elements
2752 const SDValue &getBasePtr() const {
2753 return getOperand((getOpcode() == ISD::VP_GATHER) ? 1 : 2);
2754 }
2755 const SDValue &getIndex() const {
2756 return getOperand((getOpcode() == ISD::VP_GATHER) ? 2 : 3);
2757 }
2758 const SDValue &getScale() const {
2759 return getOperand((getOpcode() == ISD::VP_GATHER) ? 3 : 4);
2760 }
2761 const SDValue &getMask() const {
2762 return getOperand((getOpcode() == ISD::VP_GATHER) ? 4 : 5);
2763 }
2764 const SDValue &getVectorLength() const {
2765 return getOperand((getOpcode() == ISD::VP_GATHER) ? 5 : 6);
2766 }
2767
2768 static bool classof(const SDNode *N) {
2769 return N->getOpcode() == ISD::VP_GATHER ||
2770 N->getOpcode() == ISD::VP_SCATTER;
2771 }
2772};
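// Illustrative sketch (not part of SelectionDAGNodes.h): the per-lane address
// computation implied by the (base, index, scale) operands above, spelled out
// with plain integers. All names are assumed for the example only; for the VP
// forms, lanes at or beyond EVL are likewise inactive.
static void gatherAddressesSketch(uint64_t Base, const int64_t *Index,
                                  uint64_t Scale, const bool *Mask,
                                  unsigned NumLanes, uint64_t *Addrs) {
  // Each active lane loads from (or stores to) Base + Index[I] * Scale bytes.
  for (unsigned I = 0; I != NumLanes; ++I)
    if (Mask[I])
      Addrs[I] = Base + static_cast<uint64_t>(Index[I]) * Scale;
}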
2773
2774/// This class is used to represent a VP_GATHER node
2775///
2776class VPGatherSDNode : public VPGatherScatterSDNode {
2777public:
2778 friend class SelectionDAG;
2779
2780 VPGatherSDNode(unsigned Order, const DebugLoc &dl, SDVTList VTs, EVT MemVT,
2781 MachineMemOperand *MMO, ISD::MemIndexType IndexType)
2782 : VPGatherScatterSDNode(ISD::VP_GATHER, Order, dl, VTs, MemVT, MMO,
2783 IndexType) {}
2784
2785 static bool classof(const SDNode *N) {
2786 return N->getOpcode() == ISD::VP_GATHER;
2787 }
2788};
2789
2790/// This class is used to represent a VP_SCATTER node
2791///
2792class VPScatterSDNode : public VPGatherScatterSDNode {
2793public:
2794 friend class SelectionDAG;
2795
2796 VPScatterSDNode(unsigned Order, const DebugLoc &dl, SDVTList VTs, EVT MemVT,
2797 MachineMemOperand *MMO, ISD::MemIndexType IndexType)
2798 : VPGatherScatterSDNode(ISD::VP_SCATTER, Order, dl, VTs, MemVT, MMO,
2799 IndexType) {}
2800
2801 const SDValue &getValue() const { return getOperand(1); }
2802
2803 static bool classof(const SDNode *N) {
2804 return N->getOpcode() == ISD::VP_SCATTER;
2805 }
2806};
2807
2808/// This is a base class used to represent
2809/// MGATHER and MSCATTER nodes
2810///
2811class MaskedGatherScatterSDNode : public MemSDNode {
2812public:
2813 friend class SelectionDAG;
2814
2815 MaskedGatherScatterSDNode(ISD::NodeType NodeTy, unsigned Order,
2816 const DebugLoc &dl, SDVTList VTs, EVT MemVT,
2817 MachineMemOperand *MMO, ISD::MemIndexType IndexType)
2818 : MemSDNode(NodeTy, Order, dl, VTs, MemVT, MMO) {
2819 LSBaseSDNodeBits.AddressingMode = IndexType;
2820 assert(getIndexType() == IndexType && "Value truncated");
2821 }
2822
2823 /// How is Index applied to BasePtr when computing addresses.
2824 ISD::MemIndexType getIndexType() const {
2825 return static_cast<ISD::MemIndexType>(LSBaseSDNodeBits.AddressingMode);
2826 }
2827 bool isIndexScaled() const {
2828 return !cast<ConstantSDNode>(getScale())->isOne();
2829 }
2830 bool isIndexSigned() const { return isIndexTypeSigned(getIndexType()); }
2831
2832 // In both nodes the mask is Op2 and the base address is Op3:
2833 // MaskedGatherSDNode (Chain, passthru, mask, base, index, scale)
2834 // MaskedScatterSDNode (Chain, value, mask, base, index, scale)
2835 // Mask is a vector of i1 elements
2836 const SDValue &getBasePtr() const { return getOperand(3); }
2837 const SDValue &getIndex() const { return getOperand(4); }
2838 const SDValue &getMask() const { return getOperand(2); }
2839 const SDValue &getScale() const { return getOperand(5); }
2840
2841 static bool classof(const SDNode *N) {
2842 return N->getOpcode() == ISD::MGATHER ||
2843 N->getOpcode() == ISD::MSCATTER;
2844 }
2845};
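// Illustrative sketch (not part of SelectionDAGNodes.h): reading the scale
// operand the same way isIndexScaled() above does. The helper is assumed code
// for the example only.
static uint64_t indexScaleInBytesSketch(const llvm::MaskedGatherScatterSDNode *N) {
  using namespace llvm;
  // The scale is kept as a constant operand; a value of 1 means the index is a
  // raw byte offset from the base pointer, so isIndexScaled() returns false.
  return cast<ConstantSDNode>(N->getScale())->getZExtValue();
}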
2846
2847/// This class is used to represent an MGATHER node
2848///
2849class MaskedGatherSDNode : public MaskedGatherScatterSDNode {
2850public:
2851 friend class SelectionDAG;
2852
2853 MaskedGatherSDNode(unsigned Order, const DebugLoc &dl, SDVTList VTs,
2854 EVT MemVT, MachineMemOperand *MMO,
2855 ISD::MemIndexType IndexType, ISD::LoadExtType ETy)
2856 : MaskedGatherScatterSDNode(ISD::MGATHER, Order, dl, VTs, MemVT, MMO,
2857 IndexType) {
2858 LoadSDNodeBits.ExtTy = ETy;
2859 }
2860
2861 const SDValue &getPassThru() const { return getOperand(1); }
2862
2863 ISD::LoadExtType getExtensionType() const {
2864 return ISD::LoadExtType(LoadSDNodeBits.ExtTy);
2865 }
2866
2867 static bool classof(const SDNode *N) {
2868 return N->getOpcode() == ISD::MGATHER;
2869 }
2870};
2871
2872/// This class is used to represent an MSCATTER node
2873///
2874class MaskedScatterSDNode : public MaskedGatherScatterSDNode {
2875public:
2876 friend class SelectionDAG;
2877
2878 MaskedScatterSDNode(unsigned Order, const DebugLoc &dl, SDVTList VTs,
2879 EVT MemVT, MachineMemOperand *MMO,
2880 ISD::MemIndexType IndexType, bool IsTrunc)
2881 : MaskedGatherScatterSDNode(ISD::MSCATTER, Order, dl, VTs, MemVT, MMO,
2882 IndexType) {
2883 StoreSDNodeBits.IsTruncating = IsTrunc;
2884 }
2885
2886 /// Return true if the op does a truncation before store.
2887 /// For integers this is the same as doing a TRUNCATE and storing the result.
2888 /// For floats, it is the same as doing an FP_ROUND and storing the result.
2889 bool isTruncatingStore() const { return StoreSDNodeBits.IsTruncating; }
2890
2891 const SDValue &getValue() const { return getOperand(1); }
2892
2893 static bool classof(const SDNode *N) {
2894 return N->getOpcode() == ISD::MSCATTER;
2895 }
2896};
2897
2898/// An SDNode that represents everything that will be needed
2899/// to construct a MachineInstr. These nodes are created during the
2900/// instruction selection proper phase.
2901///
2902/// Note that the only supported way to set the `memoperands` is by calling the
2903/// `SelectionDAG::setNodeMemRefs` function as the memory management happens
2904/// inside the DAG rather than in the node.
2905class MachineSDNode : public SDNode {
2906private:
2907 friend class SelectionDAG;
2908
2909 MachineSDNode(unsigned Opc, unsigned Order, const DebugLoc &DL, SDVTList VTs)
2910 : SDNode(Opc, Order, DL, VTs) {}
2911
2912 // We use a pointer union between a single `MachineMemOperand` pointer and
2913 // a pointer to an array of `MachineMemOperand` pointers. This is null when
2914 // the number of these is zero, the single pointer variant used when the
2915 // number is one, and the array is used for larger numbers.
2916 //
2917 // The array is allocated via the `SelectionDAG`'s allocator and so will
2918 // always live until the DAG is cleaned up and doesn't require ownership here.
2919 //
2920 // We can't use something simpler like `TinyPtrVector` here because `SDNode`
2921 // subclasses aren't managed in a conforming C++ manner. See the comments on
2922 // `SelectionDAG::MorphNodeTo`, which detail what goes on, but the
2923 // constraint here is that these don't manage memory with their constructor or
2924 // destructor and can be initialized to a good state even if they start off
2925 // uninitialized.
2926 PointerUnion<MachineMemOperand *, MachineMemOperand **> MemRefs = {};
2927
2928 // Note that this could be folded into the above `MemRefs` member if doing so
2929 // is advantageous at some point. We don't need to store this in most cases.
2930 // However, at the moment this doesn't appear to make the allocation any
2931 // smaller and makes the code somewhat simpler to read.
2932 int NumMemRefs = 0;
2933
2934public:
2935 using mmo_iterator = ArrayRef<MachineMemOperand *>::const_iterator;
2936
2937 ArrayRef<MachineMemOperand *> memoperands() const {
2938 // Special case the common cases.
2939 if (NumMemRefs == 0)
2940 return {};
2941 if (NumMemRefs == 1)
2942 return ArrayRef(MemRefs.getAddrOfPtr1(), 1);
2943
2944 // Otherwise we have an actual array.
2945 return ArrayRef(cast<MachineMemOperand **>(MemRefs), NumMemRefs);
2946 }
2947 mmo_iterator memoperands_begin() const { return memoperands().begin(); }
2948 mmo_iterator memoperands_end() const { return memoperands().end(); }
2949 bool memoperands_empty() const { return memoperands().empty(); }
2950
2951 /// Clear out the memory reference descriptor list.
2952 void clearMemRefs() {
2953 MemRefs = nullptr;
2954 NumMemRefs = 0;
2955 }
2956
2957 static bool classof(const SDNode *N) {
2958 return N->isMachineOpcode();
2959 }
2960};
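// Illustrative sketch (not part of SelectionDAGNodes.h): attaching and reading
// memory operands on a MachineSDNode. As the class comment says, the refs are
// owned by the DAG and must be set through SelectionDAG::setNodeMemRefs; DAG,
// MN and MMO are assumed to exist already.
static void memRefsSketch(llvm::SelectionDAG &DAG, llvm::MachineSDNode *MN,
                          llvm::MachineMemOperand *MMO) {
  using namespace llvm;
  DAG.setNodeMemRefs(MN, {MMO});
  for (const MachineMemOperand *Ref : MN->memoperands())
    (void)Ref; // e.g. inspect Ref->getSize() or Ref->getAlign() here.
}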
2961
2962/// An SDNode that records that a register contains a value that is guaranteed
2963/// to be aligned to at least the recorded alignment.
2964class AssertAlignSDNode : public SDNode {
2965 Align Alignment;
2966
2967public:
2968 AssertAlignSDNode(unsigned Order, const DebugLoc &DL, EVT VT, Align A)
2969 : SDNode(ISD::AssertAlign, Order, DL, getSDVTList(VT)), Alignment(A) {}
2970
2971 Align getAlign() const { return Alignment; }
2972
2973 static bool classof(const SDNode *N) {
2974 return N->getOpcode() == ISD::AssertAlign;
2975 }
2976};
2977
2978class SDNodeIterator {
2979 const SDNode *Node;
2980 unsigned Operand;
2981
2982 SDNodeIterator(const SDNode *N, unsigned Op) : Node(N), Operand(Op) {}
2983
2984public:
2985 using iterator_category = std::forward_iterator_tag;
2986 using value_type = SDNode;
2987 using difference_type = std::ptrdiff_t;
2988 using pointer = value_type *;
2989 using reference = value_type &;
2990
2991 bool operator==(const SDNodeIterator& x) const {
2992 return Operand == x.Operand;
2993 }
2994 bool operator!=(const SDNodeIterator& x) const { return !operator==(x); }
2995
2996 pointer operator*() const {
2997 return Node->getOperand(Operand).getNode();
2998 }
2999 pointer operator->() const { return operator*(); }
3000
3001 SDNodeIterator& operator++() { // Preincrement
3002 ++Operand;
3003 return *this;
3004 }
3005 SDNodeIterator operator++(int) { // Postincrement
3006 SDNodeIterator tmp = *this; ++*this; return tmp;
3007 }
3008 size_t operator-(SDNodeIterator Other) const {
3009 assert(Node == Other.Node &&
3010 "Cannot compare iterators of two different nodes!");
3011 return Operand - Other.Operand;
3012 }
3013
3014 static SDNodeIterator begin(const SDNode *N) { return SDNodeIterator(N, 0); }
3015 static SDNodeIterator end (const SDNode *N) {
3016 return SDNodeIterator(N, N->getNumOperands());
3017 }
3018
3019 unsigned getOperand() const { return Operand; }
3020 const SDNode *getNode() const { return Node; }
3021};
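// Illustrative sketch (not part of SelectionDAGNodes.h): walking the operand
// nodes of a single SDNode with the iterator above. The helper is assumed code
// for the example only.
static unsigned countOperandNodesSketch(const llvm::SDNode *N) {
  using namespace llvm;
  unsigned Count = 0;
  for (SDNodeIterator I = SDNodeIterator::begin(N), E = SDNodeIterator::end(N);
       I != E; ++I)
    ++Count; // *I is the SDNode producing operand number I.getOperand().
  return Count;
}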
3022
3023template <> struct GraphTraits<SDNode*> {
3024 using NodeRef = SDNode *;
3025 using ChildIteratorType = SDNodeIterator;
3026
3027 static NodeRef getEntryNode(SDNode *N) { return N; }
3028
3029 static ChildIteratorType child_begin(NodeRef N) {
3030 return SDNodeIterator::begin(N);
3031 }
3032
3033 static ChildIteratorType child_end(NodeRef N) {
3034 return SDNodeIterator::end(N);
3035 }
3036};
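// Illustrative sketch (not part of SelectionDAGNodes.h): the GraphTraits
// specialization above is what lets the generic graph utilities traverse a DAG
// from any node; llvm/ADT/DepthFirstIterator.h is assumed to be included.
static unsigned countReachableNodesSketch(llvm::SDNode *Root) {
  unsigned Count = 0;
  // depth_first() visits Root and, transitively, every operand node once.
  for (llvm::SDNode *N : llvm::depth_first(Root)) {
    (void)N;
    ++Count;
  }
  return Count;
}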
3037
3038/// A representation of the largest SDNode, for use in sizeof().
3039///
3040/// This needs to be a union because the largest node differs on 32 bit systems
3041/// with 4 and 8 byte pointer alignment, respectively.
3042using LargestSDNode = AlignedCharArrayUnion<AtomicSDNode, TargetIndexSDNode,
3043 BlockAddressSDNode,
3044 GlobalAddressSDNode,
3045 PseudoProbeSDNode>;
3046
3047/// The SDNode class with the greatest alignment requirement.
3048using MostAlignedSDNode = GlobalAddressSDNode;
3049
3050namespace ISD {
3051
3052 /// Returns true if the specified node is a non-extending and unindexed load.
3053 inline bool isNormalLoad(const SDNode *N) {
3054 const LoadSDNode *Ld = dyn_cast<LoadSDNode>(N);
3055 return Ld && Ld->getExtensionType() == ISD::NON_EXTLOAD &&
3056 Ld->getAddressingMode() == ISD::UNINDEXED;
3057 }
3058
3059 /// Returns true if the specified node is a non-extending load.
3060 inline bool isNON_EXTLoad(const SDNode *N) {
3061 return isa<LoadSDNode>(N) &&
3062 cast<LoadSDNode>(N)->getExtensionType() == ISD::NON_EXTLOAD;
3063 }
3064
3065 /// Returns true if the specified node is an EXTLOAD.
3066 inline bool isEXTLoad(const SDNode *N) {
3067 return isa<LoadSDNode>(N) &&
3068 cast<LoadSDNode>(N)->getExtensionType() == ISD::EXTLOAD;
3069 }
3070
3071 /// Returns true if the specified node is a SEXTLOAD.
3072 inline bool isSEXTLoad(const SDNode *N) {
3073 return isa<LoadSDNode>(N) &&
3074 cast<LoadSDNode>(N)->getExtensionType() == ISD::SEXTLOAD;
3075 }
3076
3077 /// Returns true if the specified node is a ZEXTLOAD.
3078 inline bool isZEXTLoad(const SDNode *N) {
3079 return isa<LoadSDNode>(N) &&
3080 cast<LoadSDNode>(N)->getExtensionType() == ISD::ZEXTLOAD;
3081 }
3082
3083 /// Returns true if the specified node is an unindexed load.
3084 inline bool isUNINDEXEDLoad(const SDNode *N) {
3085 return isa<LoadSDNode>(N) &&
3086 cast<LoadSDNode>(N)->getAddressingMode() == ISD::UNINDEXED;
3087 }
3088
3089 /// Returns true if the specified node is a non-truncating
3090 /// and unindexed store.
3091 inline bool isNormalStore(const SDNode *N) {
3092 const StoreSDNode *St = dyn_cast<StoreSDNode>(N);
3093 return St && !St->isTruncatingStore() &&
3094 St->getAddressingMode() == ISD::UNINDEXED;
3095 }
3096
3097 /// Returns true if the specified node is an unindexed store.
3098 inline bool isUNINDEXEDStore(const SDNode *N) {
3099 return isa<StoreSDNode>(N) &&
3100 cast<StoreSDNode>(N)->getAddressingMode() == ISD::UNINDEXED;
3101 }
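// Illustrative sketch (not part of SelectionDAGNodes.h): a typical guard built
// from the predicates above, checking for a plain load feeding a plain store.
// The helper is assumed code for the example only.
static bool isSimpleLoadStorePairSketch(const llvm::SDNode *St) {
  using namespace llvm;
  if (!ISD::isNormalStore(St))
    return false;
  SDValue StoredVal = cast<StoreSDNode>(St)->getValue();
  return ISD::isNormalLoad(StoredVal.getNode());
}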
3102
3103 /// Attempt to match a unary predicate against a scalar/splat constant or
3104 /// every element of a constant BUILD_VECTOR.
3105 /// If AllowUndefs is true, then UNDEF elements will pass nullptr to Match.
3106 bool matchUnaryPredicate(SDValue Op,
3107 std::function<bool(ConstantSDNode *)> Match,
3108 bool AllowUndefs = false);
3109
3110 /// Attempt to match a binary predicate against a pair of scalar/splat
3111 /// constants or every element of a pair of constant BUILD_VECTORs.
3112 /// If AllowUndefs is true, then UNDEF elements will pass nullptr to Match.
3113 /// If AllowTypeMismatch is true then RetType + ArgTypes don't need to match.
3114 bool matchBinaryPredicate(
3115 SDValue LHS, SDValue RHS,
3116 std::function<bool(ConstantSDNode *, ConstantSDNode *)> Match,
3117 bool AllowUndefs = false, bool AllowTypeMismatch = false);
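// Illustrative sketch (not part of SelectionDAGNodes.h): using
// matchUnaryPredicate to ask whether every constant element of Op (a scalar,
// splat, or BUILD_VECTOR) is non-zero. The helper is assumed code only.
inline bool allConstantElementsNonZeroSketch(llvm::SDValue Op) {
  using namespace llvm;
  return ISD::matchUnaryPredicate(
      Op, [](ConstantSDNode *C) { return C && !C->isZero(); },
      /*AllowUndefs=*/false);
}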
3118
3119 /// Returns true if the specified value is the overflow result from one
3120 /// of the overflow intrinsic nodes.
3121 inline bool isOverflowIntrOpRes(SDValue Op) {
3122 unsigned Opc = Op.getOpcode();
3123 return (Op.getResNo() == 1 &&
3124 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
3125 Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO));
3126 }
3127
3128} // end namespace ISD
3129
3130} // end namespace llvm
3131
3132#endif // LLVM_CODEGEN_SELECTIONDAGNODES_H